Generalize IForceProvider
[gromacs.git] / src / gromacs / mdlib / sim_util.cpp
blob 44ff5d475a22fa6c70ab11f07635a00d20d00f71
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "gmxpre.h"
39 #include "sim_util.h"
41 #include "config.h"
43 #include <assert.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <string.h>
48 #include <cstdint>
50 #include <array>
52 #include "gromacs/domdec/domdec.h"
53 #include "gromacs/domdec/domdec_struct.h"
54 #include "gromacs/essentialdynamics/edsam.h"
55 #include "gromacs/ewald/pme.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
60 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
61 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
62 #include "gromacs/imd/imd.h"
63 #include "gromacs/listed-forces/bonded.h"
64 #include "gromacs/listed-forces/disre.h"
65 #include "gromacs/listed-forces/orires.h"
66 #include "gromacs/math/functions.h"
67 #include "gromacs/math/units.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/math/vecdump.h"
70 #include "gromacs/mdlib/calcmu.h"
71 #include "gromacs/mdlib/constr.h"
72 #include "gromacs/mdlib/force.h"
73 #include "gromacs/mdlib/forcerec.h"
74 #include "gromacs/mdlib/genborn.h"
75 #include "gromacs/mdlib/gmx_omp_nthreads.h"
76 #include "gromacs/mdlib/mdrun.h"
77 #include "gromacs/mdlib/nb_verlet.h"
78 #include "gromacs/mdlib/nbnxn_atomdata.h"
79 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
80 #include "gromacs/mdlib/nbnxn_grid.h"
81 #include "gromacs/mdlib/nbnxn_search.h"
82 #include "gromacs/mdlib/qmmm.h"
83 #include "gromacs/mdlib/update.h"
84 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
86 #include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
87 #include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
88 #include "gromacs/mdtypes/commrec.h"
89 #include "gromacs/mdtypes/iforceprovider.h"
90 #include "gromacs/mdtypes/inputrec.h"
91 #include "gromacs/mdtypes/md_enums.h"
92 #include "gromacs/mdtypes/state.h"
93 #include "gromacs/pbcutil/ishift.h"
94 #include "gromacs/pbcutil/mshift.h"
95 #include "gromacs/pbcutil/pbc.h"
96 #include "gromacs/pulling/pull.h"
97 #include "gromacs/pulling/pull_rotation.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/gpu_timing.h"
100 #include "gromacs/timing/wallcycle.h"
101 #include "gromacs/timing/wallcyclereporting.h"
102 #include "gromacs/timing/walltime_accounting.h"
103 #include "gromacs/utility/arrayref.h"
104 #include "gromacs/utility/basedefinitions.h"
105 #include "gromacs/utility/cstringutil.h"
106 #include "gromacs/utility/exceptions.h"
107 #include "gromacs/utility/fatalerror.h"
108 #include "gromacs/utility/gmxassert.h"
109 #include "gromacs/utility/gmxmpi.h"
110 #include "gromacs/utility/logger.h"
111 #include "gromacs/utility/pleasecite.h"
112 #include "gromacs/utility/smalloc.h"
113 #include "gromacs/utility/sysinfo.h"
115 #include "nbnxn_gpu.h"
117 void print_time(FILE *out,
118 gmx_walltime_accounting_t walltime_accounting,
119 gmx_int64_t step,
120 t_inputrec *ir,
121 t_commrec gmx_unused *cr)
123 time_t finish;
124 char timebuf[STRLEN];
125 double dt, elapsed_seconds, time_per_step;
126 char buf[48];
128 #if !GMX_THREAD_MPI
129 if (!PAR(cr))
130 #endif
132 fprintf(out, "\r");
134 fprintf(out, "step %s", gmx_step_str(step, buf));
135 fflush(out);
137 if ((step >= ir->nstlist))
139 double seconds_since_epoch = gmx_gettime();
140 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
141 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
142 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
144 if (ir->nsteps >= 0)
146 if (dt >= 300)
148 finish = (time_t) (seconds_since_epoch + dt);
149 gmx_ctime_r(&finish, timebuf, STRLEN);
150 sprintf(buf, "%s", timebuf);
151 buf[strlen(buf)-1] = '\0';
152 fprintf(out, ", will finish %s", buf);
154 else
156 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
159 else
161 fprintf(out, " performance: %.1f ns/day ",
162 ir->delta_t/1000*24*60*60/time_per_step);
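/* Note: delta_t is in ps, so delta_t/1000 is ns per MD step; multiplied by
 * 86400 s/day and divided by the measured wall-clock seconds per step this
 * gives the ns/day performance estimate printed above.
 */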
165 #if !GMX_THREAD_MPI
166 if (PAR(cr))
168 fprintf(out, "\n");
170 #endif
172 fflush(out);
175 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
176 double the_time)
178 char time_string[STRLEN];
180 if (!fplog)
182 return;
186 int i;
187 char timebuf[STRLEN];
188 time_t temp_time = (time_t) the_time;
190 gmx_ctime_r(&temp_time, timebuf, STRLEN);
191 for (i = 0; timebuf[i] >= ' '; i++)
193 time_string[i] = timebuf[i];
195 time_string[i] = '\0';
198 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
201 void print_start(FILE *fplog, t_commrec *cr,
202 gmx_walltime_accounting_t walltime_accounting,
203 const char *name)
205 char buf[STRLEN];
207 sprintf(buf, "Started %s", name);
208 print_date_and_time(fplog, cr->nodeid, buf,
209 walltime_accounting_get_start_time_stamp(walltime_accounting));
212 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
214 /* TODO: remove this - 1 when padding is properly implemented */
215 int end = forceToAdd->size() - 1;
216 const rvec *fAdd = as_rvec_array(forceToAdd->data());
218 // cppcheck-suppress unreadVariable
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], fAdd[i]);
227 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
228 tensor vir_part, t_graph *graph, matrix box,
229 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
231 int i;
233 /* The short-range virial from surrounding boxes */
234 clear_mat(vir_part);
235 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
236 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
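/* Note: the virial is accumulated as Xi = -1/2 sum_i r_i (x) F_i, split into
 * a shift-vector contribution (calc_vir() above, using the per-shift force
 * sums in fr->fshift) and a single sum over the local atoms (f_calc_vir()
 * below).
 */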
238 /* Calculate partial virial, for local atoms only, based on short range.
239 * Total virial is computed in global_stat, called from do_md
241 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
242 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
244 /* Add position restraint contribution */
245 for (i = 0; i < DIM; i++)
247 vir_part[i][i] += fr->vir_diag_posres[i];
250 /* Add wall contribution */
251 for (i = 0; i < DIM; i++)
253 vir_part[i][ZZ] += fr->vir_wall_z[i];
256 if (debug)
258 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
262 static void pull_potential_wrapper(t_commrec *cr,
263 t_inputrec *ir,
264 matrix box, rvec x[],
265 rvec f[],
266 tensor vir_force,
267 t_mdatoms *mdatoms,
268 gmx_enerdata_t *enerd,
269 real *lambda,
270 double t,
271 gmx_wallcycle_t wcycle)
273 t_pbc pbc;
274 real dvdl;
276 /* Calculate the center of mass forces, this requires communication,
277 * which is why pull_potential is called close to other communication.
278 * The virial contribution is calculated directly,
279 * which is why we call pull_potential after calc_virial.
281 wallcycle_start(wcycle, ewcPULLPOT);
282 set_pbc(&pbc, ir->ePBC, box);
283 dvdl = 0;
284 enerd->term[F_COM_PULL] +=
285 pull_potential(ir->pull_work, mdatoms, &pbc,
286 cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
287 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
288 wallcycle_stop(wcycle, ewcPULLPOT);
291 static void pme_receive_force_ener(t_commrec *cr,
292 gmx_wallcycle_t wcycle,
293 gmx_enerdata_t *enerd,
294 t_forcerec *fr)
296 real e_q, e_lj, dvdl_q, dvdl_lj;
297 float cycles_ppdpme, cycles_seppme;
299 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
300 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
302 /* In case of node-splitting, the PP nodes receive the long-range
303 * forces, virial and energy from the PME nodes here.
305 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
306 dvdl_q = 0;
307 dvdl_lj = 0;
308 gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
309 fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
310 &cycles_seppme);
311 enerd->term[F_COUL_RECIP] += e_q;
312 enerd->term[F_LJ_RECIP] += e_lj;
313 enerd->dvdl_lin[efptCOUL] += dvdl_q;
314 enerd->dvdl_lin[efptVDW] += dvdl_lj;
316 if (wcycle)
318 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
320 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
323 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
324 gmx_int64_t step, real forceTolerance,
325 const rvec *x, const rvec *f)
327 real force2Tolerance = gmx::square(forceTolerance);
328 std::uintmax_t numNonFinite = 0;
329 for (int i = 0; i < md->homenr; i++)
331 real force2 = norm2(f[i]);
332 bool nonFinite = !std::isfinite(force2);
333 if (force2 >= force2Tolerance || nonFinite)
335 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 step,
337 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
339 if (nonFinite)
341 numNonFinite++;
344 if (numNonFinite > 0)
346 /* Note that with MPI this fatal call on one rank might interrupt
347 * the printing on other ranks. But we can only avoid that with
348 * an expensive MPI barrier that we would need at each step.
350 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
354 static void post_process_forces(t_commrec *cr,
355 gmx_int64_t step,
356 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
357 gmx_localtop_t *top,
358 matrix box, rvec x[],
359 rvec f[],
360 tensor vir_force,
361 t_mdatoms *mdatoms,
362 t_graph *graph,
363 t_forcerec *fr, gmx_vsite_t *vsite,
364 int flags)
366 if (fr->bF_NoVirSum)
368 if (vsite)
370 /* Spread the mesh force on virtual sites to the other particles...
371 * This is parallelized. MPI communication is performed
372 * if the constructing atoms aren't local.
374 wallcycle_start(wcycle, ewcVSITESPREAD);
375 spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
376 (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
377 nrnb,
378 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
379 wallcycle_stop(wcycle, ewcVSITESPREAD);
381 if (flags & GMX_FORCE_VIRIAL)
383 /* Now add the forces, this is local */
384 sum_forces(f, fr->f_novirsum);
386 if (EEL_FULL(fr->eeltype))
388 /* Add the mesh contribution to the virial */
389 m_add(vir_force, fr->vir_el_recip, vir_force);
391 if (EVDW_PME(fr->vdwtype))
393 /* Add the mesh contribution to the virial */
394 m_add(vir_force, fr->vir_lj_recip, vir_force);
396 if (debug)
398 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
403 if (fr->print_force >= 0)
405 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
409 static void do_nb_verlet(t_forcerec *fr,
410 interaction_const_t *ic,
411 gmx_enerdata_t *enerd,
412 int flags, int ilocality,
413 int clearF,
414 t_nrnb *nrnb,
415 gmx_wallcycle_t wcycle)
417 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
418 nonbonded_verlet_group_t *nbvg;
419 gmx_bool bUsingGpuKernels;
421 if (!(flags & GMX_FORCE_NONBONDED))
423 /* skip non-bonded calculation */
424 return;
427 nbvg = &fr->nbv->grp[ilocality];
429 /* GPU kernel launch overhead is already timed separately */
430 if (fr->cutoff_scheme != ecutsVERLET)
432 gmx_incons("Invalid cut-off scheme passed!");
435 bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
437 if (!bUsingGpuKernels)
439 wallcycle_sub_start(wcycle, ewcsNONBONDED);
441 switch (nbvg->kernel_type)
443 case nbnxnk4x4_PlainC:
444 nbnxn_kernel_ref(&nbvg->nbl_lists,
445 nbvg->nbat, ic,
446 fr->shift_vec,
447 flags,
448 clearF,
449 fr->fshift[0],
450 enerd->grpp.ener[egCOULSR],
451 fr->bBHAM ?
452 enerd->grpp.ener[egBHAMSR] :
453 enerd->grpp.ener[egLJSR]);
454 break;
456 case nbnxnk4xN_SIMD_4xN:
457 nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
458 nbvg->nbat, ic,
459 nbvg->ewald_excl,
460 fr->shift_vec,
461 flags,
462 clearF,
463 fr->fshift[0],
464 enerd->grpp.ener[egCOULSR],
465 fr->bBHAM ?
466 enerd->grpp.ener[egBHAMSR] :
467 enerd->grpp.ener[egLJSR]);
468 break;
469 case nbnxnk4xN_SIMD_2xNN:
470 nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
471 nbvg->nbat, ic,
472 nbvg->ewald_excl,
473 fr->shift_vec,
474 flags,
475 clearF,
476 fr->fshift[0],
477 enerd->grpp.ener[egCOULSR],
478 fr->bBHAM ?
479 enerd->grpp.ener[egBHAMSR] :
480 enerd->grpp.ener[egLJSR]);
481 break;
483 case nbnxnk8x8x8_GPU:
484 nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
485 break;
487 case nbnxnk8x8x8_PlainC:
488 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
489 nbvg->nbat, ic,
490 fr->shift_vec,
491 flags,
492 clearF,
493 nbvg->nbat->out[0].f,
494 fr->fshift[0],
495 enerd->grpp.ener[egCOULSR],
496 fr->bBHAM ?
497 enerd->grpp.ener[egBHAMSR] :
498 enerd->grpp.ener[egLJSR]);
499 break;
501 default:
502 gmx_incons("Invalid nonbonded kernel type passed!");
505 if (!bUsingGpuKernels)
507 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
510 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
512 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
514 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
515 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
517 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
519 else
521 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
523 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
524 if (flags & GMX_FORCE_ENERGY)
526 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
527 enr_nbnxn_kernel_ljc += 1;
528 enr_nbnxn_kernel_lj += 1;
531 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
532 nbvg->nbl_lists.natpair_ljq);
533 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
534 nbvg->nbl_lists.natpair_lj);
535 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
536 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
537 nbvg->nbl_lists.natpair_q);
539 if (ic->vdw_modifier == eintmodFORCESWITCH)
541 /* We add up the switch cost separately */
542 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
543 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
545 if (ic->vdw_modifier == eintmodPOTSWITCH)
547 /* We add up the switch cost separately */
548 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
549 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
551 if (ic->vdwtype == evdwPME)
553 /* We add up the LJ Ewald cost separately */
554 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
555 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
559 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
560 t_forcerec *fr,
561 rvec x[],
562 rvec f[],
563 t_mdatoms *mdatoms,
564 t_lambda *fepvals,
565 real *lambda,
566 gmx_enerdata_t *enerd,
567 int flags,
568 t_nrnb *nrnb,
569 gmx_wallcycle_t wcycle)
571 int donb_flags;
572 nb_kernel_data_t kernel_data;
573 real lam_i[efptNR];
574 real dvdl_nb[efptNR];
575 int th;
576 int i, j;
578 donb_flags = 0;
579 /* Add short-range interactions */
580 donb_flags |= GMX_NONBONDED_DO_SR;
582 /* Currently all group scheme kernels always calculate (shift-)forces */
583 if (flags & GMX_FORCE_FORCES)
585 donb_flags |= GMX_NONBONDED_DO_FORCE;
587 if (flags & GMX_FORCE_VIRIAL)
589 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
591 if (flags & GMX_FORCE_ENERGY)
593 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
596 kernel_data.flags = donb_flags;
597 kernel_data.lambda = lambda;
598 kernel_data.dvdl = dvdl_nb;
600 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
601 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
603 /* reset free energy components */
604 for (i = 0; i < efptNR; i++)
606 dvdl_nb[i] = 0;
609 assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
611 wallcycle_sub_start(wcycle, ewcsNONBONDED);
612 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
613 for (th = 0; th < nbl_lists->nnbl; th++)
617 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
618 x, f, fr, mdatoms, &kernel_data, nrnb);
620 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
623 if (fepvals->sc_alpha != 0)
625 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
626 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
628 else
630 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
631 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
634 /* If we do foreign lambda and we have soft-core interactions
635 * we have to recalculate the (non-linear) energy contributions.
637 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
639 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
640 kernel_data.lambda = lam_i;
641 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
642 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
643 /* Note that we add to kernel_data.dvdl, but ignore the result */
645 for (i = 0; i < enerd->n_lambda; i++)
647 for (j = 0; j < efptNR; j++)
649 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
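/* lam_i now holds the full lambda vector to evaluate: i == 0 is the current
 * state, i > 0 are the foreign lambdas; the resulting potential energies are
 * accumulated into enerd->enerpart_lambda below, which feeds the
 * foreign-lambda energy output (e.g. for BAR).
 */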
651 reset_foreign_enerdata(enerd);
652 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
653 for (th = 0; th < nbl_lists->nnbl; th++)
657 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
658 x, f, fr, mdatoms, &kernel_data, nrnb);
660 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
663 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
664 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
668 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
671 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
673 return nbv != nullptr && nbv->bUseGPU;
676 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
678 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
680 /* Note that we would like to avoid this conditional by putting it
681 * into the omp pragma instead, but then we still take the full
682 * omp parallel for overhead (at least with gcc5).
684 if (nth == 1)
686 for (int i = 0; i < n; i++)
688 clear_rvec(v[i]);
691 else
693 #pragma omp parallel for num_threads(nth) schedule(static)
694 for (int i = 0; i < n; i++)
696 clear_rvec(v[i]);
701 /*! \brief This routine checks if the potential energy is finite.
703 * Note that passing this check does not guarantee finite forces,
704 * since those use slightly different arithmetic. But in most cases
705 * there is just a narrow coordinate range where forces are not finite
706 * and energies are finite.
708 * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
710 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
712 if (!std::isfinite(enerd->term[F_EPOT]))
714 gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
715 enerd->term[F_EPOT],
716 enerd->term[F_LJ],
717 enerd->term[F_COUL_SR]);
721 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
722 t_inputrec *inputrec,
723 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
724 gmx_localtop_t *top,
725 gmx_groups_t gmx_unused *groups,
726 matrix box, rvec x[], history_t *hist,
727 PaddedRVecVector *force,
728 tensor vir_force,
729 t_mdatoms *mdatoms,
730 gmx_enerdata_t *enerd, t_fcdata *fcd,
731 real *lambda, t_graph *graph,
732 t_forcerec *fr, interaction_const_t *ic,
733 gmx_vsite_t *vsite, rvec mu_tot,
734 double t, gmx_edsam_t ed,
735 gmx_bool bBornRadii,
736 int flags)
738 int cg1, i, j;
739 double mu[2*DIM];
740 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
741 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
742 gmx_bool bDiffKernels = FALSE;
743 rvec vzero, box_diag;
744 float cycles_pme, cycles_force, cycles_wait_gpu;
745 /* TODO To avoid loss of precision, float can't be used for a
746 * cycle count. Build an object that can do this right and perhaps
747 * also be used by gmx_wallcycle_t */
748 gmx_cycles_t cycleCountBeforeLocalWorkCompletes = 0;
749 nonbonded_verlet_t *nbv;
751 cycles_force = 0;
752 cycles_wait_gpu = 0;
753 nbv = fr->nbv;
755 const int start = 0;
756 const int homenr = mdatoms->homenr;
758 clear_mat(vir_force);
760 if (DOMAINDECOMP(cr))
762 cg1 = cr->dd->ncg_tot;
764 else
766 cg1 = top->cgs.nr;
768 if (fr->n_tpi > 0)
770 cg1--;
773 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
774 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
775 bFillGrid = (bNS && bStateChanged);
776 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
777 bDoForces = (flags & GMX_FORCE_FORCES);
778 bUseGPU = fr->nbv->bUseGPU;
779 bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
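/* "Emulation" means running the GPU pair-list layout (8x8x8 clusters) through
 * the plain-C reference kernel on the CPU, so the GPU code path can be
 * exercised without an actual GPU.
 */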
781 if (bStateChanged)
783 update_forcerec(fr, box);
785 if (inputrecNeedMutot(inputrec))
787 /* Calculate total (local) dipole moment in a temporary common array.
788 * This makes it possible to sum them over nodes faster.
790 calc_mu(start, homenr,
791 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
792 mu, mu+DIM);
796 if (fr->ePBC != epbcNONE)
798 /* Compute shift vectors every step,
799 * because of pressure coupling or box deformation!
801 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
803 calc_shifts(box, fr->shift_vec);
806 if (bCalcCGCM)
808 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
809 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
811 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
813 unshift_self(graph, box, x);
817 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
818 fr->shift_vec, nbv->grp[0].nbat);
820 #if GMX_MPI
821 if (!(cr->duty & DUTY_PME))
823 gmx_bool bBS;
824 matrix boxs;
826 /* Send particle coordinates to the pme nodes.
827 * Since this is only implemented for domain decomposition
828 * and domain decomposition does not use the graph,
829 * we do not need to worry about shifting.
832 wallcycle_start(wcycle, ewcPP_PMESENDX);
834 bBS = (inputrec->nwall == 2);
835 if (bBS)
837 copy_mat(box, boxs);
838 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
841 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
842 lambda[efptCOUL], lambda[efptVDW],
843 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
844 step);
846 wallcycle_stop(wcycle, ewcPP_PMESENDX);
848 #endif /* GMX_MPI */
850 /* do gridding for pair search */
851 if (bNS)
853 if (graph && bStateChanged)
855 /* Calculate intramolecular shift vectors to make molecules whole */
856 mk_mshift(fplog, graph, fr->ePBC, box, x);
859 clear_rvec(vzero);
860 box_diag[XX] = box[XX][XX];
861 box_diag[YY] = box[YY][YY];
862 box_diag[ZZ] = box[ZZ][ZZ];
864 wallcycle_start(wcycle, ewcNS);
865 if (!fr->bDomDec)
867 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
868 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
869 0, vzero, box_diag,
870 0, mdatoms->homenr, -1, fr->cginfo, x,
871 0, nullptr,
872 nbv->grp[eintLocal].kernel_type,
873 nbv->grp[eintLocal].nbat);
874 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
876 else
878 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
879 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
880 fr->cginfo, x,
881 nbv->grp[eintNonlocal].kernel_type,
882 nbv->grp[eintNonlocal].nbat);
883 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
886 if (nbv->ngrp == 1 ||
887 nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
889 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
890 nbv->nbs, mdatoms, fr->cginfo);
892 else
894 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
895 nbv->nbs, mdatoms, fr->cginfo);
896 nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
897 nbv->nbs, mdatoms, fr->cginfo);
899 wallcycle_stop(wcycle, ewcNS);
902 /* initialize the GPU atom data and copy shift vector */
903 if (bUseGPU)
905 if (bNS)
907 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
908 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
909 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
912 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
913 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
914 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
917 /* do local pair search */
918 if (bNS)
920 wallcycle_start_nocount(wcycle, ewcNS);
921 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
922 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
923 &top->excls,
924 ic->rlist,
925 nbv->min_ci_balanced,
926 &nbv->grp[eintLocal].nbl_lists,
927 eintLocal,
928 nbv->grp[eintLocal].kernel_type,
929 nrnb);
930 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
932 if (bUseGPU)
934 /* initialize local pair-list on the GPU */
935 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
936 nbv->grp[eintLocal].nbl_lists.nbl[0],
937 eintLocal);
939 wallcycle_stop(wcycle, ewcNS);
941 else
943 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
944 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
945 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
946 nbv->grp[eintLocal].nbat);
947 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
948 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
951 if (bUseGPU)
953 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
954 /* launch local nonbonded F on GPU */
955 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
956 nrnb, wcycle);
957 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
960 /* Communicate coordinates and sum dipole if necessary +
961 do non-local pair search */
962 if (DOMAINDECOMP(cr))
964 bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
965 nbv->grp[eintLocal].kernel_type);
967 if (bDiffKernels)
969 /* With GPU+CPU non-bonded calculations we need to copy
970 * the local coordinates to the non-local nbat struct
971 * (in CPU format) as the non-local kernel call also
972 * calculates the local - non-local interactions.
974 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
975 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
976 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
977 nbv->grp[eintNonlocal].nbat);
978 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
979 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
982 if (bNS)
984 wallcycle_start_nocount(wcycle, ewcNS);
985 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
987 if (bDiffKernels)
989 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
992 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
993 &top->excls,
994 ic->rlist,
995 nbv->min_ci_balanced,
996 &nbv->grp[eintNonlocal].nbl_lists,
997 eintNonlocal,
998 nbv->grp[eintNonlocal].kernel_type,
999 nrnb);
1001 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1003 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1005 /* initialize non-local pair-list on the GPU */
1006 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1007 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1008 eintNonlocal);
1010 wallcycle_stop(wcycle, ewcNS);
1012 else
1014 wallcycle_start(wcycle, ewcMOVEX);
1015 dd_move_x(cr->dd, box, x);
1016 wallcycle_stop(wcycle, ewcMOVEX);
1018 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1019 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1020 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1021 nbv->grp[eintNonlocal].nbat);
1022 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1023 cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1026 if (bUseGPU && !bDiffKernels)
1028 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
1029 /* launch non-local nonbonded F on GPU */
1030 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1031 nrnb, wcycle);
1032 cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1036 if (bUseGPU)
1038 /* launch D2H copy-back F */
1039 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1040 if (DOMAINDECOMP(cr) && !bDiffKernels)
1042 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
1043 flags, eatNonlocal);
1045 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
1046 flags, eatLocal);
1047 cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1050 if (bStateChanged && inputrecNeedMutot(inputrec))
1052 if (PAR(cr))
1054 gmx_sumd(2*DIM, mu, cr);
1057 for (i = 0; i < 2; i++)
1059 for (j = 0; j < DIM; j++)
1061 fr->mu_tot[i][j] = mu[i*DIM + j];
1065 if (fr->efep == efepNO)
1067 copy_rvec(fr->mu_tot[0], mu_tot);
1069 else
1071 for (j = 0; j < DIM; j++)
1073 mu_tot[j] =
1074 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1075 lambda[efptCOUL]*fr->mu_tot[1][j];
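/* With free-energy perturbation the total dipole is interpolated linearly in
 * lambda between the A (lambda = 0) and B (lambda = 1) topology states.
 */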
1079 /* Reset energies */
1080 reset_enerdata(enerd);
1081 clear_rvecs(SHIFTS, fr->fshift);
1083 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1085 wallcycle_start(wcycle, ewcPPDURINGPME);
1086 dd_force_flop_start(cr->dd, nrnb);
1089 if (inputrec->bRot)
1091 /* Enforced rotation has its own cycle counter that starts after the collective
1092 * coordinates have been communicated. It is added to ddCyclF to allow
1093 * for proper load-balancing */
1094 wallcycle_start(wcycle, ewcROT);
1095 do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1096 wallcycle_stop(wcycle, ewcROT);
1099 /* Temporary solution until all routines take PaddedRVecVector */
1100 rvec *f = as_rvec_array(force->data());
1102 /* Start the force cycle counter.
1103 * This counter is stopped after do_force_lowlevel.
1104 * No parallel communication should occur while this counter is running,
1105 * since that will interfere with the dynamic load balancing.
1107 wallcycle_start(wcycle, ewcFORCE);
1108 if (bDoForces)
1110 /* Reset forces for which the virial is calculated separately:
1111 * PME/Ewald forces if necessary */
1112 if (fr->bF_NoVirSum)
1114 if (flags & GMX_FORCE_VIRIAL)
1116 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1118 else
1120 /* We are not calculating the pressure so we do not need
1121 * a separate array for forces that do not contribute
1122 * to the pressure.
1124 fr->f_novirsum = force;
1128 if (fr->bF_NoVirSum)
1130 if (flags & GMX_FORCE_VIRIAL)
1132 /* TODO: remove this - 1 when padding is properly implemented */
1133 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1134 as_rvec_array(fr->f_novirsum->data()));
1137 /* Clear the short- and long-range forces */
1138 clear_rvecs_omp(fr->natoms_force_constr, f);
1140 clear_rvec(fr->vir_diag_posres);
1143 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1145 clear_pull_forces(inputrec->pull_work);
1148 /* We calculate the non-bonded forces, when done on the CPU, here.
1149 * We do this before calling do_force_lowlevel, because in that
1150 * function, the listed forces are calculated before PME, which
1151 * does communication. With this order, non-bonded and listed
1152 * force calculation imbalance can be balanced out by the domain
1153 * decomposition load balancing.
1156 if (!bUseOrEmulGPU)
1158 /* Maybe we should move this into do_force_lowlevel */
1159 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1160 nrnb, wcycle);
1163 if (fr->efep != efepNO)
1165 /* Calculate the local and non-local free energy interactions here.
1166 * Happens here on the CPU both with and without GPU.
1168 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1170 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1171 fr, x, f, mdatoms,
1172 inputrec->fepvals, lambda,
1173 enerd, flags, nrnb, wcycle);
1176 if (DOMAINDECOMP(cr) &&
1177 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1179 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1180 fr, x, f, mdatoms,
1181 inputrec->fepvals, lambda,
1182 enerd, flags, nrnb, wcycle);
1186 if (!bUseOrEmulGPU || bDiffKernels)
1188 int aloc;
1190 if (DOMAINDECOMP(cr))
1192 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1193 bDiffKernels ? enbvClearFYes : enbvClearFNo,
1194 nrnb, wcycle);
1197 if (!bUseOrEmulGPU)
1199 aloc = eintLocal;
1201 else
1203 aloc = eintNonlocal;
1206 /* Add all the non-bonded force to the normal force array.
1207 * This can be split into a local and a non-local part when overlapping
1208 * communication with calculation with domain decomposition.
1210 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1211 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1212 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1213 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1214 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1215 cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1216 wallcycle_start_nocount(wcycle, ewcFORCE);
1218 /* if there are multiple fshift output buffers reduce them */
1219 if ((flags & GMX_FORCE_VIRIAL) &&
1220 nbv->grp[aloc].nbl_lists.nnbl > 1)
1222 /* This is not in a subcounter because it takes a
1223 negligible and constant-sized amount of time */
1224 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1225 fr->fshift);
1229 /* update QMMMrec, if necessary */
1230 if (fr->bQMMM)
1232 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1235 /* Compute the bonded and non-bonded energies and optionally forces */
1236 do_force_lowlevel(fr, inputrec, &(top->idef),
1237 cr, nrnb, wcycle, mdatoms,
1238 x, hist, f, enerd, fcd, top, fr->born,
1239 bBornRadii, box,
1240 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1241 flags, &cycles_pme);
1243 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1245 if (ed)
1247 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1250 if (bUseOrEmulGPU && !bDiffKernels)
1252 /* wait for non-local forces (or calculate in emulation mode) */
1253 if (DOMAINDECOMP(cr))
1255 if (bUseGPU)
1257 float cycles_tmp;
1259 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1260 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1261 flags, eatNonlocal,
1262 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1263 fr->fshift);
1264 cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1265 cycles_wait_gpu += cycles_tmp;
1266 cycles_force += cycles_tmp;
1268 else
1270 wallcycle_start_nocount(wcycle, ewcFORCE);
1271 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1272 nrnb, wcycle);
1273 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1275 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1276 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1277 /* skip the reduction if there was no non-local work to do */
1278 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1280 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1281 nbv->grp[eintNonlocal].nbat, f);
1283 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1284 cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1288 if (bDoForces && DOMAINDECOMP(cr))
1290 if (bUseGPU)
1292 /* We are done with the CPU compute, but the GPU local non-bonded
1293 * kernel can still be running while we communicate the forces.
1294 * We start a counter here, so we can, hopefully, time the rest
1295 * of the GPU kernel execution and data transfer.
1297 cycleCountBeforeLocalWorkCompletes = gmx_cycles_read();
1300 /* Communicate the forces */
1301 wallcycle_start(wcycle, ewcMOVEF);
1302 dd_move_f(cr->dd, f, fr->fshift);
1303 wallcycle_stop(wcycle, ewcMOVEF);
1306 if (bUseOrEmulGPU)
1308 /* wait for local forces (or calculate in emulation mode) */
1309 if (bUseGPU)
1311 float cycles_tmp, cycles_wait_est;
1312 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1313 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1314 * but even with a step of 0.1 ms the difference is less than 1%
1315 * of the step time.
1317 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1319 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1320 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1321 flags, eatLocal,
1322 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1323 fr->fshift);
1324 cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1326 if (bDoForces && DOMAINDECOMP(cr))
1328 cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
1330 if (cycles_tmp < gpuWaitApiOverheadMargin)
1332 /* We measured few cycles, it could be that the kernel
1333 * and transfer finished earlier and there was no actual
1334 * wait time, only API call overhead.
1335 * Then the actual time could be anywhere between 0 and
1336 * cycles_wait_est. As a compromise, we use half the time.
1338 cycles_wait_est *= 0.5f;
1341 else
1343 /* No force communication so we actually timed the wait */
1344 cycles_wait_est = cycles_tmp;
1346 /* Even though this is after dd_move_f, the actual task we are
1347 * waiting for runs asynchronously with dd_move_f and we usually
1348 * have nothing to balance it with, so we can and should add
1349 * the time to the force time for load balancing.
1351 cycles_force += cycles_wait_est;
1352 cycles_wait_gpu += cycles_wait_est;
1354 /* now clear the GPU outputs while we finish the step on the CPU */
1355 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1356 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1357 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1359 else
1361 wallcycle_start_nocount(wcycle, ewcFORCE);
1362 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1363 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1364 nrnb, wcycle);
1365 wallcycle_stop(wcycle, ewcFORCE);
1367 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1368 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1369 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1370 nbv->grp[eintLocal].nbat, f);
1371 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1372 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1375 if (DOMAINDECOMP(cr))
1377 dd_force_flop_stop(cr->dd, nrnb);
1378 if (wcycle)
1380 dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1381 if (bUseGPU)
1383 dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
1388 if (bDoForces)
1390 /* Collect forces from modules */
1391 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1392 if (fr->bF_NoVirSum)
1394 fNoVirSum = *fr->f_novirsum;
1396 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
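/* fr->forceProviders holds the registered IForceProvider modules (e.g. an
 * applied electric field). Both the normal force buffer and, when present,
 * the no-virial-summation buffer are passed in, so a provider can route
 * forces that must not enter the single-sum virial into the latter.
 */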
1398 /* If we have NoVirSum forces, but we do not calculate the virial,
1399 * we sum fr->f_novirsum=f later.
1401 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1403 wallcycle_start(wcycle, ewcVSITESPREAD);
1404 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1405 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1406 wallcycle_stop(wcycle, ewcVSITESPREAD);
1409 if (flags & GMX_FORCE_VIRIAL)
1411 /* Calculation of the virial must be done after vsites! */
1412 calc_virial(0, mdatoms->homenr, x, f,
1413 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1417 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1419 /* Since the COM pulling is always done mass-weighted, no forces are
1420 * applied to vsites and this call can be done after vsite spreading.
1422 pull_potential_wrapper(cr, inputrec, box, x,
1423 f, vir_force, mdatoms, enerd, lambda, t,
1424 wcycle);
1427 /* Add the forces from enforced rotation potentials (if any) */
1428 if (inputrec->bRot)
1430 wallcycle_start(wcycle, ewcROTadd);
1431 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1432 wallcycle_stop(wcycle, ewcROTadd);
1435 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1436 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1438 if (PAR(cr) && !(cr->duty & DUTY_PME))
1440 /* In case of node-splitting, the PP nodes receive the long-range
1441 * forces, virial and energy from the PME nodes here.
1443 pme_receive_force_ener(cr, wcycle, enerd, fr);
1446 if (bDoForces)
1448 post_process_forces(cr, step, nrnb, wcycle,
1449 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1450 flags);
1453 if (flags & GMX_FORCE_ENERGY)
1455 /* Sum the potential energy terms from group contributions */
1456 sum_epot(&(enerd->grpp), enerd->term);
1458 if (!EI_TPI(inputrec->eI))
1460 checkPotentialEnergyValidity(enerd);
1465 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1466 t_inputrec *inputrec,
1467 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1468 gmx_localtop_t *top,
1469 gmx_groups_t *groups,
1470 matrix box, rvec x[], history_t *hist,
1471 PaddedRVecVector *force,
1472 tensor vir_force,
1473 t_mdatoms *mdatoms,
1474 gmx_enerdata_t *enerd, t_fcdata *fcd,
1475 real *lambda, t_graph *graph,
1476 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1477 double t, gmx_edsam_t ed,
1478 gmx_bool bBornRadii,
1479 int flags)
1481 int cg0, cg1, i, j;
1482 double mu[2*DIM];
1483 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1484 gmx_bool bDoForces;
1485 float cycles_pme, cycles_force;
1487 const int start = 0;
1488 const int homenr = mdatoms->homenr;
1490 clear_mat(vir_force);
1492 cg0 = 0;
1493 if (DOMAINDECOMP(cr))
1495 cg1 = cr->dd->ncg_tot;
1497 else
1499 cg1 = top->cgs.nr;
1501 if (fr->n_tpi > 0)
1503 cg1--;
1506 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1507 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1508 /* Should we perform the long-range nonbonded evaluation inside the neighbour searching? */
1509 bFillGrid = (bNS && bStateChanged);
1510 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1511 bDoForces = (flags & GMX_FORCE_FORCES);
1513 if (bStateChanged)
1515 update_forcerec(fr, box);
1517 if (inputrecNeedMutot(inputrec))
1519 /* Calculate total (local) dipole moment in a temporary common array.
1520 * This makes it possible to sum them over nodes faster.
1522 calc_mu(start, homenr,
1523 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1524 mu, mu+DIM);
1528 if (fr->ePBC != epbcNONE)
1530 /* Compute shift vectors every step,
1531 * because of pressure coupling or box deformation!
1533 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1535 calc_shifts(box, fr->shift_vec);
1538 if (bCalcCGCM)
1540 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1541 &(top->cgs), x, fr->cg_cm);
1542 inc_nrnb(nrnb, eNR_CGCM, homenr);
1543 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1545 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1547 unshift_self(graph, box, x);
1550 else if (bCalcCGCM)
1552 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1553 inc_nrnb(nrnb, eNR_CGCM, homenr);
1556 if (bCalcCGCM && gmx_debug_at)
1558 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1561 #if GMX_MPI
1562 if (!(cr->duty & DUTY_PME))
1564 gmx_bool bBS;
1565 matrix boxs;
1567 /* Send particle coordinates to the pme nodes.
1568 * Since this is only implemented for domain decomposition
1569 * and domain decomposition does not use the graph,
1570 * we do not need to worry about shifting.
1573 wallcycle_start(wcycle, ewcPP_PMESENDX);
1575 bBS = (inputrec->nwall == 2);
1576 if (bBS)
1578 copy_mat(box, boxs);
1579 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1582 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1583 lambda[efptCOUL], lambda[efptVDW],
1584 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1585 step);
1587 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1589 #endif /* GMX_MPI */
1591 /* Communicate coordinates and sum dipole if necessary */
1592 if (DOMAINDECOMP(cr))
1594 wallcycle_start(wcycle, ewcMOVEX);
1595 dd_move_x(cr->dd, box, x);
1596 wallcycle_stop(wcycle, ewcMOVEX);
1599 if (inputrecNeedMutot(inputrec))
1601 if (bStateChanged)
1603 if (PAR(cr))
1605 gmx_sumd(2*DIM, mu, cr);
1607 for (i = 0; i < 2; i++)
1609 for (j = 0; j < DIM; j++)
1611 fr->mu_tot[i][j] = mu[i*DIM + j];
1615 if (fr->efep == efepNO)
1617 copy_rvec(fr->mu_tot[0], mu_tot);
1619 else
1621 for (j = 0; j < DIM; j++)
1623 mu_tot[j] =
1624 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1629 /* Reset energies */
1630 reset_enerdata(enerd);
1631 clear_rvecs(SHIFTS, fr->fshift);
1633 if (bNS)
1635 wallcycle_start(wcycle, ewcNS);
1637 if (graph && bStateChanged)
1639 /* Calculate intramolecular shift vectors to make molecules whole */
1640 mk_mshift(fplog, graph, fr->ePBC, box, x);
1643 /* Do the actual neighbour searching */
1644 ns(fplog, fr, box,
1645 groups, top, mdatoms,
1646 cr, nrnb, bFillGrid);
1648 wallcycle_stop(wcycle, ewcNS);
1651 if (inputrec->implicit_solvent && bNS)
1653 make_gb_nblist(cr, inputrec->gb_algorithm,
1654 x, box, fr, &top->idef, graph, fr->born);
1657 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1659 wallcycle_start(wcycle, ewcPPDURINGPME);
1660 dd_force_flop_start(cr->dd, nrnb);
1663 if (inputrec->bRot)
1665 /* Enforced rotation has its own cycle counter that starts after the collective
1666 * coordinates have been communicated. It is added to ddCyclF to allow
1667 * for proper load-balancing */
1668 wallcycle_start(wcycle, ewcROT);
1669 do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1670 wallcycle_stop(wcycle, ewcROT);
1673 /* Temporary solution until all routines take PaddedRVecVector */
1674 rvec *f = as_rvec_array(force->data());
1676 /* Start the force cycle counter.
1677 * This counter is stopped after do_force_lowlevel.
1678 * No parallel communication should occur while this counter is running,
1679 * since that will interfere with the dynamic load balancing.
1681 wallcycle_start(wcycle, ewcFORCE);
1683 if (bDoForces)
1685 /* Reset forces for which the virial is calculated separately:
1686 * PME/Ewald forces if necessary */
1687 if (fr->bF_NoVirSum)
1689 if (flags & GMX_FORCE_VIRIAL)
1691 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1692 /* TODO: remove this - 1 when padding is properly implemented */
1693 clear_rvecs(fr->f_novirsum->size() - 1,
1694 as_rvec_array(fr->f_novirsum->data()));
1696 else
1698 /* We are not calculating the pressure so we do not need
1699 * a separate array for forces that do not contribute
1700 * to the pressure.
1702 fr->f_novirsum = force;
1706 /* Clear the short- and long-range forces */
1707 clear_rvecs(fr->natoms_force_constr, f);
1709 clear_rvec(fr->vir_diag_posres);
1711 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1713 clear_pull_forces(inputrec->pull_work);
1716 /* update QMMMrec, if necessary */
1717 if (fr->bQMMM)
1719 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1722 /* Compute the bonded and non-bonded energies and optionally forces */
1723 do_force_lowlevel(fr, inputrec, &(top->idef),
1724 cr, nrnb, wcycle, mdatoms,
1725 x, hist, f, enerd, fcd, top, fr->born,
1726 bBornRadii, box,
1727 inputrec->fepvals, lambda,
1728 graph, &(top->excls), fr->mu_tot,
1729 flags,
1730 &cycles_pme);
1732 cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1734 if (ed)
1736 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1739 if (DOMAINDECOMP(cr))
1741 dd_force_flop_stop(cr->dd, nrnb);
1742 if (wcycle)
1744 dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1748 if (bDoForces)
1750 /* Collect forces from modules */
1751 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1752 if (fr->bF_NoVirSum)
1754 fNoVirSum = *fr->f_novirsum;
1756 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1758 /* Communicate the forces */
1759 if (DOMAINDECOMP(cr))
1761 wallcycle_start(wcycle, ewcMOVEF);
1762 dd_move_f(cr->dd, f, fr->fshift);
1763 /* Do we need to communicate the separate force array
1764 * for terms that do not contribute to the single sum virial?
1765 * Position restraints and electric fields do not introduce
1766 * inter-cg forces, only full electrostatics methods do.
1767 * When we do not calculate the virial, fr->f_novirsum = f,
1768 * so we have already communicated these forces.
1770 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1771 (flags & GMX_FORCE_VIRIAL))
1773 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1775 wallcycle_stop(wcycle, ewcMOVEF);
1778 /* If we have NoVirSum forces, but we do not calculate the virial,
1779 * we sum fr->f_novirsum=f later.
1781 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1783 wallcycle_start(wcycle, ewcVSITESPREAD);
1784 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1785 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1786 wallcycle_stop(wcycle, ewcVSITESPREAD);
1789 if (flags & GMX_FORCE_VIRIAL)
1791 /* Calculation of the virial must be done after vsites! */
1792 calc_virial(0, mdatoms->homenr, x, f,
1793 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1797 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1799 pull_potential_wrapper(cr, inputrec, box, x,
1800 f, vir_force, mdatoms, enerd, lambda, t,
1801 wcycle);
1804 /* Add the forces from enforced rotation potentials (if any) */
1805 if (inputrec->bRot)
1807 wallcycle_start(wcycle, ewcROTadd);
1808 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1809 wallcycle_stop(wcycle, ewcROTadd);
1812 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1813 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1815 if (PAR(cr) && !(cr->duty & DUTY_PME))
1817 /* In case of node-splitting, the PP nodes receive the long-range
1818 * forces, virial and energy from the PME nodes here.
1820 pme_receive_force_ener(cr, wcycle, enerd, fr);
1823 if (bDoForces)
1825 post_process_forces(cr, step, nrnb, wcycle,
1826 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1827 flags);
1830 if (flags & GMX_FORCE_ENERGY)
1832 /* Sum the potential energy terms from group contributions */
1833 sum_epot(&(enerd->grpp), enerd->term);
1835 if (!EI_TPI(inputrec->eI))
1837 checkPotentialEnergyValidity(enerd);
1843 void do_force(FILE *fplog, t_commrec *cr,
1844 t_inputrec *inputrec,
1845 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1846 gmx_localtop_t *top,
1847 gmx_groups_t *groups,
1848 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1849 PaddedRVecVector *force,
1850 tensor vir_force,
1851 t_mdatoms *mdatoms,
1852 gmx_enerdata_t *enerd, t_fcdata *fcd,
1853 gmx::ArrayRef<real> lambda, t_graph *graph,
1854 t_forcerec *fr,
1855 gmx_vsite_t *vsite, rvec mu_tot,
1856 double t, gmx_edsam_t ed,
1857 gmx_bool bBornRadii,
1858 int flags)
1860 /* modify force flag if not doing nonbonded */
1861 if (!fr->bNonbonded)
1863 flags &= ~GMX_FORCE_NONBONDED;
1866 GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1867 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1868 GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1869 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
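/* The extra element is needed because (unaligned) 4-wide SIMD loads of rvec
 * data may read one rvec past the last atom (see the matching comment in
 * do_constrain_first() below).
 */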
1871 rvec *x = as_rvec_array(coordinates->data());
1873 switch (inputrec->cutoff_scheme)
1875 case ecutsVERLET:
1876 do_force_cutsVERLET(fplog, cr, inputrec,
1877 step, nrnb, wcycle,
1878 top,
1879 groups,
1880 box, x, hist,
1881 force, vir_force,
1882 mdatoms,
1883 enerd, fcd,
1884 lambda.data(), graph,
1885 fr, fr->ic,
1886 vsite, mu_tot,
1887 t, ed,
1888 bBornRadii,
1889 flags);
1890 break;
1891 case ecutsGROUP:
1892 do_force_cutsGROUP(fplog, cr, inputrec,
1893 step, nrnb, wcycle,
1894 top,
1895 groups,
1896 box, x, hist,
1897 force, vir_force,
1898 mdatoms,
1899 enerd, fcd,
1900 lambda.data(), graph,
1901 fr, vsite, mu_tot,
1902 t, ed,
1903 bBornRadii,
1904 flags);
1905 break;
1906 default:
1907 gmx_incons("Invalid cut-off scheme passed!");
1912 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1913 t_inputrec *ir, t_mdatoms *md,
1914 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1915 t_forcerec *fr, gmx_localtop_t *top)
1917 int i, m, start, end;
1918 gmx_int64_t step;
1919 real dt = ir->delta_t;
1920 real dvdl_dum;
1921 rvec *savex;
1923 /* We need to allocate one element extra, since we might use
1924 * (unaligned) 4-wide SIMD loads to access rvec entries.
1926 snew(savex, state->natoms + 1);
1928 start = 0;
1929 end = md->homenr;
1931 if (debug)
1933 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1934 start, md->homenr, end);
1936 /* Do a first constraining step to reset the particles... */
1937 step = ir->init_step;
1938 if (fplog)
1940 char buf[STEPSTRSIZE];
1941 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1942 gmx_step_str(step, buf));
1944 dvdl_dum = 0;
1946 /* constrain the current position */
1947 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1948 ir, cr, step, 0, 1.0, md,
1949 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1950 fr->bMolPBC, state->box,
1951 state->lambda[efptBONDED], &dvdl_dum,
1952 nullptr, nullptr, nrnb, econqCoord);
1953 if (EI_VV(ir->eI))
1955 /* constrain the initial velocity, and save it */
1956 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1957 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1958 ir, cr, step, 0, 1.0, md,
1959 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1960 fr->bMolPBC, state->box,
1961 state->lambda[efptBONDED], &dvdl_dum,
1962 nullptr, nullptr, nrnb, econqVeloc);
1964 /* constrain the initial velocities at t=-dt/2 */
1965 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1967 for (i = start; (i < end); i++)
1969 for (m = 0; (m < DIM); m++)
1971 /* Reverse the velocity */
1972 state->v[i][m] = -state->v[i][m];
1973 /* Store the position at t=-dt in savex */
1974 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
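/* Since v has just been negated, this stores savex = x(0) - dt*v(0), i.e. an
 * estimate of the positions one full step back in time; these are constrained
 * below with x(0) as the reference.
 */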
1977 /* Shake the positions at t=-dt with the positions at t=0
1978 * as reference coordinates.
1980 if (fplog)
1982 char buf[STEPSTRSIZE];
1983 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1984 gmx_step_str(step, buf));
1986 dvdl_dum = 0;
1987 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1988 ir, cr, step, -1, 1.0, md,
1989 as_rvec_array(state->x.data()), savex, nullptr,
1990 fr->bMolPBC, state->box,
1991 state->lambda[efptBONDED], &dvdl_dum,
1992 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
1994 for (i = start; i < end; i++)
1996 for (m = 0; m < DIM; m++)
1998 /* Re-reverse the velocities */
1999 state->v[i][m] = -state->v[i][m];
2003 sfree(savex);
2007 static void
2008 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2009 double *enerout, double *virout)
2011 double enersum, virsum;
2012 double invscale, invscale2, invscale3;
2013 double r, ea, eb, ec, pa, pb, pc, pd;
2014 double y0, f, g, h;
2015 int ri, offset;
2016 double tabfactor;
2018 invscale = 1.0/scale;
2019 invscale2 = invscale*invscale;
2020 invscale3 = invscale*invscale2;
2022 /* Following summation derived from cubic spline definition,
2023 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2024 * the cubic spline. We first calculate the negative of the
2025 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2026 * add the more standard, abrupt cutoff correction to that result,
2027 * yielding the long-range correction for a switched function. We
2028 * perform both the pressure and energy loops at the same time for
2029 * simplicity, as the computational cost is low. */
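/* Within each table bin the potential is a cubic polynomial in the offset
 * from the bin start, so the loop below accumulates the analytic bin-by-bin
 * values of integral V(r) r^2 dr (enersum) and integral r^3 dV/dr dr (virsum),
 * which are then scaled by 4*pi and the 6/12 table factor before being
 * returned (sign and density factors are applied by the caller).
 */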
2031 if (offstart == 0)
2033 /* Since the dispersion table has been scaled down a factor
2034 * 6.0 and the repulsion a factor 12.0 to compensate for the
2035 * c6/c12 parameters inside nbfp[] being scaled up (to save
2036 * flops in kernels), we need to correct for this.
2038 tabfactor = 6.0;
2040 else
2042 tabfactor = 12.0;
2045 enersum = 0.0;
2046 virsum = 0.0;
2047 for (ri = rstart; ri < rend; ++ri)
2049 r = ri*invscale;
2050 ea = invscale3;
2051 eb = 2.0*invscale2*r;
2052 ec = invscale*r*r;
2054 pa = invscale3;
2055 pb = 3.0*invscale2*r;
2056 pc = 3.0*invscale*r*r;
2057 pd = r*r*r;
2059 /* this "8" is from the packing in the vdwtab array - perhaps
2060 should be defined? */
2062 offset = 8*ri + offstart;
2063 y0 = vdwtab[offset];
2064 f = vdwtab[offset+1];
2065 g = vdwtab[offset+2];
2066 h = vdwtab[offset+3];
2068 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2069 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2071 *enerout = 4.0*M_PI*enersum*tabfactor;
2072 *virout = 4.0*M_PI*virsum*tabfactor;
2075 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2077 double eners[2], virs[2], enersum, virsum;
2078 double r0, rc3, rc9;
2079 int ri0, ri1, i;
2080 real scale, *vdwtab;
2082 fr->enershiftsix = 0;
2083 fr->enershifttwelve = 0;
2084 fr->enerdiffsix = 0;
2085 fr->enerdifftwelve = 0;
2086 fr->virdiffsix = 0;
2087 fr->virdifftwelve = 0;
2089 if (eDispCorr != edispcNO)
2091 for (i = 0; i < 2; i++)
2093 eners[i] = 0;
2094 virs[i] = 0;
2096 if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2097 (fr->vdw_modifier == eintmodPOTSWITCH) ||
2098 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2099 (fr->vdwtype == evdwSHIFT) ||
2100 (fr->vdwtype == evdwSWITCH))
2102 if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2103 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2104 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2106 gmx_fatal(FARGS,
2107 "With dispersion correction rvdw-switch can not be zero "
2108 "for vdw-type = %s", evdw_names[fr->vdwtype]);
2111 /* TODO This code depends on the logic in tables.c that
2112 constructs the table layout, which should be made
2113 explicit in future cleanup. */
2114 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2115 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2116 scale = fr->dispersionCorrectionTable->scale;
2117 vdwtab = fr->dispersionCorrectionTable->data;
2119 /* Round the cut-offs to exact table values for precision */
2120 ri0 = static_cast<int>(floor(fr->rvdw_switch*scale));
2121 ri1 = static_cast<int>(ceil(fr->rvdw*scale));
2123 /* The code below has some support for handling force-switching, i.e.
2124 * when the force (instead of potential) is switched over a limited
2125 * region. This leads to a constant shift in the potential inside the
2126 * switching region, which we can handle by adding a constant energy
2127 * term in the force-switch case just like when we do potential-shift.
2129 * For now this is not enabled, but to keep the functionality in the
2130 * code we check separately for switch and shift. When we do force-switch
2131 * the shifting point is rvdw_switch, while it is the cutoff when we
2132 * have a classical potential-shift.
2134 * For a pure potential-shift the potential has a constant shift
2135 * all the way out to the cutoff, and that is it. For other forms
2136 * we need to calculate the constant shift up to the point where we
2137 * start modifying the potential.
2139 ri0 = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2141 r0 = ri0/scale;
2142 rc3 = r0*r0*r0;
2143 rc9 = rc3*rc3*rc3;
2145 if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2146 (fr->vdwtype == evdwSHIFT))
2148 /* Determine the constant energy shift below rvdw_switch.
2149 * Table has a scale factor since we have scaled it down to compensate
2150 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2152 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2153 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2155 else if (fr->vdw_modifier == eintmodPOTSHIFT)
2157 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2158 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2161 /* Add the constant part from 0 to rvdw_switch.
2162 * This integration from 0 to rvdw_switch overcounts the number
2163 * of interactions by 1, as it also counts the self interaction.
2164 * We will correct for this later.
2166 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2167 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
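/* Here rc3 = r0^3, and the factors 6.0 and 12.0 above undo the scaling of the
 * dispersion and repulsion columns of the table, so enershiftsix and
 * enershifttwelve are the constant offsets between the plain 1/r^6 and 1/r^12
 * potentials and the modified potential at r0. Integrating such a constant
 * over the sphere of radius r0 gives the 4*M_PI*shift*rc3/3 terms just added.
 */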
2169 /* Calculate the contribution in the range [r0,r1] where we
2170 * modify the potential. For a pure potential-shift modifier we will
2171 * have ri0==ri1, and there will not be any contribution here.
2173 for (i = 0; i < 2; i++)
2175 enersum = 0;
2176 virsum = 0;
2177 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2178 eners[i] -= enersum;
2179 virs[i] -= virsum;
2182 /* Alright: Above we compensated by REMOVING the parts outside r0
2183 * corresponding to the ideal VdW 1/r6 and 1/r12 potentials.
2185 * Regardless of whether r0 is the point where we start switching,
2186 * or the cutoff where we calculated the constant shift, we include
2187 * all the parts we are missing out to infinity from r0 by
2188 * calculating the analytical dispersion correction.
2190 eners[0] += -4.0*M_PI/(3.0*rc3);
2191 eners[1] += 4.0*M_PI/(9.0*rc9);
2192 virs[0] += 8.0*M_PI/rc3;
2193 virs[1] += -16.0*M_PI/(3.0*rc9);
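/* These are the standard analytical tail integrals of the unmodified
 * potentials beyond r0, per unit c6/c12 (the actual coefficients and particle
 * densities are applied later in calc_dispcorr()):
 *   int_{r0..inf} 4*M_PI*r^2*(-1/r^6)  dr = -4*M_PI/(3*rc3)
 *   int_{r0..inf} 4*M_PI*r^2*(+1/r^12) dr =  4*M_PI/(9*rc9)
 * and for the virial int_{r0..inf} 4*M_PI*r^3*(dV/dr) dr evaluates to
 * 8*M_PI/rc3 for the dispersion term and -16*M_PI/(3*rc9) for the repulsion.
 */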
2195 else if (fr->vdwtype == evdwCUT ||
2196 EVDW_PME(fr->vdwtype) ||
2197 fr->vdwtype == evdwUSER)
2199 if (fr->vdwtype == evdwUSER && fplog)
2201 fprintf(fplog,
2202 "WARNING: using dispersion correction with user tables\n");
2205 /* Note that with LJ-PME, the dispersion correction is multiplied
2206 * by the difference between the actual C6 and the C6 that the
2207 * combination rule would produce.
2208 * This means the normal energy and virial difference formulas
2209 * can be used here.
2212 rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
2213 rc9 = rc3*rc3*rc3;
2214 /* Contribution beyond the cut-off */
2215 eners[0] += -4.0*M_PI/(3.0*rc3);
2216 eners[1] += 4.0*M_PI/(9.0*rc9);
2217 if (fr->vdw_modifier == eintmodPOTSHIFT)
2219 /* Contribution within the cut-off */
2220 eners[0] += -4.0*M_PI/(3.0*rc3);
2221 eners[1] += 4.0*M_PI/(3.0*rc9);
2223 /* Contribution beyond the cut-off */
2224 virs[0] += 8.0*M_PI/rc3;
2225 virs[1] += -16.0*M_PI/(3.0*rc9);
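/* With the potential-shift modifier the simulated potential differs from the
 * plain one by the constant -V(rvdw) inside the cut-off; correcting for this
 * adds V(rvdw) per interaction, which integrated over the cut-off sphere
 * gives the -4*M_PI/(3*rc3) and +4*M_PI/(3*rc9) terms above. A constant shift
 * exerts no force, so there is no corresponding virial term.
 */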
2227 else
2229 gmx_fatal(FARGS,
2230 "Dispersion correction is not implemented for vdw-type = %s",
2231 evdw_names[fr->vdwtype]);
2234 /* When we deprecate the group kernels the code below can go too */
2235 if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2237 /* Calculate self-interaction coefficient (assuming that
2238 * the reciprocal-space contribution is constant in the
2239 * region that contributes to the self-interaction).
2241 fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2243 eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2244 virs[0] += gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2247 fr->enerdiffsix = eners[0];
2248 fr->enerdifftwelve = eners[1];
2249 /* The 0.5 is due to the Gromacs definition of the virial */
2250 fr->virdiffsix = 0.5*virs[0];
2251 fr->virdifftwelve = 0.5*virs[1];
2255 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2256 matrix box, real lambda, tensor pres, tensor virial,
2257 real *prescorr, real *enercorr, real *dvdlcorr)
2259 gmx_bool bCorrAll, bCorrPres;
2260 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2261 int m;
2263 *prescorr = 0;
2264 *enercorr = 0;
2265 *dvdlcorr = 0;
2267 clear_mat(virial);
2268 clear_mat(pres);
2270 if (ir->eDispCorr != edispcNO)
2272 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2273 ir->eDispCorr == edispcAllEnerPres);
2274 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2275 ir->eDispCorr == edispcAllEnerPres);
2277 invvol = 1/det(box);
2278 if (fr->n_tpi)
2280 /* Only correct for the interactions with the inserted molecule */
2281 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2282 ninter = fr->n_tpi;
2284 else
2286 dens = fr->numAtomsForDispersionCorrection*invvol;
2287 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2290 if (ir->efep == efepNO)
2292 avcsix = fr->avcsix[0];
2293 avctwelve = fr->avctwelve[0];
2295 else
2297 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2298 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
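/* With free energy on, the average C6 and C12 are interpolated linearly
 * between the A (lambda = 0) and B (lambda = 1) states.
 */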
2301 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2302 *enercorr += avcsix*enerdiff;
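/* In the common case (plain cut-off, no shift terms, no TPI) this reduces to
 * the textbook long-range dispersion correction
 *   Ecorr = 0.5*N * (N/V) * <C6> * (-4*M_PI/(3*rvdw^3))
 *         = -2*M_PI*N^2*<C6>/(3*V*rvdw^3)
 * with N the number of atoms included in the correction.
 */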
2303 dvdlambda = 0.0;
2304 if (ir->efep != efepNO)
2306 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2308 if (bCorrAll)
2310 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2311 *enercorr += avctwelve*enerdiff;
2312 if (fr->efep != efepNO)
2314 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2318 if (bCorrPres)
2320 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2321 if (ir->eDispCorr == edispcAllEnerPres)
2323 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2325 /* The factor 2 is because of the Gromacs virial definition */
2326 spres = -2.0*invvol*svir*PRESFAC;
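/* Per the GROMACS virial/pressure convention, P = (2/V)*(Ekin - Xi), so a
 * correction svir to each diagonal element of the virial changes the
 * corresponding pressure element by -2*svir/V, converted to bar with PRESFAC.
 */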
2328 for (m = 0; m < DIM; m++)
2330 virial[m][m] += svir;
2331 pres[m][m] += spres;
2333 *prescorr += spres;
2336 /* Can't currently control when it prints; for now, just print when debugging */
2337 if (debug)
2339 if (bCorrAll)
2341 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2342 avcsix, avctwelve);
2344 if (bCorrPres)
2346 fprintf(debug,
2347 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2348 *enercorr, spres, svir);
2350 else
2352 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2356 if (fr->efep != efepNO)
2358 *dvdlcorr += dvdlambda;
2363 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2364 const gmx_mtop_t *mtop, rvec x[],
2365 gmx_bool bFirst)
2367 t_graph *graph;
2368 int mb, as, mol;
2369 gmx_molblock_t *molb;
2371 if (bFirst && fplog)
2373 fprintf(fplog, "Removing pbc first time\n");
2376 snew(graph, 1);
2377 as = 0;
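/* as is the running index in x[] of the first atom of the current molecule block */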
2378 for (mb = 0; mb < mtop->nmolblock; mb++)
2380 molb = &mtop->molblock[mb];
2381 if (molb->natoms_mol == 1 ||
2382 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2384 /* Just one atom or charge group in the molecule, no PBC required */
2385 as += molb->nmol*molb->natoms_mol;
2387 else
2389 /* Pass nullptr instead of fplog to avoid graph prints for each molecule type */
2390 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2391 0, molb->natoms_mol, FALSE, FALSE, graph);
2393 for (mol = 0; mol < molb->nmol; mol++)
2395 mk_mshift(fplog, graph, ePBC, box, x+as);
2397 shift_self(graph, box, x+as);
2398 /* The molecule is whole now.
2399 * We don't need the second mk_mshift call as in do_pbc_first,
2400 * since we no longer need this graph.
2403 as += molb->natoms_mol;
2405 done_graph(graph);
2408 sfree(graph);
2411 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2412 const gmx_mtop_t *mtop, rvec x[])
2414 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2417 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2418 gmx_mtop_t *mtop, rvec x[])
2420 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2423 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2425 int t, nth;
2426 nth = gmx_omp_nthreads_get(emntDefault);
2428 #pragma omp parallel for num_threads(nth) schedule(static)
2429 for (t = 0; t < nth; t++)
2433 int offset, len;
2435 offset = (natoms*t )/nth;
2436 len = (natoms*(t + 1))/nth - offset;
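/* This integer arithmetic splits [0,natoms) into nth contiguous chunks that
 * differ in size by at most one atom and cover the range exactly, e.g.
 * natoms = 10, nth = 3 gives chunks of 3, 3 and 4 atoms.
 */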
2437 put_atoms_in_box(ePBC, box, len, x + offset);
2439 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2443 // TODO This can be cleaned up a lot, and move back to runner.cpp
2444 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2445 t_inputrec *inputrec,
2446 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2447 gmx_walltime_accounting_t walltime_accounting,
2448 nonbonded_verlet_t *nbv,
2449 gmx_bool bWriteStat)
2451 t_nrnb *nrnb_tot = nullptr;
2452 double delta_t = 0;
2453 double nbfs = 0, mflop = 0;
2454 double elapsed_time,
2455 elapsed_time_over_all_ranks,
2456 elapsed_time_over_all_threads,
2457 elapsed_time_over_all_threads_over_all_ranks;
2458 /* Control whether it is valid to print a report. Only the
2459 simulation master may print, but it should not do so if the run
2460 terminated e.g. before a scheduled reset step. This is
2461 complicated by the fact that PME ranks are unaware of the
2462 reason why they were sent a pmerecvqxFINISH. To avoid
2463 communication deadlocks, we always do the communication for the
2464 report, even if we've decided not to write the report, because
2465 how long it takes to finish the run is not important when we've
2466 decided not to report on the simulation performance. */
2467 bool printReport = SIMMASTER(cr);
2469 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2471 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2472 printReport = false;
2475 if (cr->nnodes > 1)
2477 snew(nrnb_tot, 1);
2478 #if GMX_MPI
2479 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2480 cr->mpi_comm_mysim);
2481 #endif
2483 else
2485 nrnb_tot = nrnb;
2488 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2489 elapsed_time_over_all_ranks = elapsed_time;
2490 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2491 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2492 #if GMX_MPI
2493 if (cr->nnodes > 1)
2495 /* reduce elapsed_time over all MPI ranks in the current simulation */
2496 MPI_Allreduce(&elapsed_time,
2497 &elapsed_time_over_all_ranks,
2498 1, MPI_DOUBLE, MPI_SUM,
2499 cr->mpi_comm_mysim);
2500 elapsed_time_over_all_ranks /= cr->nnodes;
2501 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2502 * current simulation. */
2503 MPI_Allreduce(&elapsed_time_over_all_threads,
2504 &elapsed_time_over_all_threads_over_all_ranks,
2505 1, MPI_DOUBLE, MPI_SUM,
2506 cr->mpi_comm_mysim);
2508 #endif
2510 if (printReport)
2512 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2514 if (cr->nnodes > 1)
2516 sfree(nrnb_tot);
2519 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2521 print_dd_statistics(cr, inputrec, fplog);
2524 /* TODO Move the responsibility for any scaling by thread counts
2525 * to the code that handled the thread region, so that there's a
2526 * mechanism to keep cycle counting working during the transition
2527 * to task parallelism. */
2528 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2529 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2530 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2531 auto cycle_sum(wallcycle_sum(cr, wcycle));
2533 if (printReport)
2535 struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2537 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2538 elapsed_time_over_all_ranks,
2539 wcycle, cycle_sum, gputimes);
2541 if (EI_DYNAMICS(inputrec->eI))
2543 delta_t = inputrec->delta_t;
2546 if (fplog)
2548 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2549 elapsed_time_over_all_ranks,
2550 walltime_accounting_get_nsteps_done(walltime_accounting),
2551 delta_t, nbfs, mflop);
2553 if (bWriteStat)
2555 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2556 elapsed_time_over_all_ranks,
2557 walltime_accounting_get_nsteps_done(walltime_accounting),
2558 delta_t, nbfs, mflop);
2563 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2565 /* this function works, but could probably use a logic rewrite to keep all the different
2566 types of efep straight. */
2568 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2570 return;
2573 t_lambda *fep = ir->fepvals;
2574 *fep_state = fep->init_fep_state; /* this might overwrite the value read from
2575 a checkpoint if one is set -- a kludge is in place
2576 for now to prevent this. */
2578 for (int i = 0; i < efptNR; i++)
2580 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2581 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2583 lambda[i] = fep->init_lambda;
2584 if (lam0)
2586 lam0[i] = lambda[i];
2589 else
2591 lambda[i] = fep->all_lambda[i][*fep_state];
2592 if (lam0)
2594 lam0[i] = lambda[i];
2598 if (ir->bSimTemp)
2600 /* need to rescale control temperatures to match current state */
2601 for (int i = 0; i < ir->opts.ngtc; i++)
2603 if (ir->opts.ref_t[i] > 0)
2605 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2610 /* Write the current lambda values to the log */
2611 if (fplog != nullptr)
2613 fprintf(fplog, "Initial vector of lambda components:[ ");
2614 for (const auto &l : lambda)
2616 fprintf(fplog, "%10.4f ", l);
2618 fprintf(fplog, "]\n");
2620 return;
2624 void init_md(FILE *fplog,
2625 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2626 t_inputrec *ir, const gmx_output_env_t *oenv,
2627 double *t, double *t0,
2628 gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2629 t_nrnb *nrnb, gmx_mtop_t *mtop,
2630 gmx_update_t **upd,
2631 int nfile, const t_filenm fnm[],
2632 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2633 tensor force_vir, tensor shake_vir, rvec mu_tot,
2634 gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2635 gmx_wallcycle_t wcycle)
2637 int i;
2639 /* Initial values */
2640 *t = *t0 = ir->init_t;
2642 *bSimAnn = FALSE;
2643 for (i = 0; i < ir->opts.ngtc; i++)
2645 /* set bSimAnn if any group is being annealed */
2646 if (ir->opts.annealing[i] != eannNO)
2648 *bSimAnn = TRUE;
2652 /* Initialize lambda variables */
2653 initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2655 // TODO upd is never NULL in practice, but the analysers don't know that
2656 if (upd)
2658 *upd = init_update(ir);
2660 if (*bSimAnn)
2662 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2665 if (vcm != nullptr)
2667 *vcm = init_vcm(fplog, &mtop->groups, ir);
2670 if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2672 if (ir->etc == etcBERENDSEN)
2674 please_cite(fplog, "Berendsen84a");
2676 if (ir->etc == etcVRESCALE)
2678 please_cite(fplog, "Bussi2007a");
2680 if (ir->eI == eiSD1)
2682 please_cite(fplog, "Goga2012");
2685 init_nrnb(nrnb);
2687 if (nfile != -1)
2689 *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2691 *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2692 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2695 /* Initialize variables */
2696 clear_mat(force_vir);
2697 clear_mat(shake_vir);
2698 clear_rvec(mu_tot);