src/gromacs/mdlib/perf_est.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "perf_est.h"
  40
  41 #include <cmath>
  42
  43 #include "gromacs/math/functions.h"
  44 #include "gromacs/math/utilities.h"
  45 #include "gromacs/math/vec.h"
  46 #include "gromacs/mdlib/nbnxn_consts.h"
  47 #include "gromacs/mdlib/nbnxn_search.h"
  48 #include "gromacs/mdtypes/commrec.h"
  49 #include "gromacs/mdtypes/inputrec.h"
  50 #include "gromacs/mdtypes/md_enums.h"
  51 #include "gromacs/simd/simd.h"
  52 #include "gromacs/topology/ifunc.h"
  53 #include "gromacs/topology/topology.h"
  54 #include "gromacs/utility/fatalerror.h"
  55
  56 /* Computational cost of bonded, non-bonded and PME calculations.
  57  * This will be machine dependent.
  58  * The numbers are only used for estimating the relative cost of PME vs PP,
  59  * so only relative numbers matter.
  60  * The numbers here are accurate cycle counts for Haswell in single precision
  61  * compiled with gcc5.2. A correction factor for other architectures is given
  62  * by simd_cycle_factor().
  63  * In double precision PME mesh is slightly cheaper, although not so much
  64  * that the numbers need to be adjusted.
  65  */
  66
  67 /* Cost of a pair interaction in the "group" cut-off scheme */
  68 static const double c_group_fq        = 18.0;
  69 static const double c_group_qlj_cut   = 18.0;
  70 static const double c_group_qlj_tab   = 24.0;
  71 static const double c_group_lj_cut    = 12.0;
  72 static const double c_group_lj_tab    = 21.0;
  73 /* Cost of 1 water with one Q/LJ atom */
  74 static const double c_group_qljw_cut  = 24.0;
  75 static const double c_group_qljw_tab  = 27.0;
  76 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
  77 static const double c_group_qw        = 21.0;
  78
  79 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  80 static const double c_nbnxn_lj        =  2.5;
  81 static const double c_nbnxn_qrf_lj    =  2.9;
  82 static const double c_nbnxn_qrf       =  2.4;
  83 static const double c_nbnxn_qexp_lj   =  4.2;
  84 static const double c_nbnxn_qexp      =  3.8;
  85 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  86 static const double c_nbnxn_ljexp_add =  1.0;
  87
  88 /* Cost of the different components of PME. */
  89 /* Cost of particle reordering and redistribution (no SIMD correction).
  90  * This will be zero without MPI and can be very high with load imbalance.
  91  * Thus we use an approximate value for medium parallelization.
  92  */
  93 static const double c_pme_redist = 100.0;
  94 /* Cost of q spreading and force interpolation per charge. This part almost
  95  * doesn't accelerate with SIMD, so we don't use SIMD correction.
  96  */
  97 static const double c_pme_spread =   5.0;
  98 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
  99  * Without MPI the number is 2-3, depending on grid factors and thread count.
 100  * We take the high limit to be on the safe side and account for some MPI
 101  * communication cost, which will dominate at high parallelization.
 102  */
 103 static const double c_pme_fft    =   3.0;
 104 /* Cost of pme_solve, will be multiplied with N */
 105 static const double c_pme_solve  =   9.0;
 106
 107 /* Cost of a bonded interaction divided by the number of distances calculations
 108  * required in one interaction. The actual cost is nearly propotional to this.
 109  */
 110 static const double c_bond       =  25.0;
 111
 112
 113 #if GMX_SIMD_HAVE_REAL
 114 static const gmx_bool bHaveSIMD = TRUE;
 115 #else
 116 static const gmx_bool bHaveSIMD = FALSE;
 117 #endif
 118
 119 /* Gives a correction factor for the currently compiled SIMD implementations
 120  * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 121  * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
 122  */
 123 static double simd_cycle_factor(gmx_bool bUseSIMD)
 124 {
 125     /* The (average) ratio of the time taken by plain-C force calculations
 126      * relative to SIMD versions, for the reference platform Haswell:
 127      * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
 128      * This factor is used for normalization in simd_cycle_factor().
 129      */
 130     const double simd_cycle_no_simd = 5.0;
 131     double       speedup;
 132
 133 #if GMX_SIMD_HAVE_REAL
 134     if (bUseSIMD)
 135     {
 136         /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
 137          * The actual speed-up depends very much on gather+scatter overhead,
 138          * which is different for different bonded and non-bonded kernels.
 139          * As a rough, but actually not bad, approximation we use a sqrt
 140          * dependence on the width which gives a factor 4 for width=8.
 141          */
 142         speedup = std::sqrt(2.0*GMX_SIMD_REAL_WIDTH);
 143 #if GMX_SIMD_HAVE_FMA
 144         /* FMA tends to give a bit more speedup */
 145         speedup *= 1.25;
 146 #endif
 147     }
 148     else
 149     {
 150         speedup  = 1.0;
 151     }
 152 #else
 153     if (bUseSIMD)
 154     {
 155         gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
 156     }
 157     /* No SIMD, no speedup */
 158     speedup      = 1.0;
 159 #endif
 160
 161     /* Return speed compared to the reference (Haswell).
 162      * For x86 SIMD, the nbnxn kernels are relatively much slower on
 163      * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
 164      * PME load estimate on SB/IB, which is erring on the safe side.
 165      */
 166     return simd_cycle_no_simd/speedup;
 167 }
 168
 169 void count_bonded_distances(const gmx_mtop_t *mtop, const t_inputrec *ir,
 170                             double *ndistance_c, double *ndistance_simd)
 171 {
 172     gmx_bool       bExcl;
 173     double         nonsimd_step_frac;
 174     int            mb, nmol, ftype;
 175     gmx_moltype_t *molt;
 176     double         ndtot_c, ndtot_simd;
 177 #if GMX_SIMD_HAVE_REAL
 178     gmx_bool       bSimdBondeds = TRUE;
 179 #else
 180     gmx_bool       bSimdBondeds = FALSE;
 181 #endif
 182
 183     bExcl = (ir->cutoff_scheme == ecutsGROUP && inputrecExclForces(ir)
 184              && !EEL_FULL(ir->coulombtype));
 185
 186     if (bSimdBondeds)
 187     {
 188         /* We only have SIMD versions of these bondeds without energy and
 189          * without shift-forces, we take that into account here.
 190          */
 191         if (ir->nstcalcenergy > 0)
 192         {
 193             nonsimd_step_frac = 1.0/ir->nstcalcenergy;
 194         }
 195         else
 196         {
 197             nonsimd_step_frac = 0;
 198         }
 199         if (ir->epc != epcNO && 1.0/ir->nstpcouple > nonsimd_step_frac)
 200         {
 201             nonsimd_step_frac = 1.0/ir->nstpcouple;
 202         }
 203     }
 204     else
 205     {
 206         nonsimd_step_frac = 1;
 207     }
 208
 209     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 210      * This number is also roughly proportional to the computational cost.
 211      */
 212     ndtot_c    = 0;
 213     ndtot_simd = 0;
 214 #if defined _ICC && __ICC == 1400 || defined __ICL && __ICL == 1400
 215 #pragma novector /* Work-around for incorrect vectorization */
 216 #endif
 217     for (mb = 0; mb < mtop->nmolblock; mb++)
 218     {
 219         molt = &mtop->moltype[mtop->molblock[mb].type];
 220         nmol = mtop->molblock[mb].nmol;
 221         for (ftype = 0; ftype < F_NRE; ftype++)
 222         {
 223             int nbonds;
 224
 225             if (interaction_function[ftype].flags & IF_BOND)
 226             {
 227                 double nd_c, nd_simd;
 228
 229                 nd_c    = 0;
 230                 nd_simd = 0;
 231                 /* For all interactions, except for the three exceptions
 232                  * in the switch below, #distances = #atoms - 1.
 233                  */
 234                 switch (ftype)
 235                 {
 236                     case F_POSRES:
 237                     case F_FBPOSRES:
 238                         nd_c    = 1;
 239                         break;
 240                     case F_CONNBONDS:
 241                         break;
 242                     /* These bonded potentially use SIMD */
 243                     case F_ANGLES:
 244                     case F_PDIHS:
 245                     case F_RBDIHS:
 246                     case F_LJ14:
 247                         nd_c    =      nonsimd_step_frac *(NRAL(ftype) - 1);
 248                         nd_simd = (1 - nonsimd_step_frac)*(NRAL(ftype) - 1);
 249                         break;
 250                     default:
 251                         nd_c    = NRAL(ftype) - 1;
 252                         break;
 253                 }
 254                 nbonds      = nmol*molt->ilist[ftype].nr/(1 + NRAL(ftype));
 255                 ndtot_c    += nbonds*nd_c;
 256                 ndtot_simd += nbonds*nd_simd;
 257             }
 258         }
 259         if (bExcl)
 260         {
 261             ndtot_c += nmol*(molt->excls.nra - molt->atoms.nr)/2;
 262         }
 263     }
 264
 265     if (debug)
 266     {
 267         fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
 268     }
 269
 270     if (ndistance_c    != nullptr)
 271     {
 272         *ndistance_c    = ndtot_c;
 273     }
 274     if (ndistance_simd != nullptr)
 275     {
 276         *ndistance_simd = ndtot_simd;
 277     }
 278 }
 279
 280 static void pp_group_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
 281                           const matrix box,
 282                           int *nq_tot, int *nlj_tot,
 283                           double *cost_pp,
 284                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
 285 {
 286     t_atom        *atom;
 287     int            mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj;
 288     gmx_bool       bBHAM, bLJcut, bWater, bQ, bLJ;
 289     int            nw, nqlj, nq, nlj;
 290     double         fq, fqlj, flj, fqljw, fqw;
 291     t_iparams     *iparams;
 292     gmx_moltype_t *molt;
 293
 294     bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 295
 296     bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM);
 297
 298     /* Computational cost of bonded, non-bonded and PME calculations.
 299      * This will be machine dependent.
 300      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 301      * in single precision. In double precision PME mesh is slightly cheaper,
 302      * although not so much that the numbers need to be adjusted.
 303      */
 304     fq    = c_group_fq;
 305     fqlj  = (bLJcut ? c_group_qlj_cut : c_group_qlj_tab);
 306     flj   = (bLJcut ? c_group_lj_cut  : c_group_lj_tab);
 307     /* Cost of 1 water with one Q/LJ atom */
 308     fqljw = (bLJcut ? c_group_qljw_cut : c_group_qljw_tab);
 309     /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
 310     fqw   = c_group_qw;
 311
 312     iparams           = mtop->ffparams.iparams;
 313     atnr              = mtop->ffparams.atnr;
 314     nw                = 0;
 315     nqlj              = 0;
 316     nq                = 0;
 317     nlj               = 0;
 318     *bChargePerturbed = FALSE;
 319     *bTypePerturbed   = FALSE;
 320     for (mb = 0; mb < mtop->nmolblock; mb++)
 321     {
 322         molt = &mtop->moltype[mtop->molblock[mb].type];
 323         atom = molt->atoms.atom;
 324         nmol = mtop->molblock[mb].nmol;
 325         a    = 0;
 326         for (cg = 0; cg < molt->cgs.nr; cg++)
 327         {
 328             bWater = !bBHAM;
 329             ncqlj  = 0;
 330             ncq    = 0;
 331             nclj   = 0;
 332             a0     = a;
 333             while (a < molt->cgs.index[cg+1])
 334             {
 335                 bQ  = (atom[a].q != 0 || atom[a].qB != 0);
 336                 bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
 337                        iparams[(atnr+1)*atom[a].type].lj.c12 != 0);
 338                 if (atom[a].q != atom[a].qB)
 339                 {
 340                     *bChargePerturbed = TRUE;
 341                 }
 342                 if (atom[a].type != atom[a].typeB)
 343                 {
 344                     *bTypePerturbed = TRUE;
 345                 }
 346                 /* This if this atom fits into water optimization */
 347                 if (!((a == a0   &&  bQ &&  bLJ) ||
 348                       (a == a0+1 &&  bQ && !bLJ) ||
 349                       (a == a0+2 &&  bQ && !bLJ && atom[a].q == atom[a-1].q) ||
 350                       (a == a0+3 && !bQ &&  bLJ)))
 351                 {
 352                     bWater = FALSE;
 353                 }
 354                 if (bQ && bLJ)
 355                 {
 356                     ncqlj++;
 357                 }
 358                 else
 359                 {
 360                     if (bQ)
 361                     {
 362                         ncq++;
 363                     }
 364                     if (bLJ)
 365                     {
 366                         nclj++;
 367                     }
 368                 }
 369                 a++;
 370             }
 371             if (bWater)
 372             {
 373                 nw   += nmol;
 374             }
 375             else
 376             {
 377                 nqlj += nmol*ncqlj;
 378                 nq   += nmol*ncq;
 379                 nlj  += nmol*nclj;
 380             }
 381         }
 382     }
 383
 384     *nq_tot  = nq  + nqlj + nw*3;
 385     *nlj_tot = nlj + nqlj + nw;
 386
 387     if (debug)
 388     {
 389         fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj);
 390     }
 391
 392     /* For the PP non-bonded cost it is (unrealistically) assumed
 393      * that all atoms are distributed homogeneously in space.
 394      * Factor 3 is used because a water molecule has 3 atoms
 395      * (and TIP4P effectively has 3 interactions with (water) atoms)).
 396      */
 397     *cost_pp = 0.5*(fqljw*nw*nqlj +
 398                     fqw  *nw*(3*nw + nq) +
 399                     fqlj *nqlj*nqlj +
 400                     fq   *nq*(3*nw + nqlj + nq) +
 401                     flj  *nlj*(nw + nqlj + nlj))
 402         *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box);
 403
 404     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 405 }
 406
 407 static void pp_verlet_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
 408                            const matrix box,
 409                            int *nq_tot, int *nlj_tot,
 410                            double *cost_pp,
 411                            gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
 412 {
 413     t_atom        *atom;
 414     int            mb, nmol, atnr, a, nqlj, nq, nlj;
 415     gmx_bool       bQRF;
 416     t_iparams     *iparams;
 417     gmx_moltype_t *molt;
 418     real           r_eff;
 419     double         c_qlj, c_q, c_lj;
 420     double         nppa;
 421     int            j_cluster_size;
 422     /* Conversion factor for reference vs SIMD kernel performance.
 423      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 424      */
 425 #if GMX_DOUBLE
 426     const real     nbnxn_refkernel_fac = 4.0;
 427 #else
 428     const real     nbnxn_refkernel_fac = 8.0;
 429 #endif
 430
 431     bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);
 432
 433     iparams           = mtop->ffparams.iparams;
 434     atnr              = mtop->ffparams.atnr;
 435     nqlj              = 0;
 436     nq                = 0;
 437     *bChargePerturbed = FALSE;
 438     *bTypePerturbed   = FALSE;
 439     for (mb = 0; mb < mtop->nmolblock; mb++)
 440     {
 441         molt = &mtop->moltype[mtop->molblock[mb].type];
 442         atom = molt->atoms.atom;
 443         nmol = mtop->molblock[mb].nmol;
 444         for (a = 0; a < molt->atoms.nr; a++)
 445         {
 446             if (atom[a].q != 0 || atom[a].qB != 0)
 447             {
 448                 if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
 449                     iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
 450                 {
 451                     nqlj += nmol;
 452                 }
 453                 else
 454                 {
 455                     nq += nmol;
 456                 }
 457             }
 458             if (atom[a].q != atom[a].qB)
 459             {
 460                 *bChargePerturbed = TRUE;
 461             }
 462             if (atom[a].type != atom[a].typeB)
 463             {
 464                 *bTypePerturbed = TRUE;
 465             }
 466         }
 467     }
 468
 469     nlj = mtop->natoms - nqlj - nq;
 470
 471     *nq_tot  = nqlj + nq;
 472     *nlj_tot = nqlj + nlj;
 473
 474     /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
 475      * This choice should match the one of pick_nbnxn_kernel_cpu().
 476      * TODO: Make this function use pick_nbnxn_kernel_cpu().
 477      */
 478 #if GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
 479     j_cluster_size = 8;
 480 #else
 481     j_cluster_size = 4;
 482 #endif
 483     r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop->natoms/det(box));
 484
 485     /* The average number of pairs per atom */
 486     nppa  = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop->natoms/det(box);
 487
 488     if (debug)
 489     {
 490         fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
 491                 nqlj, nq, nlj, ir->rlist, r_eff, nppa);
 492     }
 493
 494     /* Determine the cost per pair interaction */
 495     c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
 496     c_q   = (bQRF ? c_nbnxn_qrf    : c_nbnxn_qexp);
 497     c_lj  = c_nbnxn_lj;
 498     if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
 499     {
 500         c_qlj += c_nbnxn_ljexp_add;
 501         c_lj  += c_nbnxn_ljexp_add;
 502     }
 503     if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
 504     {
 505         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 506         c_qlj *= nbnxn_refkernel_fac;
 507         c_q   *= nbnxn_refkernel_fac;
 508         c_lj  *= nbnxn_refkernel_fac;
 509     }
 510
 511     /* For the PP non-bonded cost it is (unrealistically) assumed
 512      * that all atoms are distributed homogeneously in space.
 513      */
 514     *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;
 515
 516     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 517 }
 518
 519 float pme_load_estimate(const gmx_mtop_t *mtop, const t_inputrec *ir,
 520                         const matrix box)
 521 {
 522     int            nq_tot, nlj_tot;
 523     gmx_bool       bChargePerturbed, bTypePerturbed;
 524     double         ndistance_c, ndistance_simd;
 525     double         cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 526     float          ratio;
 527
 528     /* Computational cost of bonded, non-bonded and PME calculations.
 529      * This will be machine dependent.
 530      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 531      * in single precision. In double precision PME mesh is slightly cheaper,
 532      * although not so much that the numbers need to be adjusted.
 533      */
 534
 535     count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
 536     /* C_BOND is the cost for bonded interactions with SIMD implementations,
 537      * so we need to scale the number of bonded interactions for which there
 538      * are only C implementations to the number of SIMD equivalents.
 539      */
 540     cost_bond = c_bond*(ndistance_c   *simd_cycle_factor(FALSE) +
 541                         ndistance_simd*simd_cycle_factor(bHaveSIMD));
 542
 543     if (ir->cutoff_scheme == ecutsGROUP)
 544     {
 545         pp_group_load(mtop, ir, box,
 546                       &nq_tot, &nlj_tot, &cost_pp,
 547                       &bChargePerturbed, &bTypePerturbed);
 548     }
 549     else
 550     {
 551         pp_verlet_load(mtop, ir, box,
 552                        &nq_tot, &nlj_tot, &cost_pp,
 553                        &bChargePerturbed, &bTypePerturbed);
 554     }
 555
 556     cost_redist = 0;
 557     cost_spread = 0;
 558     cost_fft    = 0;
 559     cost_solve  = 0;
 560
 561     if (EEL_PME(ir->coulombtype))
 562     {
 563         double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);
 564
 565         int    f     = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1);
 566         cost_redist +=   c_pme_redist*nq_tot;
 567         cost_spread += f*c_pme_spread*nq_tot*gmx::power3(ir->pme_order);
 568         cost_fft    += f*c_pme_fft*grid*std::log(grid)/std::log(2.0);
 569         cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
 570     }
 571
 572     if (EVDW_PME(ir->vdwtype))
 573     {
 574         double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);
 575
 576         int    f     = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1);
 577         if (ir->ljpme_combination_rule == eljpmeLB)
 578         {
 579             /* LB combination rule: we have 7 mesh terms */
 580             f       *= 7;
 581         }
 582         cost_redist +=   c_pme_redist*nlj_tot;
 583         cost_spread += f*c_pme_spread*nlj_tot*gmx::power3(ir->pme_order);
 584         cost_fft    += f*c_pme_fft*2*grid*std::log(grid)/std::log(2.0);
 585         cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
 586     }
 587
 588     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 589
 590     ratio = cost_pme/(cost_bond + cost_pp + cost_pme);
 591
 592     if (debug)
 593     {
 594         fprintf(debug,
 595                 "cost_bond   %f\n"
 596                 "cost_pp     %f\n"
 597                 "cost_redist %f\n"
 598                 "cost_spread %f\n"
 599                 "cost_fft    %f\n"
 600                 "cost_solve  %f\n",
 601                 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
 602
 603         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 604     }
 605
 606     return ratio;
 607 }