src/gromacs/tools/tune_pme.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #include "gmxpre.h"
  37
  38 #include "tune_pme.h"
  39
  40 #include "config.h"
  41
  42 #include <cmath>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include <ctime>
  46
  47 #include <algorithm>
  48 #include <numeric>
  49 #include <string>
  50
  51 #ifdef HAVE_SYS_TIME_H
  52 #    include <sys/time.h>
  53 #endif
  54
  55 #include "gromacs/commandline/pargs.h"
  56 #include "gromacs/ewald/pme.h"
  57 #include "gromacs/fft/calcgrid.h"
  58 #include "gromacs/fileio/checkpoint.h"
  59 #include "gromacs/fileio/tpxio.h"
  60 #include "gromacs/math/utilities.h"
  61 #include "gromacs/math/vec.h"
  62 #include "gromacs/mdlib/perf_est.h"
  63 #include "gromacs/mdtypes/commrec.h"
  64 #include "gromacs/mdtypes/inputrec.h"
  65 #include "gromacs/mdtypes/md_enums.h"
  66 #include "gromacs/mdtypes/state.h"
  67 #include "gromacs/pbcutil/pbc.h"
  68 #include "gromacs/taskassignment/usergpuids.h"
  69 #include "gromacs/timing/walltime_accounting.h"
  70 #include "gromacs/topology/topology.h"
  71 #include "gromacs/utility/arraysize.h"
  72 #include "gromacs/utility/baseversion.h"
  73 #include "gromacs/utility/cstringutil.h"
  74 #include "gromacs/utility/fatalerror.h"
  75 #include "gromacs/utility/futil.h"
  76 #include "gromacs/utility/gmxassert.h"
  77 #include "gromacs/utility/path.h"
  78 #include "gromacs/utility/smalloc.h"
  79 #include "gromacs/utility/stringutil.h"
  80
  81 /* Enum for situations that can occur during log file parsing, the
  82  * corresponding string entries can be found in do_the_tests() in
  83  * const char* ParseLog[] */
  84 /* TODO clean up CamelCasing of these enum names */
  85 enum
  86 {
  87     eParselogOK,
  88     eParselogNotFound,
  89     eParselogNoPerfData,
  90     eParselogTerm,
  91     eParselogResetProblem,
  92     eParselogNoDDGrid,
  93     eParselogTPXVersion,
  94     eParselogNotParallel,
  95     eParselogLargePrimeFactor,
  96     eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs,
  97     eParselogGpuProblem,
  98     eParselogFatal,
  99     eParselogNr
 100 };
 101
 102
 103 typedef struct
 104 {
 105     int     nPMEnodes;  /* number of PME-only nodes used in this test */
 106     int     nx, ny, nz; /* DD grid */
 107     int     guessPME;   /* if nPMEnodes == -1, this is the guessed number of PME nodes */
 108     double* Gcycles;    /* This can contain more than one value if doing multiple tests */
 109     double  Gcycles_Av;
 110     float*  ns_per_day;
 111     float   ns_per_day_Av;
 112     float*  PME_f_load;     /* PME mesh/force load average*/
 113     float   PME_f_load_Av;  /* Average average ;) ... */
 114     char*   mdrun_cmd_line; /* Mdrun command line used for this test */
 115 } t_perf;
 116
 117
 118 typedef struct
 119 {
 120     int     nr_inputfiles;  /* The number of tpr and mdp input files */
 121     int64_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 122     int64_t orig_init_step; /* Init step for the real simulation */
 123     real*   rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 124     real*   rvdw;           /* The vdW radii */
 125     real*   rlist;          /* Neighbourlist cutoff radius */
 126     int *   nkx, *nky, *nkz;
 127     real *  fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 128 } t_inputinfo;
 129
 130
 131 static void sep_line(FILE* fp)
 132 {
 133     fprintf(fp, "\n------------------------------------------------------------\n");
 134 }
 135
 136
 137 /* Wrapper for system calls */
 138 static int gmx_system_call(char* command)
 139 {
 140     return (system(command));
 141 }
 142
 143
 144 /* Check if string starts with substring */
 145 static gmx_bool str_starts(const char* string, const char* substring)
 146 {
 147     return (std::strncmp(string, substring, std::strlen(substring)) == 0);
 148 }
 149
 150
 151 static void cleandata(t_perf* perfdata, int test_nr)
 152 {
 153     perfdata->Gcycles[test_nr]    = 0.0;
 154     perfdata->ns_per_day[test_nr] = 0.0;
 155     perfdata->PME_f_load[test_nr] = 0.0;
 156 }
 157
 158
 159 static void remove_if_exists(const char* fn)
 160 {
 161     if (gmx_fexist(fn))
 162     {
 163         fprintf(stdout, "Deleting %s\n", fn);
 164         remove(fn);
 165     }
 166 }
 167
 168
 169 static void finalize(const char* fn_out)
 170 {
 171     char  buf[STRLEN];
 172     FILE* fp;
 173
 174
 175     fp = fopen(fn_out, "r");
 176     fprintf(stdout, "\n\n");
 177
 178     while (fgets(buf, STRLEN - 1, fp) != nullptr)
 179     {
 180         fprintf(stdout, "%s", buf);
 181     }
 182     fclose(fp);
 183     fprintf(stdout, "\n\n");
 184 }
 185
 186
 187 enum
 188 {
 189     eFoundNothing,
 190     eFoundDDStr,
 191     eFoundAccountingStr,
 192     eFoundCycleStr
 193 };
 194
 195 static int parse_logfile(const char* logfile,
 196                          const char* errfile,
 197                          t_perf*     perfdata,
 198                          int         test_nr,
 199                          int         presteps,
 200                          int64_t     cpt_steps,
 201                          int         nnodes)
 202 {
 203     FILE*      fp;
 204     char       line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 205     const char matchstrdd[]  = "Domain decomposition grid";
 206     const char matchstrcr[]  = "resetting all time and cycle counters";
 207     const char matchstrbal[] = "Average PME mesh/force load:";
 208     const char matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 209     const char errSIG[]      = "signal, stopping at the next";
 210     int        iFound;
 211     float      dum1, dum2, dum3, dum4;
 212     int        ndum;
 213     int        npme;
 214     int64_t    resetsteps     = -1;
 215     gmx_bool   bFoundResetStr = FALSE;
 216     gmx_bool   bResetChecked  = FALSE;
 217
 218
 219     if (!gmx_fexist(logfile))
 220     {
 221         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 222         cleandata(perfdata, test_nr);
 223         return eParselogNotFound;
 224     }
 225
 226     fp                            = fopen(logfile, "r");
 227     perfdata->PME_f_load[test_nr] = -1.0;
 228     perfdata->guessPME            = -1;
 229
 230     iFound = eFoundNothing;
 231     if (1 == nnodes)
 232     {
 233         iFound = eFoundDDStr; /* Skip some case statements */
 234     }
 235
 236     while (fgets(line, STRLEN, fp) != nullptr)
 237     {
 238         /* Remove leading spaces */
 239         ltrim(line);
 240
 241         /* Check for TERM and INT signals from user: */
 242         if (std::strstr(line, errSIG) != nullptr)
 243         {
 244             fclose(fp);
 245             cleandata(perfdata, test_nr);
 246             return eParselogTerm;
 247         }
 248
 249         /* Check whether cycle resetting  worked */
 250         if (presteps > 0 && !bFoundResetStr)
 251         {
 252             if (std::strstr(line, matchstrcr) != nullptr)
 253             {
 254                 sprintf(dumstring, "step %s", "%" SCNd64);
 255                 sscanf(line, dumstring, &resetsteps);
 256                 bFoundResetStr = TRUE;
 257                 if (resetsteps == presteps + cpt_steps)
 258                 {
 259                     bResetChecked = TRUE;
 260                 }
 261                 else
 262                 {
 263                     sprintf(dumstring, "%" PRId64, resetsteps);
 264                     sprintf(dumstring2, "%" PRId64, presteps + cpt_steps);
 265                     fprintf(stderr,
 266                             "WARNING: Time step counters were reset at step %s,\n"
 267                             "         though they were supposed to be reset at step %s!\n",
 268                             dumstring, dumstring2);
 269                 }
 270             }
 271         }
 272
 273         /* Look for strings that appear in a certain order in the log file: */
 274         switch (iFound)
 275         {
 276             case eFoundNothing:
 277                 /* Look for domain decomp grid and separate PME nodes: */
 278                 if (str_starts(line, matchstrdd))
 279                 {
 280                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
 281                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 282                     if (perfdata->nPMEnodes == -1)
 283                     {
 284                         perfdata->guessPME = npme;
 285                     }
 286                     else if (perfdata->nPMEnodes != npme)
 287                     {
 288                         gmx_fatal(FARGS,
 289                                   "PME ranks from command line and output file are not identical");
 290                     }
 291                     iFound = eFoundDDStr;
 292                 }
 293                 /* Catch a few errors that might have occurred: */
 294                 else if (str_starts(line, "There is no domain decomposition for"))
 295                 {
 296                     fclose(fp);
 297                     return eParselogNoDDGrid;
 298                 }
 299                 else if (str_starts(line, "The number of ranks you selected"))
 300                 {
 301                     fclose(fp);
 302                     return eParselogLargePrimeFactor;
 303                 }
 304                 else if (str_starts(line, "reading tpx file"))
 305                 {
 306                     fclose(fp);
 307                     return eParselogTPXVersion;
 308                 }
 309                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 310                 {
 311                     fclose(fp);
 312                     return eParselogNotParallel;
 313                 }
 314                 break;
 315             case eFoundDDStr:
 316                 /* Even after the "Domain decomposition grid" string was found,
 317                  * it could be that mdrun had to quit due to some error. */
 318                 if (str_starts(line, "Incorrect launch configuration: mismatching number of"))
 319                 {
 320                     fclose(fp);
 321                     return eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs;
 322                 }
 323                 else if (str_starts(line, "Some of the requested GPUs do not exist"))
 324                 {
 325                     fclose(fp);
 326                     return eParselogGpuProblem;
 327                 }
 328                 /* Look for PME mesh/force balance (not necessarily present, though) */
 329                 else if (str_starts(line, matchstrbal))
 330                 {
 331                     sscanf(&line[std::strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 332                 }
 333                 /* Look for matchstring */
 334                 else if (str_starts(line, matchstring))
 335                 {
 336                     iFound = eFoundAccountingStr;
 337                 }
 338                 break;
 339             case eFoundAccountingStr:
 340                 /* Already found matchstring - look for cycle data */
 341                 if (str_starts(line, "Total  "))
 342                 {
 343                     sscanf(line, "Total %*f %lf", &(perfdata->Gcycles[test_nr]));
 344                     iFound = eFoundCycleStr;
 345                 }
 346                 break;
 347             case eFoundCycleStr:
 348                 /* Already found cycle data - look for remaining performance info and return */
 349                 if (str_starts(line, "Performance:"))
 350                 {
 351                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 352                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 353                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 354                     fclose(fp);
 355                     if (bResetChecked || presteps == 0)
 356                     {
 357                         return eParselogOK;
 358                     }
 359                     else
 360                     {
 361                         return eParselogResetProblem;
 362                     }
 363                 }
 364                 break;
 365         }
 366     } /* while */
 367
 368     /* Close the log file */
 369     fclose(fp);
 370
 371     /* Check why there is no performance data in the log file.
 372      * Did a fatal errors occur? */
 373     if (gmx_fexist(errfile))
 374     {
 375         fp = fopen(errfile, "r");
 376         while (fgets(line, STRLEN, fp) != nullptr)
 377         {
 378             if (str_starts(line, "Fatal error:"))
 379             {
 380                 if (fgets(line, STRLEN, fp) != nullptr)
 381                 {
 382                     fprintf(stderr,
 383                             "\nWARNING: An error occurred during this benchmark:\n"
 384                             "%s\n",
 385                             line);
 386                 }
 387                 fclose(fp);
 388                 cleandata(perfdata, test_nr);
 389                 return eParselogFatal;
 390             }
 391         }
 392         fclose(fp);
 393     }
 394     else
 395     {
 396         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 397     }
 398
 399     /* Giving up ... we could not find out why there is no performance data in
 400      * the log file. */
 401     fprintf(stdout, "No performance data in log file.\n");
 402     cleandata(perfdata, test_nr);
 403
 404     return eParselogNoPerfData;
 405 }
 406
 407
 408 static gmx_bool analyze_data(FILE*        fp,
 409                              const char*  fn,
 410                              t_perf**     perfdata,
 411                              int          nnodes,
 412                              int          ntprs,
 413                              int          ntests,
 414                              int          nrepeats,
 415                              t_inputinfo* info,
 416                              int*         index_tpr, /* OUT: Nr of mdp file with best settings */
 417                              int*         npme_optimal)      /* OUT: Optimal number of PME nodes */
 418 {
 419     int      i, j, k;
 420     int      line = 0, line_win = -1;
 421     int      k_win = -1, i_win = -1, winPME;
 422     double   s = 0.0; /* standard deviation */
 423     t_perf*  pd;
 424     char     strbuf[STRLEN];
 425     char     str_PME_f_load[13];
 426     gmx_bool bCanUseOrigTPR;
 427     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 428
 429
 430     if (nrepeats > 1)
 431     {
 432         sep_line(fp);
 433         fprintf(fp, "Summary of successful runs:\n");
 434         fprintf(fp, "Line tpr PME ranks  Gcycles Av.     Std.dev.       ns/day        PME/f");
 435         if (nnodes > 1)
 436         {
 437             fprintf(fp, "    DD grid");
 438         }
 439         fprintf(fp, "\n");
 440     }
 441
 442
 443     for (k = 0; k < ntprs; k++)
 444     {
 445         for (i = 0; i < ntests; i++)
 446         {
 447             /* Select the right dataset: */
 448             pd = &(perfdata[k][i]);
 449
 450             pd->Gcycles_Av    = 0.0;
 451             pd->PME_f_load_Av = 0.0;
 452             pd->ns_per_day_Av = 0.0;
 453
 454             if (pd->nPMEnodes == -1)
 455             {
 456                 sprintf(strbuf, "(%3d)", pd->guessPME);
 457             }
 458             else
 459             {
 460                 sprintf(strbuf, "     ");
 461             }
 462
 463             /* Get the average run time of a setting */
 464             for (j = 0; j < nrepeats; j++)
 465             {
 466                 pd->Gcycles_Av += pd->Gcycles[j];
 467                 pd->PME_f_load_Av += pd->PME_f_load[j];
 468             }
 469             pd->Gcycles_Av /= nrepeats;
 470             pd->PME_f_load_Av /= nrepeats;
 471
 472             for (j = 0; j < nrepeats; j++)
 473             {
 474                 if (pd->ns_per_day[j] > 0.0)
 475                 {
 476                     pd->ns_per_day_Av += pd->ns_per_day[j];
 477                 }
 478                 else
 479                 {
 480                     /* Somehow the performance number was not aquired for this run,
 481                      * therefor set the average to some negative value: */
 482                     pd->ns_per_day_Av = -1.0F * nrepeats;
 483                     break;
 484                 }
 485             }
 486             pd->ns_per_day_Av /= nrepeats;
 487
 488             /* Nicer output: */
 489             if (pd->PME_f_load_Av > 0.0)
 490             {
 491                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 492             }
 493             else
 494             {
 495                 sprintf(str_PME_f_load, "%s", "         -  ");
 496             }
 497
 498
 499             /* We assume we had a successful run if both averages are positive */
 500             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 501             {
 502                 /* Output statistics if repeats were done */
 503                 if (nrepeats > 1)
 504                 {
 505                     /* Calculate the standard deviation */
 506                     s = 0.0;
 507                     for (j = 0; j < nrepeats; j++)
 508                     {
 509                         s += gmx::square(pd->Gcycles[j] - pd->Gcycles_Av);
 510                     }
 511                     s /= (nrepeats - 1);
 512                     s = std::sqrt(s);
 513
 514                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s", line, k, pd->nPMEnodes,
 515                             strbuf, pd->Gcycles_Av, s, pd->ns_per_day_Av, str_PME_f_load);
 516                     if (nnodes > 1)
 517                     {
 518                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 519                     }
 520                     fprintf(fp, "\n");
 521                 }
 522                 /* Store the index of the best run found so far in 'winner': */
 523                 if ((k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av))
 524                 {
 525                     k_win    = k;
 526                     i_win    = i;
 527                     line_win = line;
 528                 }
 529                 line++;
 530             }
 531         }
 532     }
 533
 534     if (k_win == -1)
 535     {
 536         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 537     }
 538
 539     sep_line(fp);
 540
 541     winPME = perfdata[k_win][i_win].nPMEnodes;
 542
 543     if (1 == ntests)
 544     {
 545         /* We stuck to a fixed number of PME-only nodes */
 546         sprintf(strbuf, "settings No. %d", k_win);
 547     }
 548     else
 549     {
 550         /* We have optimized the number of PME-only nodes */
 551         if (winPME == -1)
 552         {
 553             sprintf(strbuf, "%s", "the automatic number of PME ranks");
 554         }
 555         else
 556         {
 557             sprintf(strbuf, "%d PME ranks", winPME);
 558         }
 559     }
 560     fprintf(fp, "Best performance was achieved with %s", strbuf);
 561     if ((nrepeats > 1) && (ntests > 1))
 562     {
 563         fprintf(fp, " (see line %d)", line_win);
 564     }
 565     fprintf(fp, "\n");
 566
 567     /* Only mention settings if they were modified: */
 568     bRefinedCoul = !gmx_within_tol(info->rcoulomb[k_win], info->rcoulomb[0], GMX_REAL_EPS);
 569     bRefinedVdW  = !gmx_within_tol(info->rvdw[k_win], info->rvdw[0], GMX_REAL_EPS);
 570     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] && info->nky[k_win] == info->nky[0]
 571                      && info->nkz[k_win] == info->nkz[0]);
 572
 573     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 574     {
 575         fprintf(fp, "Optimized PME settings:\n");
 576         bCanUseOrigTPR = FALSE;
 577     }
 578     else
 579     {
 580         bCanUseOrigTPR = TRUE;
 581     }
 582
 583     if (bRefinedCoul)
 584     {
 585         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win],
 586                 info->rcoulomb[0]);
 587     }
 588
 589     if (bRefinedVdW)
 590     {
 591         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 592     }
 593
 594     if (bRefinedGrid)
 595     {
 596         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win],
 597                 info->nky[k_win], info->nkz[k_win], info->nkx[0], info->nky[0], info->nkz[0]);
 598     }
 599
 600     if (bCanUseOrigTPR && ntprs > 1)
 601     {
 602         fprintf(fp, "and original PME settings.\n");
 603     }
 604
 605     fflush(fp);
 606
 607     /* Return the index of the mdp file that showed the highest performance
 608      * and the optimal number of PME nodes */
 609     *index_tpr    = k_win;
 610     *npme_optimal = winPME;
 611
 612     return bCanUseOrigTPR;
 613 }
 614
 615
 616 /* Get the commands we need to set up the runs from environment variables */
 617 static void get_program_paths(gmx_bool bThreads, char* cmd_mpirun[], char* cmd_mdrun[])
 618 {
 619     char*      cp;
 620     const char def_mpirun[] = "mpirun";
 621
 622     const char empty_mpirun[] = "";
 623
 624     /* Get the commands we need to set up the runs from environment variables */
 625     if (!bThreads)
 626     {
 627         if ((cp = getenv("MPIRUN")) != nullptr)
 628         {
 629             *cmd_mpirun = gmx_strdup(cp);
 630         }
 631         else
 632         {
 633             *cmd_mpirun = gmx_strdup(def_mpirun);
 634         }
 635     }
 636     else
 637     {
 638         *cmd_mpirun = gmx_strdup(empty_mpirun);
 639     }
 640
 641     if (*cmd_mdrun == nullptr)
 642     {
 643         /* The use of MDRUN is deprecated, but made available in 5.1
 644            for backward compatibility. It may be removed in a future
 645            version. */
 646         if ((cp = getenv("MDRUN")) != nullptr)
 647         {
 648             *cmd_mdrun = gmx_strdup(cp);
 649         }
 650         else
 651         {
 652             gmx_fatal(FARGS, "The way to call mdrun must be set in the -mdrun command-line flag.");
 653         }
 654     }
 655 }
 656
 657 /* Check that the commands will run mdrun (perhaps via mpirun) by
 658  * running a very quick test simulation. Requires MPI environment or
 659  * GPU support to be available if applicable. */
 660 /* TODO implement feature to parse the log file to get the list of
 661    compatible GPUs from mdrun, if the user of gmx tune-pme has not
 662    given one. */
 663 static void check_mdrun_works(gmx_bool    bThreads,
 664                               const char* cmd_mpirun,
 665                               const char* cmd_np,
 666                               const char* cmd_mdrun,
 667                               gmx_bool    bNeedGpuSupport)
 668 {
 669     char*      command = nullptr;
 670     char*      cp;
 671     char       line[STRLEN];
 672     FILE*      fp;
 673     const char filename[] = "benchtest.log";
 674
 675     /* This string should always be identical to the one in copyrite.c,
 676      * gmx_print_version_info() in the GMX_MPI section */
 677     const char match_mpi[]     = "MPI library:        MPI";
 678     const char match_mdrun[]   = "Executable: ";
 679     const char match_nogpu[]   = "GPU support:        disabled";
 680     gmx_bool   bMdrun          = FALSE;
 681     gmx_bool   bMPI            = FALSE;
 682     gmx_bool   bHaveGpuSupport = TRUE;
 683
 684     /* Run a small test to see whether mpirun + mdrun work  */
 685     fprintf(stdout, "Making sure that mdrun can be executed. ");
 686     if (bThreads)
 687     {
 688         snew(command, std::strlen(cmd_mdrun) + std::strlen(cmd_np) + std::strlen(filename) + 50);
 689         sprintf(command, "%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 690     }
 691     else
 692     {
 693         snew(command, std::strlen(cmd_mpirun) + std::strlen(cmd_np) + std::strlen(cmd_mdrun)
 694                               + std::strlen(filename) + 50);
 695         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 696     }
 697     fprintf(stdout, "Trying '%s' ... ", command);
 698     make_backup(filename);
 699     gmx_system_call(command);
 700
 701     /* Check if we find the characteristic string in the output: */
 702     if (!gmx_fexist(filename))
 703     {
 704         gmx_fatal(FARGS, "Output from test run could not be found.");
 705     }
 706
 707     fp = fopen(filename, "r");
 708     /* We need to scan the whole output file, since sometimes the queuing system
 709      * also writes stuff to stdout/err */
 710     while (!feof(fp))
 711     {
 712         cp = fgets(line, STRLEN, fp);
 713         if (cp != nullptr)
 714         {
 715             if (str_starts(line, match_mdrun))
 716             {
 717                 bMdrun = TRUE;
 718             }
 719             if (str_starts(line, match_mpi))
 720             {
 721                 bMPI = TRUE;
 722             }
 723             if (str_starts(line, match_nogpu))
 724             {
 725                 bHaveGpuSupport = FALSE;
 726             }
 727         }
 728     }
 729     fclose(fp);
 730
 731     if (bThreads)
 732     {
 733         if (bMPI)
 734         {
 735             gmx_fatal(FARGS,
 736                       "Need a threaded version of mdrun. This one\n"
 737                       "(%s)\n"
 738                       "seems to have been compiled with MPI instead.",
 739                       cmd_mdrun);
 740         }
 741     }
 742     else
 743     {
 744         if (bMdrun && !bMPI)
 745         {
 746             gmx_fatal(FARGS,
 747                       "Need an MPI-enabled version of mdrun. This one\n"
 748                       "(%s)\n"
 749                       "seems to have been compiled without MPI support.",
 750                       cmd_mdrun);
 751         }
 752     }
 753
 754     if (!bMdrun)
 755     {
 756         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!", filename);
 757     }
 758
 759     if (bNeedGpuSupport && !bHaveGpuSupport)
 760     {
 761         gmx_fatal(FARGS, "The mdrun executable did not have the expected GPU support.");
 762     }
 763
 764     fprintf(stdout, "passed.\n");
 765
 766     /* Clean up ... */
 767     remove(filename);
 768     sfree(command);
 769 }
 770
 771 /* Handles the no-GPU case by emitting an empty string. */
 772 static std::string make_gpu_id_command_line(const char* eligible_gpu_ids)
 773 {
 774     /* If the user has given no eligible GPU IDs, or we're trying the
 775      * default behaviour, then there is nothing for tune_pme to give
 776      * to mdrun -gpu_id */
 777     if (eligible_gpu_ids != nullptr)
 778     {
 779         return gmx::formatString("-gpu_id %s", eligible_gpu_ids);
 780     }
 781
 782
 783     return std::string();
 784 }
 785
 786 static void launch_simulation(gmx_bool    bLaunch,          /* Should the simulation be launched? */
 787                               FILE*       fp,               /* General log file */
 788                               gmx_bool    bThreads,         /* whether to use threads */
 789                               char*       cmd_mpirun,       /* Command for mpirun */
 790                               char*       cmd_np,           /* Switch for -np or -ntmpi or empty */
 791                               char*       cmd_mdrun,        /* Command for mdrun */
 792                               char*       args_for_mdrun,   /* Arguments for mdrun */
 793                               const char* simulation_tpr,   /* This tpr will be simulated */
 794                               int         nPMEnodes,        /* Number of PME ranks to use */
 795                               const char* eligible_gpu_ids) /* Available GPU IDs for
 796                                                              * constructing mdrun command lines */
 797 {
 798     char* command;
 799
 800
 801     /* Make enough space for the system call command,
 802      * (200 extra chars for -npme ... etc. options should suffice): */
 803     snew(command, std::strlen(cmd_mpirun) + std::strlen(cmd_mdrun) + std::strlen(cmd_np)
 804                           + std::strlen(args_for_mdrun) + std::strlen(simulation_tpr) + 200);
 805
 806     auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids);
 807
 808     /* Note that the -passall options requires args_for_mdrun to be at the end
 809      * of the command line string */
 810     if (bThreads)
 811     {
 812         sprintf(command, "%s%s-npme %d -s %s %s %s", cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr,
 813                 args_for_mdrun, cmd_gpu_ids.c_str());
 814     }
 815     else
 816     {
 817         sprintf(command, "%s%s%s -npme %d -s %s %s %s", cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes,
 818                 simulation_tpr, args_for_mdrun, cmd_gpu_ids.c_str());
 819     }
 820
 821     fprintf(fp, "%s this command line to launch the simulation:\n\n%s",
 822             bLaunch ? "Using" : "Please use", command);
 823     sep_line(fp);
 824     fflush(fp);
 825
 826     /* Now the real thing! */
 827     if (bLaunch)
 828     {
 829         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 830         sep_line(stdout);
 831         fflush(stdout);
 832         gmx_system_call(command);
 833     }
 834 }
 835
 836
 837 static void modify_PMEsettings(int64_t     simsteps,    /* Set this value as number of time steps */
 838                                int64_t     init_step,   /* Set this value as init_step */
 839                                const char* fn_best_tpr, /* tpr file with the best performance */
 840                                const char* fn_sim_tpr)  /* name of tpr file to be launched */
 841 {
 842     t_state    state;
 843     gmx_mtop_t mtop;
 844     char       buf[200];
 845
 846     t_inputrec  irInstance;
 847     t_inputrec* ir = &irInstance;
 848     read_tpx_state(fn_best_tpr, ir, &state, &mtop);
 849
 850     /* Reset nsteps and init_step to the value of the input .tpr file */
 851     ir->nsteps    = simsteps;
 852     ir->init_step = init_step;
 853
 854     /* Write the tpr file which will be launched */
 855     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, "%" PRId64);
 856     fprintf(stdout, buf, ir->nsteps);
 857     fflush(stdout);
 858     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 859 }
 860
 861 static gmx_bool can_scale_rvdw(int vdwtype)
 862 {
 863     return (evdwCUT == vdwtype || evdwPME == vdwtype);
 864 }
 865
 866 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 867
 868 /* Make additional TPR files with more computational load for the
 869  * direct space processors: */
 870 static void make_benchmark_tprs(const char* fn_sim_tpr,  /* READ : User-provided tpr file */
 871                                 char*   fn_bench_tprs[], /* WRITE: Names of benchmark tpr files  */
 872                                 int64_t benchsteps, /* Number of time steps for benchmark runs */
 873                                 int64_t statesteps, /* Step counter in checkpoint file */
 874                                 real    rmin,    /* Minimal Coulomb radius                        */
 875                                 real    rmax,    /* Maximal Coulomb radius                        */
 876                                 bool bScaleRvdw, /* Scale rvdw along with rcoulomb                */
 877                                 const int* ntprs,  /* No. of TPRs to write, each with a different
 878                                                       rcoulomb and fourierspacing  */
 879                                 t_inputinfo* info, /* Contains information about mdp file options */
 880                                 FILE*        fp) /* Write the output here                         */
 881 {
 882     int        i, j, d;
 883     t_state    state;
 884     gmx_mtop_t mtop;
 885     real       nlist_buffer; /* Thickness of the buffer regions for PME-switch potentials */
 886     char       buf[200];
 887     rvec       box_size;
 888     gmx_bool   bNote = FALSE;
 889     real       add;            /* Add this to rcoul for the next test    */
 890     real       fac = 1.0;      /* Scaling factor for Coulomb radius      */
 891     real       fourierspacing; /* Basic fourierspacing from tpr          */
 892
 893
 894     sprintf(buf, "Making benchmark tpr file%s with %s time step%s", *ntprs > 1 ? "s" : "",
 895             "%" PRId64, benchsteps > 1 ? "s" : "");
 896     fprintf(stdout, buf, benchsteps);
 897     if (statesteps > 0)
 898     {
 899         sprintf(buf, " (adding %s steps from checkpoint file)", "%" PRId64);
 900         fprintf(stdout, buf, statesteps);
 901         benchsteps += statesteps;
 902     }
 903     fprintf(stdout, ".\n");
 904
 905     t_inputrec  irInstance;
 906     t_inputrec* ir = &irInstance;
 907     read_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 908
 909     /* Check if some kind of PME was chosen */
 910     if (EEL_PME(ir->coulombtype) == FALSE)
 911     {
 912         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 913                   EELTYPE(eelPME));
 914     }
 915
 916     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 917     if ((ir->cutoff_scheme != ecutsVERLET) && (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 918     {
 919         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).", EELTYPE(eelPME),
 920                   ir->rcoulomb, ir->rlist);
 921     }
 922     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 923     else if (ir->rcoulomb > ir->rlist)
 924     {
 925         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 926                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 927     }
 928
 929     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 930     {
 931         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 932         bScaleRvdw = FALSE;
 933     }
 934
 935     /* Reduce the number of steps for the benchmarks */
 936     info->orig_sim_steps = ir->nsteps;
 937     ir->nsteps           = benchsteps;
 938     /* We must not use init_step from the input tpr file for the benchmarks */
 939     info->orig_init_step = ir->init_step;
 940     ir->init_step        = 0;
 941
 942     /* For PME-switch potentials, keep the radial distance of the buffer region */
 943     nlist_buffer = ir->rlist - ir->rcoulomb;
 944
 945     /* Determine length of triclinic box vectors */
 946     for (d = 0; d < DIM; d++)
 947     {
 948         box_size[d] = 0;
 949         for (i = 0; i < DIM; i++)
 950         {
 951             box_size[d] += state.box[d][i] * state.box[d][i];
 952         }
 953         box_size[d] = std::sqrt(box_size[d]);
 954     }
 955
 956     if (ir->fourier_spacing > 0)
 957     {
 958         info->fsx[0] = ir->fourier_spacing;
 959         info->fsy[0] = ir->fourier_spacing;
 960         info->fsz[0] = ir->fourier_spacing;
 961     }
 962     else
 963     {
 964         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 965         info->fsx[0] = box_size[XX] / ir->nkx;
 966         info->fsy[0] = box_size[YY] / ir->nky;
 967         info->fsz[0] = box_size[ZZ] / ir->nkz;
 968     }
 969
 970     /* If no value for the fourierspacing was provided on the command line, we
 971      * use the reconstruction from the tpr file */
 972     if (ir->fourier_spacing > 0)
 973     {
 974         /* Use the spacing from the tpr */
 975         fourierspacing = ir->fourier_spacing;
 976     }
 977     else
 978     {
 979         /* Use the maximum observed spacing */
 980         fourierspacing = std::max(std::max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 981     }
 982
 983     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n",
 984             fourierspacing);
 985
 986     /* For performance comparisons the number of particles is useful to have */
 987     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 988
 989     /* Print information about settings of which some are potentially modified: */
 990     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 991     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n", box_size[XX] / ir->nkx,
 992             box_size[YY] / ir->nky, box_size[ZZ] / ir->nkz);
 993     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 994     if (ir_vdw_switched(ir))
 995     {
 996         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 997     }
 998     if (EPME_SWITCHED(ir->coulombtype))
 999     {
1000         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
1001     }
1002
1003     /* Print a descriptive line about the tpr settings tested */
1004     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
1005     fprintf(fp, " No.   scaling  rcoulomb");
1006     fprintf(fp, "  nkx  nky  nkz");
1007     fprintf(fp, "   spacing");
1008     if (can_scale_rvdw(ir->vdwtype))
1009     {
1010         fprintf(fp, "      rvdw");
1011     }
1012     if (EPME_SWITCHED(ir->coulombtype))
1013     {
1014         fprintf(fp, "     rlist");
1015     }
1016     fprintf(fp, "  tpr file\n");
1017
1018     /* Loop to create the requested number of tpr input files */
1019     for (j = 0; j < *ntprs; j++)
1020     {
1021         /* The first .tpr is the provided one, just need to modify nsteps,
1022          * so skip the following block */
1023         if (j != 0)
1024         {
1025             /* Determine which Coulomb radii rc to use in the benchmarks */
1026             add = (rmax - rmin) / (*ntprs - 1);
1027             if (gmx_within_tol(rmin, info->rcoulomb[0], GMX_REAL_EPS))
1028             {
1029                 ir->rcoulomb = rmin + j * add;
1030             }
1031             else if (gmx_within_tol(rmax, info->rcoulomb[0], GMX_REAL_EPS))
1032             {
1033                 ir->rcoulomb = rmin + (j - 1) * add;
1034             }
1035             else
1036             {
1037                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
1038                 add          = (rmax - rmin) / (*ntprs - 2);
1039                 ir->rcoulomb = rmin + (j - 1) * add;
1040             }
1041
1042             /* Determine the scaling factor fac */
1043             fac = ir->rcoulomb / info->rcoulomb[0];
1044
1045             /* Scale the Fourier grid spacing */
1046             ir->nkx = ir->nky = ir->nkz = 0;
1047             calcFftGrid(nullptr, state.box, fourierspacing * fac, minimalPmeGridSize(ir->pme_order),
1048                         &ir->nkx, &ir->nky, &ir->nkz);
1049
1050             /* Adjust other radii since various conditions need to be fulfilled */
1051             if (eelPME == ir->coulombtype)
1052             {
1053                 /* plain PME, rcoulomb must be equal to rlist TODO only in the group scheme? */
1054                 ir->rlist = ir->rcoulomb;
1055             }
1056             else
1057             {
1058                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1059                 ir->rlist = ir->rcoulomb + nlist_buffer;
1060             }
1061
1062             if (bScaleRvdw && can_scale_rvdw(ir->vdwtype))
1063             {
1064                 if (ecutsVERLET == ir->cutoff_scheme || evdwPME == ir->vdwtype)
1065                 {
1066                     /* With either the Verlet cutoff-scheme or LJ-PME,
1067                        the van der Waals radius must always equal the
1068                        Coulomb radius */
1069                     ir->rvdw = ir->rcoulomb;
1070                 }
1071                 else
1072                 {
1073                     /* For vdw cutoff, rvdw >= rlist */
1074                     ir->rvdw = std::max(info->rvdw[0], ir->rlist);
1075                 }
1076             }
1077         } /* end of "if (j != 0)" */
1078
1079         /* for j==0: Save the original settings
1080          * for j >0: Save modified radii and Fourier grids */
1081         info->rcoulomb[j] = ir->rcoulomb;
1082         info->rvdw[j]     = ir->rvdw;
1083         info->nkx[j]      = ir->nkx;
1084         info->nky[j]      = ir->nky;
1085         info->nkz[j]      = ir->nkz;
1086         info->rlist[j]    = ir->rlist;
1087         info->fsx[j]      = fac * fourierspacing;
1088         info->fsy[j]      = fac * fourierspacing;
1089         info->fsz[j]      = fac * fourierspacing;
1090
1091         /* Write the benchmark tpr file */
1092         fn_bench_tprs[j] = gmx_strdup(
1093                 gmx::Path::concatenateBeforeExtension(fn_sim_tpr, gmx::formatString("_bench%.2d", j))
1094                         .c_str());
1095
1096         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1097         fprintf(stdout, "%" PRId64, ir->nsteps);
1098         if (j > 0)
1099         {
1100             fprintf(stdout, ", scaling factor %f\n", fac);
1101         }
1102         else
1103         {
1104             fprintf(stdout, ", unmodified settings\n");
1105         }
1106
1107         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1108
1109         /* Write information about modified tpr settings to log file */
1110         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1111         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1112         fprintf(fp, " %9f ", info->fsx[j]);
1113         if (can_scale_rvdw(ir->vdwtype))
1114         {
1115             fprintf(fp, "%10f", ir->rvdw);
1116         }
1117         if (EPME_SWITCHED(ir->coulombtype))
1118         {
1119             fprintf(fp, "%10f", ir->rlist);
1120         }
1121         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1122
1123         /* Make it clear to the user that some additional settings were modified */
1124         if (!gmx_within_tol(ir->rvdw, info->rvdw[0], GMX_REAL_EPS)
1125             || !gmx_within_tol(ir->rlist, info->rlist[0], GMX_REAL_EPS))
1126         {
1127             bNote = TRUE;
1128         }
1129     }
1130     if (bNote)
1131     {
1132         fprintf(fp,
1133                 "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1134                 "other input settings were also changed (see table above).\n"
1135                 "Please check if the modified settings are appropriate.\n");
1136     }
1137     fflush(stdout);
1138     fflush(fp);
1139 }
1140
1141
1142 /* Rename the files we want to keep to some meaningful filename and
1143  * delete the rest */
1144 static void cleanup(const t_filenm* fnm, int nfile, int k, int nnodes, int nPMEnodes, int nr, gmx_bool bKeepStderr)
1145 {
1146     char        numstring[STRLEN];
1147     const char* fn = nullptr;
1148     int         i;
1149     const char* opt;
1150
1151
1152     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1153
1154     for (i = 0; i < nfile; i++)
1155     {
1156         opt = const_cast<char*>(fnm[i].opt);
1157         if (std::strcmp(opt, "-p") == 0)
1158         {
1159             /* do nothing; keep this file */
1160         }
1161         else if (std::strcmp(opt, "-bg") == 0)
1162         {
1163             /* Give the log file a nice name so one can later see which parameters were used */
1164             numstring[0] = '\0';
1165             if (nr > 0)
1166             {
1167                 sprintf(numstring, "_%d", nr);
1168             }
1169             std::string newfilename = gmx::formatString(
1170                     "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1171             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1172             {
1173                 fprintf(stdout, "renaming log file to %s\n", newfilename.c_str());
1174                 make_backup(newfilename);
1175                 rename(opt2fn("-bg", nfile, fnm), newfilename.c_str());
1176             }
1177         }
1178         else if (std::strcmp(opt, "-err") == 0)
1179         {
1180             /* This file contains the output of stderr. We want to keep it in
1181              * cases where there have been problems. */
1182             fn           = opt2fn(opt, nfile, fnm);
1183             numstring[0] = '\0';
1184             if (nr > 0)
1185             {
1186                 sprintf(numstring, "_%d", nr);
1187             }
1188             std::string newfilename =
1189                     gmx::formatString("%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1190             if (gmx_fexist(fn))
1191             {
1192                 if (bKeepStderr)
1193                 {
1194                     fprintf(stdout, "Saving stderr output in %s\n", newfilename.c_str());
1195                     make_backup(newfilename);
1196                     rename(fn, newfilename.c_str());
1197                 }
1198                 else
1199                 {
1200                     fprintf(stdout, "Deleting %s\n", fn);
1201                     remove(fn);
1202                 }
1203             }
1204         }
1205         /* Delete the files which are created for each benchmark run: (options -b*) */
1206         else if ((0 == std::strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])))
1207         {
1208             remove_if_exists(opt2fn(opt, nfile, fnm));
1209         }
1210     }
1211 }
1212
1213
1214 enum
1215 {
1216     eNpmeAuto,
1217     eNpmeAll,
1218     eNpmeReduced,
1219     eNpmeSubset,
1220     eNpmeNr
1221 };
1222
1223 /* Create a list of numbers of PME nodes to test */
1224 static void make_npme_list(const char* npmevalues_opt, /* Make a complete list with all
1225                                                         * possibilities or a short list that keeps
1226                                                         * only reasonable numbers of PME nodes */
1227                            int* nentries,    /* Number of entries we put in the nPMEnodes list   */
1228                            int* nPMEnodes[], /* Each entry contains the value for -npme          */
1229                            int  nnodes,      /* Total number of nodes to do the tests on         */
1230                            int  minPMEnodes, /* Minimum number of PME nodes                      */
1231                            int  maxPMEnodes)  /* Maximum number of PME nodes                      */
1232 {
1233     int i, npme, npp;
1234     int min_factor = 1; /* We request that npp and npme have this minimal
1235                          * largest common factor (depends on npp)           */
1236     int nlistmax;       /* Max. list size                                   */
1237     int nlist;          /* Actual number of entries in list                 */
1238     int eNPME = 0;
1239
1240
1241     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1242     if (!std::strcmp(npmevalues_opt, "all"))
1243     {
1244         eNPME = eNpmeAll;
1245     }
1246     else if (!std::strcmp(npmevalues_opt, "subset"))
1247     {
1248         eNPME = eNpmeSubset;
1249     }
1250     else /* "auto" or "range" */
1251     {
1252         if (nnodes <= 64)
1253         {
1254             eNPME = eNpmeAll;
1255         }
1256         else if (nnodes < 128)
1257         {
1258             eNPME = eNpmeReduced;
1259         }
1260         else
1261         {
1262             eNPME = eNpmeSubset;
1263         }
1264     }
1265
1266     /* Calculate how many entries we could possibly have (in case of -npme all) */
1267     if (nnodes > 2)
1268     {
1269         nlistmax = maxPMEnodes - minPMEnodes + 3;
1270         if (0 == minPMEnodes)
1271         {
1272             nlistmax--;
1273         }
1274     }
1275     else
1276     {
1277         nlistmax = 1;
1278     }
1279
1280     /* Now make the actual list which is at most of size nlist */
1281     snew(*nPMEnodes, nlistmax);
1282     nlist = 0; /* start counting again, now the real entries in the list */
1283     for (i = 0; i < nlistmax - 2; i++)
1284     {
1285         npme = maxPMEnodes - i;
1286         npp  = nnodes - npme;
1287         switch (eNPME)
1288         {
1289             case eNpmeAll: min_factor = 1; break;
1290             case eNpmeReduced: min_factor = 2; break;
1291             case eNpmeSubset:
1292                 /* For 2d PME we want a common largest factor of at least the cube
1293                  * root of the number of PP nodes */
1294                 min_factor = static_cast<int>(std::cbrt(npp));
1295                 break;
1296             default: gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1297         }
1298         if (std::gcd(npp, npme) >= min_factor)
1299         {
1300             (*nPMEnodes)[nlist] = npme;
1301             nlist++;
1302         }
1303     }
1304     /* We always test 0 PME nodes and the automatic number */
1305     *nentries               = nlist + 2;
1306     (*nPMEnodes)[nlist]     = 0;
1307     (*nPMEnodes)[nlist + 1] = -1;
1308
1309     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1310     for (i = 0; i < *nentries - 1; i++)
1311     {
1312         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1313     }
1314     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries - 1]);
1315 }
1316
1317
1318 /* Allocate memory to store the performance data */
1319 static void init_perfdata(t_perf* perfdata[], int ntprs, int datasets, int repeats)
1320 {
1321     int i, j, k;
1322
1323
1324     for (k = 0; k < ntprs; k++)
1325     {
1326         snew(perfdata[k], datasets);
1327         for (i = 0; i < datasets; i++)
1328         {
1329             for (j = 0; j < repeats; j++)
1330             {
1331                 snew(perfdata[k][i].Gcycles, repeats);
1332                 snew(perfdata[k][i].ns_per_day, repeats);
1333                 snew(perfdata[k][i].PME_f_load, repeats);
1334             }
1335         }
1336     }
1337 }
1338
1339
1340 /* Check for errors on mdrun -h */
1341 static void make_sure_it_runs(char* mdrun_cmd_line, int length, FILE* fp, const t_filenm* fnm, int nfile)
1342 {
1343     char *command, *msg;
1344     int   ret;
1345
1346     snew(command, length + 15);
1347     snew(msg, length + 500);
1348
1349     fprintf(stdout, "Making sure the benchmarks can be executed by running just 1 step...\n");
1350     sprintf(command, "%s -nsteps 1 -quiet", mdrun_cmd_line);
1351     fprintf(stdout, "Executing '%s' ...\n", command);
1352     ret = gmx_system_call(command);
1353
1354     if (0 != ret)
1355     {
1356         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1357          * get the error message from mdrun itself */
1358         sprintf(msg,
1359                 "Cannot run the first benchmark simulation! Please check the error message of\n"
1360                 "mdrun for the source of the problem. Did you provide a command line\n"
1361                 "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
1362                 "sure your command line should work, you can bypass this check with \n"
1363                 "gmx tune_pme -nocheck. The failing command was:\n"
1364                 "\n%s\n\n",
1365                 command);
1366
1367         fprintf(stderr, "%s", msg);
1368         sep_line(fp);
1369         fprintf(fp, "%s", msg);
1370
1371         exit(ret);
1372     }
1373     fprintf(stdout, "Benchmarks can be executed!\n");
1374
1375     /* Clean up the benchmark output files we just created */
1376     fprintf(stdout, "Cleaning up ...\n");
1377     remove_if_exists(opt2fn("-bc", nfile, fnm));
1378     remove_if_exists(opt2fn("-be", nfile, fnm));
1379     remove_if_exists(opt2fn("-bcpo", nfile, fnm));
1380     remove_if_exists(opt2fn("-bg", nfile, fnm));
1381     remove_if_exists(opt2fn("-bo", nfile, fnm));
1382     remove_if_exists(opt2fn("-bx", nfile, fnm));
1383
1384     sfree(command);
1385     sfree(msg);
1386 }
1387
1388 static void do_the_tests(FILE*  fp,                    /* General tune_pme output file           */
1389                          char** tpr_names,             /* Filenames of the input files to test   */
1390                          int    maxPMEnodes,           /* Max fraction of nodes to use for PME   */
1391                          int    minPMEnodes,           /* Min fraction of nodes to use for PME   */
1392                          int    npme_fixed,            /* If >= -1, test fixed number of PME
1393                                                         * nodes only                             */
1394                          const char* npmevalues_opt,   /* Which -npme values should be tested    */
1395                          t_perf**    perfdata,         /* Here the performace data is stored     */
1396                          int*        pmeentries,       /* Entries in the nPMEnodes list          */
1397                          int         repeats,          /* Repeat each test this often            */
1398                          int         nnodes,           /* Total number of nodes = nPP + nPME     */
1399                          int         nr_tprs,          /* Total number of tpr files to test      */
1400                          gmx_bool    bThreads,         /* Threads or MPI?                        */
1401                          char*       cmd_mpirun,       /* mpirun command string                  */
1402                          char*       cmd_np,           /* "-np", "-n", whatever mpirun needs     */
1403                          char*       cmd_mdrun,        /* mdrun command string                   */
1404                          char*       cmd_args_bench,   /* arguments for mdrun in a string        */
1405                          const t_filenm* fnm,          /* List of filenames from command line    */
1406                          int             nfile,        /* Number of files specified on the cmdl. */
1407                          int             presteps,     /* DLB equilibration steps, is checked    */
1408                          int64_t         cpt_steps,    /* Time step counter in the checkpoint    */
1409                          gmx_bool        bCheck,       /* Check whether benchmark mdrun works    */
1410                          const char*     eligible_gpu_ids) /* GPU IDs for
1411                                                         * constructing mdrun command lines */
1412 {
1413     int      i, nr, k, ret, count = 0, totaltests;
1414     int*     nPMEnodes = nullptr;
1415     t_perf*  pd        = nullptr;
1416     int      cmdline_length;
1417     char *   command, *cmd_stub;
1418     char     buf[STRLEN];
1419     gmx_bool bResetProblem = FALSE;
1420     gmx_bool bFirst        = TRUE;
1421
1422     /* This string array corresponds to the eParselog enum type at the start
1423      * of this file */
1424     const char* ParseLog[] = { "OK.",
1425                                "Logfile not found!",
1426                                "No timings, logfile truncated?",
1427                                "Run was terminated.",
1428                                "Counters were not reset properly.",
1429                                "No DD grid found for these settings.",
1430                                "TPX version conflict!",
1431                                "mdrun was not started in parallel!",
1432                                "Number of PP ranks has a prime factor that is too large.",
1433                                "The number of PP ranks did not suit the number of GPUs.",
1434                                "Some GPUs were not detected or are incompatible.",
1435                                "An error occurred." };
1436     char        str_PME_f_load[13];
1437
1438
1439     /* Allocate space for the mdrun command line. 100 extra characters should
1440        be more than enough for the -npme etcetera arguments */
1441     cmdline_length = std::strlen(cmd_mpirun) + std::strlen(cmd_np) + std::strlen(cmd_mdrun)
1442                      + std::strlen(cmd_args_bench) + std::strlen(tpr_names[0]) + 100;
1443     snew(command, cmdline_length);
1444     snew(cmd_stub, cmdline_length);
1445
1446     /* Construct the part of the command line that stays the same for all tests: */
1447     if (bThreads)
1448     {
1449         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1450     }
1451     else
1452     {
1453         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1454     }
1455
1456     /* Create a list of numbers of PME nodes to test */
1457     if (npme_fixed < -1)
1458     {
1459         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes, nnodes, minPMEnodes, maxPMEnodes);
1460     }
1461     else
1462     {
1463         *pmeentries = 1;
1464         snew(nPMEnodes, 1);
1465         nPMEnodes[0] = npme_fixed;
1466         fprintf(stderr, "Will use a fixed number of %d PME-only ranks.\n", nPMEnodes[0]);
1467     }
1468
1469     if (0 == repeats)
1470     {
1471         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1472         gmx_ffclose(fp);
1473         finalize(opt2fn("-p", nfile, fnm));
1474         exit(0);
1475     }
1476
1477     /* Allocate one dataset for each tpr input file: */
1478     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1479
1480     /*****************************************/
1481     /* Main loop over all tpr files to test: */
1482     /*****************************************/
1483     totaltests = nr_tprs * (*pmeentries) * repeats;
1484     for (k = 0; k < nr_tprs; k++)
1485     {
1486         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1487         fprintf(fp, "PME ranks      Gcycles       ns/day        PME/f    Remark\n");
1488         /* Loop over various numbers of PME nodes: */
1489         for (i = 0; i < *pmeentries; i++)
1490         {
1491             pd = &perfdata[k][i];
1492
1493             auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids);
1494
1495             /* Loop over the repeats for each scenario: */
1496             for (nr = 0; nr < repeats; nr++)
1497             {
1498                 pd->nPMEnodes = nPMEnodes[i];
1499
1500                 /* Add -npme and -s to the command line and save it. Note that
1501                  * the -passall (if set) options requires cmd_args_bench to be
1502                  * at the end of the command line string */
1503                 snew(pd->mdrun_cmd_line, cmdline_length);
1504                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s %s", cmd_stub, pd->nPMEnodes,
1505                         tpr_names[k], cmd_args_bench, cmd_gpu_ids.c_str());
1506
1507                 /* To prevent that all benchmarks fail due to a show-stopper argument
1508                  * on the mdrun command line, we make a quick check first.
1509                  * This check can be turned off in cases where the automatically chosen
1510                  * number of PME-only ranks leads to a number of PP ranks for which no
1511                  * decomposition can be found (e.g. for large prime numbers) */
1512                 if (bFirst && bCheck)
1513                 {
1514                     /* TODO This check is really for a functional
1515                      * .tpr, and if we need it, it should take place
1516                      * for every .tpr, and the logic for it should be
1517                      * immediately inside the loop over k, not in
1518                      * this inner loop. */
1519                     char* temporary_cmd_line;
1520
1521                     snew(temporary_cmd_line, cmdline_length);
1522                     /* TODO -npme 0 is more likely to succeed at low
1523                        parallelism than the default of -npme -1, but
1524                        is more likely to fail at the scaling limit
1525                        when the PP domains may be too small. "mpirun
1526                        -np 1 mdrun" is probably a reasonable thing to
1527                        do for this check, but it'll be easier to
1528                        implement that after some refactoring of how
1529                        the number of MPI ranks is managed. */
1530                     sprintf(temporary_cmd_line, "%s-npme 0 -nb cpu -s %s %s", cmd_stub,
1531                             tpr_names[k], cmd_args_bench);
1532                     make_sure_it_runs(temporary_cmd_line, cmdline_length, fp, fnm, nfile);
1533                 }
1534                 bFirst = FALSE;
1535
1536                 /* Do a benchmark simulation: */
1537                 if (repeats > 1)
1538                 {
1539                     sprintf(buf, ", pass %d/%d", nr + 1, repeats);
1540                 }
1541                 else
1542                 {
1543                     buf[0] = '\0';
1544                 }
1545                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1546                         (100.0 * count) / totaltests, k + 1, nr_tprs, i + 1, *pmeentries, buf);
1547                 make_backup(opt2fn("-err", nfile, fnm));
1548                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1549                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1550                 gmx_system_call(command);
1551
1552                 /* Collect the performance data from the log file; also check stderr
1553                  * for fatal errors */
1554                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm), pd, nr,
1555                                     presteps, cpt_steps, nnodes);
1556                 if ((presteps > 0) && (ret == eParselogResetProblem))
1557                 {
1558                     bResetProblem = TRUE;
1559                 }
1560
1561                 if (-1 == pd->nPMEnodes)
1562                 {
1563                     sprintf(buf, "(%3d)", pd->guessPME);
1564                 }
1565                 else
1566                 {
1567                     sprintf(buf, "     ");
1568                 }
1569
1570                 /* Nicer output */
1571                 if (pd->PME_f_load[nr] > 0.0)
1572                 {
1573                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1574                 }
1575                 else
1576                 {
1577                     sprintf(str_PME_f_load, "%s", "         -  ");
1578                 }
1579
1580                 /* Write the data we got to disk */
1581                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes, buf, pd->Gcycles[nr],
1582                         pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1583                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound))
1584                 {
1585                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1586                 }
1587                 fprintf(fp, "\n");
1588                 fflush(fp);
1589                 count++;
1590
1591                 /* Do some cleaning up and delete the files we do not need any more */
1592                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1593
1594                 /* If the first run with this number of processors already failed, do not try again: */
1595                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1596                 {
1597                     fprintf(stdout,
1598                             "Skipping remaining passes of unsuccessful setting, see log file for "
1599                             "details.\n");
1600                     count += repeats - (nr + 1);
1601                     break;
1602                 }
1603             } /* end of repeats loop */
1604         }     /* end of -npme loop */
1605     }         /* end of tpr file loop */
1606
1607     if (bResetProblem)
1608     {
1609         sep_line(fp);
1610         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1611         sep_line(fp);
1612     }
1613     sfree(command);
1614     sfree(cmd_stub);
1615 }
1616
1617
1618 static void check_input(int             nnodes,
1619                         int             repeats,
1620                         int*            ntprs,
1621                         real*           rmin,
1622                         real            rcoulomb,
1623                         real*           rmax,
1624                         real            maxPMEfraction,
1625                         real            minPMEfraction,
1626                         int             npme_fixed,
1627                         int64_t         bench_nsteps,
1628                         const t_filenm* fnm,
1629                         int             nfile,
1630                         int             sim_part,
1631                         int             presteps,
1632                         int             npargs,
1633                         t_pargs*        pa)
1634 {
1635     int old;
1636
1637
1638     /* Make sure the input file exists */
1639     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1640     {
1641         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1642     }
1643
1644     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1645     if ((0 == std::strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm))) && (sim_part > 1))
1646     {
1647         gmx_fatal(FARGS,
1648                   "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not "
1649                   "be identical.\n"
1650                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1651     }
1652
1653     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1654     if (repeats < 0)
1655     {
1656         gmx_fatal(FARGS, "Number of repeats < 0!");
1657     }
1658
1659     /* Check number of nodes */
1660     if (nnodes < 1)
1661     {
1662         gmx_fatal(FARGS, "Number of ranks/threads must be a positive integer.");
1663     }
1664
1665     /* Automatically choose -ntpr if not set */
1666     if (*ntprs < 1)
1667     {
1668         if (nnodes < 16)
1669         {
1670             *ntprs = 1;
1671         }
1672         else
1673         {
1674             *ntprs = 3;
1675             /* Set a reasonable scaling factor for rcoulomb */
1676             if (*rmax <= 0)
1677             {
1678                 *rmax = rcoulomb * 1.2;
1679             }
1680         }
1681         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1682     }
1683     else
1684     {
1685         if (1 == *ntprs)
1686         {
1687             fprintf(stderr,
1688                     "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1689         }
1690     }
1691
1692     /* Make shure that rmin <= rcoulomb <= rmax */
1693     if (*rmin <= 0)
1694     {
1695         *rmin = rcoulomb;
1696     }
1697     if (*rmax <= 0)
1698     {
1699         *rmax = rcoulomb;
1700     }
1701     if (!(*rmin <= *rmax))
1702     {
1703         gmx_fatal(FARGS,
1704                   "Please choose the Coulomb radii such that rmin <= rmax.\n"
1705                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n",
1706                   *rmin, *rmax, rcoulomb);
1707     }
1708     /* Add test scenarios if rmin or rmax were set */
1709     if (*ntprs <= 2)
1710     {
1711         if (!gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1))
1712         {
1713             (*ntprs)++;
1714             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n", *rmin, *ntprs);
1715         }
1716         if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1))
1717         {
1718             (*ntprs)++;
1719             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n", *rmax, *ntprs);
1720         }
1721     }
1722     old = *ntprs;
1723     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1724     if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) || !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS))
1725     {
1726         *ntprs = std::max(*ntprs, 2);
1727     }
1728
1729     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1730     if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS))
1731     {
1732         *ntprs = std::max(*ntprs, 3);
1733     }
1734
1735     if (old != *ntprs)
1736     {
1737         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1738     }
1739
1740     if (*ntprs > 1)
1741     {
1742         if (gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS)
1743             && gmx_within_tol(rcoulomb, *rmax, GMX_REAL_EPS)) /* We have just a single rc */
1744         {
1745             fprintf(stderr,
1746                     "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1747                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1748                     "with correspondingly adjusted PME grid settings\n");
1749             *ntprs = 1;
1750         }
1751     }
1752
1753     /* Check whether max and min fraction are within required values */
1754     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1755     {
1756         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1757     }
1758     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1759     {
1760         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1761     }
1762     if (maxPMEfraction < minPMEfraction)
1763     {
1764         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1765     }
1766
1767     /* Check whether the number of steps - if it was set - has a reasonable value */
1768     if (bench_nsteps < 0)
1769     {
1770         gmx_fatal(FARGS, "Number of steps must be positive.");
1771     }
1772
1773     if (bench_nsteps > 10000 || bench_nsteps < 100)
1774     {
1775         fprintf(stderr, "WARNING: steps=");
1776         fprintf(stderr, "%" PRId64, bench_nsteps);
1777         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n",
1778                 (bench_nsteps < 100) ? "few" : "many");
1779     }
1780
1781     if (presteps < 0)
1782     {
1783         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1784     }
1785
1786     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1787     if (*ntprs > 1)
1788     {
1789         if (*rmin / rcoulomb < 0.75 || *rmax / rcoulomb > 1.25)
1790         {
1791             fprintf(stderr,
1792                     "WARNING: Applying extreme scaling factor. I hope you know what you are "
1793                     "doing.\n");
1794         }
1795     }
1796
1797     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1798      * only. We need to check whether the requested number of PME-only nodes
1799      * makes sense. */
1800     if (npme_fixed > -1)
1801     {
1802         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1803         if (2 * npme_fixed > nnodes)
1804         {
1805             gmx_fatal(FARGS,
1806                       "Cannot have more than %d PME-only ranks for a total of %d ranks (you chose "
1807                       "%d).\n",
1808                       nnodes / 2, nnodes, npme_fixed);
1809         }
1810         if ((npme_fixed > 0) && (5 * npme_fixed < nnodes))
1811         {
1812             fprintf(stderr,
1813                     "WARNING: Only %g percent of the ranks are assigned as PME-only ranks.\n",
1814                     (100.0 * npme_fixed) / nnodes);
1815         }
1816         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1817         {
1818             fprintf(stderr,
1819                     "NOTE: The -min, -max, and -npme options have no effect when a\n"
1820                     "      fixed number of PME-only ranks is requested with -fix.\n");
1821         }
1822     }
1823 }
1824
1825
1826 /* Returns TRUE when "opt" is needed at launch time */
1827 static gmx_bool is_launch_file(char* opt, gmx_bool bSet)
1828 {
1829     if (0 == std::strncmp(opt, "-swap", 5))
1830     {
1831         return bSet;
1832     }
1833
1834     /* Apart from the input .tpr and the output log files we need all options that
1835      * were set on the command line and that do not start with -b */
1836     if (0 == std::strncmp(opt, "-b", 2) || 0 == std::strncmp(opt, "-s", 2)
1837         || 0 == std::strncmp(opt, "-err", 4) || 0 == std::strncmp(opt, "-p", 2))
1838     {
1839         return FALSE;
1840     }
1841
1842     return bSet;
1843 }
1844
1845
1846 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1847 static gmx_bool is_bench_file(char* opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1848 {
1849     /* Apart from the input .tpr, all files starting with "-b" are for
1850      * _b_enchmark files exclusively */
1851     if (0 == std::strncmp(opt, "-s", 2))
1852     {
1853         return FALSE;
1854     }
1855
1856     if (0 == std::strncmp(opt, "-b", 2) || 0 == std::strncmp(opt, "-s", 2))
1857     {
1858         return !bOptional || bSet;
1859     }
1860     else
1861     {
1862         if (bIsOutput)
1863         {
1864             return FALSE;
1865         }
1866         else
1867         {
1868             return bSet; /* These are additional input files like -cpi -ei */
1869         }
1870     }
1871 }
1872
1873
1874 /* Adds 'buf' to 'str' */
1875 static void add_to_string(char** str, const char* buf)
1876 {
1877     int len;
1878
1879
1880     len = std::strlen(*str) + std::strlen(buf) + 1;
1881     srenew(*str, len);
1882     std::strcat(*str, buf);
1883 }
1884
1885
1886 /* Create the command line for the benchmark as well as for the real run */
1887 static void
1888 create_command_line_snippets(gmx_bool bAppendFiles,
1889                              gmx_bool bKeepAndNumCPT,
1890                              gmx_bool bResetHWay,
1891                              int      presteps,
1892                              int      nfile,
1893                              t_filenm fnm[],
1894                              char* cmd_args_bench[], /* command line arguments for benchmark runs */
1895                              char* cmd_args_launch[], /* command line arguments for simulation run */
1896                              char  extra_args[],      /* Add this to the end of the command line */
1897                              char* deffnm)            /* Default file names, or NULL if not set */
1898 {
1899     int         i;
1900     char*       opt;
1901     const char* name;
1902     char        strbuf[STRLEN];
1903
1904
1905     /* strlen needs at least '\0' as a string: */
1906     snew(*cmd_args_bench, 1);
1907     snew(*cmd_args_launch, 1);
1908     *cmd_args_launch[0] = '\0';
1909     *cmd_args_bench[0]  = '\0';
1910
1911
1912     /*******************************************/
1913     /* 1. Process other command line arguments */
1914     /*******************************************/
1915     if (presteps > 0)
1916     {
1917         /* Add equilibration steps to benchmark options */
1918         sprintf(strbuf, "-resetstep %d ", presteps);
1919         add_to_string(cmd_args_bench, strbuf);
1920     }
1921     /* These switches take effect only at launch time */
1922     if (deffnm)
1923     {
1924         sprintf(strbuf, "-deffnm %s ", deffnm);
1925         add_to_string(cmd_args_launch, strbuf);
1926     }
1927     if (!bAppendFiles)
1928     {
1929         add_to_string(cmd_args_launch, "-noappend ");
1930     }
1931     if (bKeepAndNumCPT)
1932     {
1933         add_to_string(cmd_args_launch, "-cpnum ");
1934     }
1935     if (bResetHWay)
1936     {
1937         add_to_string(cmd_args_launch, "-resethway ");
1938     }
1939
1940     /********************/
1941     /* 2. Process files */
1942     /********************/
1943     for (i = 0; i < nfile; i++)
1944     {
1945         opt  = const_cast<char*>(fnm[i].opt);
1946         name = opt2fn(opt, nfile, fnm);
1947
1948         /* Strbuf contains the options, now let's sort out where we need that */
1949         sprintf(strbuf, "%s %s ", opt, name);
1950
1951         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])))
1952         {
1953             /* All options starting with -b* need the 'b' removed,
1954              * therefore overwrite strbuf */
1955             if (0 == std::strncmp(opt, "-b", 2))
1956             {
1957                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1958             }
1959
1960             add_to_string(cmd_args_bench, strbuf);
1961         }
1962
1963         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)))
1964         {
1965             add_to_string(cmd_args_launch, strbuf);
1966         }
1967     }
1968
1969     add_to_string(cmd_args_bench, extra_args);
1970     add_to_string(cmd_args_launch, extra_args);
1971 }
1972
1973
1974 /* Set option opt */
1975 static void setopt(const char* opt, int nfile, t_filenm fnm[])
1976 {
1977     int i;
1978
1979     for (i = 0; (i < nfile); i++)
1980     {
1981         if (std::strcmp(opt, fnm[i].opt) == 0)
1982         {
1983             fnm[i].flag |= ffSET;
1984         }
1985     }
1986 }
1987
1988
1989 /* This routine inspects the tpr file and ...
1990  * 1. checks for output files that get triggered by a tpr option. These output
1991  *    files are marked as 'set' to allow for a proper cleanup after each
1992  *    tuning run.
1993  * 2. returns the PME:PP load ratio
1994  * 3. returns rcoulomb from the tpr */
1995 static float inspect_tpr(int nfile, t_filenm fnm[], real* rcoulomb)
1996 {
1997     gmx_bool   bTpi;  /* Is test particle insertion requested?          */
1998     gmx_bool   bFree; /* Is a free energy simulation requested?         */
1999     gmx_bool   bNM;   /* Is a normal mode analysis requested?           */
2000     gmx_bool   bSwap; /* Is water/ion position swapping requested?      */
2001     t_state    state;
2002     gmx_mtop_t mtop;
2003
2004
2005     /* Check tpr file for options that trigger extra output files */
2006     t_inputrec  irInstance;
2007     t_inputrec* ir = &irInstance;
2008     read_tpx_state(opt2fn("-s", nfile, fnm), ir, &state, &mtop);
2009     bFree = (efepNO != ir->efep);
2010     bNM   = (eiNM == ir->eI);
2011     bSwap = (eswapNO != ir->eSwapCoords);
2012     bTpi  = EI_TPI(ir->eI);
2013
2014     /* Set these output files on the tuning command-line */
2015     if (ir->bPull)
2016     {
2017         setopt("-pf", nfile, fnm);
2018         setopt("-px", nfile, fnm);
2019     }
2020     if (bFree)
2021     {
2022         setopt("-dhdl", nfile, fnm);
2023     }
2024     if (bTpi)
2025     {
2026         setopt("-tpi", nfile, fnm);
2027         setopt("-tpid", nfile, fnm);
2028     }
2029     if (bNM)
2030     {
2031         setopt("-mtx", nfile, fnm);
2032     }
2033     if (bSwap)
2034     {
2035         setopt("-swap", nfile, fnm);
2036     }
2037
2038     *rcoulomb = ir->rcoulomb;
2039
2040     /* Return the estimate for the number of PME nodes */
2041     float npme = pme_load_estimate(mtop, *ir, state.box);
2042     return npme;
2043 }
2044
2045
2046 static void couple_files_options(int nfile, t_filenm fnm[])
2047 {
2048     int      i;
2049     gmx_bool bSet, bBench;
2050     char*    opt;
2051     char     buf[20];
2052
2053
2054     for (i = 0; i < nfile; i++)
2055     {
2056         opt    = const_cast<char*>(fnm[i].opt);
2057         bSet   = ((fnm[i].flag & ffSET) != 0);
2058         bBench = (0 == std::strncmp(opt, "-b", 2));
2059
2060         /* Check optional files */
2061         /* If e.g. -eo is set, then -beo also needs to be set */
2062         if (is_optional(&fnm[i]) && bSet && !bBench)
2063         {
2064             sprintf(buf, "-b%s", &opt[1]);
2065             setopt(buf, nfile, fnm);
2066         }
2067         /* If -beo is set, then -eo also needs to be! */
2068         if (is_optional(&fnm[i]) && bSet && bBench)
2069         {
2070             sprintf(buf, "-%s", &opt[2]);
2071             setopt(buf, nfile, fnm);
2072         }
2073     }
2074 }
2075
2076
2077 #define BENCHSTEPS (1000)
2078
2079 int gmx_tune_pme(int argc, char* argv[])
2080 {
2081     const char* desc[] = {
2082         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of ranks, [THISMODULE] systematically",
2083         "times [gmx-mdrun] with various numbers of PME-only ranks and determines",
2084         "which setting is fastest. It will also test whether performance can",
2085         "be enhanced by shifting load from the reciprocal to the real space",
2086         "part of the Ewald sum. ",
2087         "Simply pass your [REF].tpr[ref] file to [THISMODULE] together with other options",
2088         "for [gmx-mdrun] as needed.[PAR]",
2089         "[THISMODULE] needs to call [gmx-mdrun] and so requires that you",
2090         "specify how to call mdrun with the argument to the [TT]-mdrun[tt]",
2091         "parameter. Depending how you have built GROMACS, values such as",
2092         "'gmx mdrun', 'gmx_d mdrun', or 'mdrun_mpi' might be needed.[PAR]",
2093         "The program that runs MPI programs can be set in the environment variable",
2094         "MPIRUN (defaults to 'mpirun'). Note that for certain MPI frameworks,",
2095         "you need to provide a machine- or hostfile. This can also be passed",
2096         "via the MPIRUN variable, e.g.[PAR]",
2097         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt]",
2098         "Note that in such cases it is normally necessary to compile",
2099         "and/or run [THISMODULE] without MPI support, so that it can call",
2100         "the MPIRUN program.[PAR]",
2101         "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
2102         "check whether [gmx-mdrun] works as expected with the provided parallel settings",
2103         "if the [TT]-check[tt] option is activated (the default).",
2104         "Please call [THISMODULE] with the normal options you would pass to",
2105         "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
2106         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2107         "to repeat each test several times to get better statistics. [PAR]",
2108         "[THISMODULE] can test various real space / reciprocal space workloads",
2109         "for you. With [TT]-ntpr[tt] you control how many extra [REF].tpr[ref] files will be",
2110         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2111         "Typically, the first test (number 0) will be with the settings from the input",
2112         "[REF].tpr[ref] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2113         "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
2114         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2115         "The remaining [REF].tpr[ref] files will have equally-spaced Coulomb radii (and Fourier ",
2116         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2117         "if you just seek the optimal number of PME-only ranks; in that case",
2118         "your input [REF].tpr[ref] file will remain unchanged.[PAR]",
2119         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2120         "MD systems. The dynamic load balancing needs about 100 time steps",
2121         "to adapt to local load imbalances, therefore the time step counters",
2122         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2123         "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher ",
2124         "value.",
2125         "From the 'DD' load imbalance entries in the md.log output file you",
2126         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]",
2127         "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2128         "After calling [gmx-mdrun] several times, detailed performance information",
2129         "is available in the output file [TT]perf.out[tt].",
2130         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2131         "(options [TT]-b*[tt]), these will be automatically deleted after each test.[PAR]",
2132         "If you want the simulation to be started automatically with the",
2133         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2134         "Basic support for GPU-enabled [TT]mdrun[tt] exists. Give a string containing the IDs",
2135         "of the GPUs that you wish to use in the optimization in the [TT]-gpu_id[tt]",
2136         "command-line argument. This works exactly like [TT]mdrun -gpu_id[tt], does not imply a ",
2137         "mapping,",
2138         "and merely declares the eligible set of GPU devices. [TT]gmx-tune_pme[tt] will construct ",
2139         "calls to",
2140         "mdrun that use this set appropriately. [TT]gmx-tune_pme[tt] does not support",
2141         "[TT]-gputasks[tt].[PAR]",
2142     };
2143
2144     int   nnodes         = 1;
2145     int   repeats        = 2;
2146     int   pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2147     real  maxPMEfraction = 0.50;
2148     real  minPMEfraction = 0.25;
2149     int   maxPMEnodes, minPMEnodes;
2150     float guessPMEratio; /* guessed PME:PP ratio based on the tpr file */
2151     float guessPMEnodes;
2152     int   npme_fixed = -2; /* If >= -1, use only this number
2153                             * of PME-only nodes                */
2154     int         ntprs = 0;
2155     real        rmin = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2156     real        rcoulomb       = -1.0;  /* Coulomb radius as set in .tpr file */
2157     gmx_bool    bScaleRvdw     = TRUE;
2158     int64_t     bench_nsteps   = BENCHSTEPS;
2159     int64_t     new_sim_nsteps = -1;   /* -1 indicates: not set by the user */
2160     int64_t     cpt_steps      = 0;    /* Step counter in .cpt input file   */
2161     int         presteps       = 1500; /* Do a full cycle reset after presteps steps */
2162     gmx_bool    bOverwrite     = FALSE, bKeepTPR;
2163     gmx_bool    bLaunch        = FALSE;
2164     char*       ExtraArgs      = nullptr;
2165     char**      tpr_names      = nullptr;
2166     const char* simulation_tpr = nullptr;
2167     char*       deffnm         = nullptr;
2168     int         best_npme, best_tpr;
2169     int         sim_part = 1; /* For benchmarks with checkpoint files */
2170     char        bbuf[STRLEN];
2171
2172
2173     /* Default program names if nothing else is found */
2174     char *cmd_mpirun = nullptr, *cmd_mdrun = nullptr;
2175     char *cmd_args_bench, *cmd_args_launch;
2176     char* cmd_np = nullptr;
2177
2178     /* IDs of GPUs that are eligible for computation */
2179     char* eligible_gpu_ids = nullptr;
2180
2181     t_perf**     perfdata = nullptr;
2182     t_inputinfo* info;
2183     int          i;
2184     FILE*        fp;
2185
2186     /* Print out how long the tuning took */
2187     double seconds;
2188
2189     static t_filenm fnm[] = { /* tune_pme */
2190                               { efOUT, "-p", "perf", ffWRITE },
2191                               { efLOG, "-err", "bencherr", ffWRITE },
2192                               { efTPR, "-so", "tuned", ffWRITE },
2193                               /* mdrun: */
2194                               { efTPR, "-s", nullptr, ffREAD },
2195                               { efTRN, "-o", nullptr, ffWRITE },
2196                               { efCOMPRESSED, "-x", nullptr, ffOPTWR },
2197                               { efCPT, "-cpi", nullptr, ffOPTRD },
2198                               { efCPT, "-cpo", nullptr, ffOPTWR },
2199                               { efSTO, "-c", "confout", ffWRITE },
2200                               { efEDR, "-e", "ener", ffWRITE },
2201                               { efLOG, "-g", "md", ffWRITE },
2202                               { efXVG, "-dhdl", "dhdl", ffOPTWR },
2203                               { efXVG, "-field", "field", ffOPTWR },
2204                               { efXVG, "-table", "table", ffOPTRD },
2205                               { efXVG, "-tablep", "tablep", ffOPTRD },
2206                               { efXVG, "-tableb", "table", ffOPTRD },
2207                               { efTRX, "-rerun", "rerun", ffOPTRD },
2208                               { efXVG, "-tpi", "tpi", ffOPTWR },
2209                               { efXVG, "-tpid", "tpidist", ffOPTWR },
2210                               { efEDI, "-ei", "sam", ffOPTRD },
2211                               { efXVG, "-eo", "edsam", ffOPTWR },
2212                               { efXVG, "-px", "pullx", ffOPTWR },
2213                               { efXVG, "-pf", "pullf", ffOPTWR },
2214                               { efXVG, "-ro", "rotation", ffOPTWR },
2215                               { efLOG, "-ra", "rotangles", ffOPTWR },
2216                               { efLOG, "-rs", "rotslabs", ffOPTWR },
2217                               { efLOG, "-rt", "rottorque", ffOPTWR },
2218                               { efMTX, "-mtx", "nm", ffOPTWR },
2219                               { efXVG, "-swap", "swapions", ffOPTWR },
2220                               /* Output files that are deleted after each benchmark run */
2221                               { efTRN, "-bo", "bench", ffWRITE },
2222                               { efXTC, "-bx", "bench", ffWRITE },
2223                               { efCPT, "-bcpo", "bench", ffWRITE },
2224                               { efSTO, "-bc", "bench", ffWRITE },
2225                               { efEDR, "-be", "bench", ffWRITE },
2226                               { efLOG, "-bg", "bench", ffWRITE },
2227                               { efXVG, "-beo", "benchedo", ffOPTWR },
2228                               { efXVG, "-bdhdl", "benchdhdl", ffOPTWR },
2229                               { efXVG, "-bfield", "benchfld", ffOPTWR },
2230                               { efXVG, "-btpi", "benchtpi", ffOPTWR },
2231                               { efXVG, "-btpid", "benchtpid", ffOPTWR },
2232                               { efXVG, "-bdevout", "benchdev", ffOPTWR },
2233                               { efXVG, "-brunav", "benchrnav", ffOPTWR },
2234                               { efXVG, "-bpx", "benchpx", ffOPTWR },
2235                               { efXVG, "-bpf", "benchpf", ffOPTWR },
2236                               { efXVG, "-bro", "benchrot", ffOPTWR },
2237                               { efLOG, "-bra", "benchrota", ffOPTWR },
2238                               { efLOG, "-brs", "benchrots", ffOPTWR },
2239                               { efLOG, "-brt", "benchrott", ffOPTWR },
2240                               { efMTX, "-bmtx", "benchn", ffOPTWR },
2241                               { efNDX, "-bdn", "bench", ffOPTWR },
2242                               { efXVG, "-bswap", "benchswp", ffOPTWR }
2243     };
2244
2245     gmx_bool bThreads = FALSE;
2246
2247     int nthreads = 1;
2248
2249     const char* procstring[]     = { nullptr, "np", "n", "none", nullptr };
2250     const char* npmevalues_opt[] = { nullptr, "auto", "all", "subset", nullptr };
2251
2252     gmx_bool bAppendFiles          = TRUE;
2253     gmx_bool bKeepAndNumCPT        = FALSE;
2254     gmx_bool bResetCountersHalfWay = FALSE;
2255     gmx_bool bBenchmark            = TRUE;
2256     gmx_bool bCheck                = TRUE;
2257
2258     gmx_output_env_t* oenv = nullptr;
2259
2260     t_pargs pa[] = {
2261         /***********************/
2262         /* tune_pme options: */
2263         /***********************/
2264         { "-mdrun",
2265           FALSE,
2266           etSTR,
2267           { &cmd_mdrun },
2268           "Command line to run a simulation, e.g. 'gmx mdrun' or 'mdrun_mpi'" },
2269         { "-np",
2270           FALSE,
2271           etINT,
2272           { &nnodes },
2273           "Number of ranks to run the tests on (must be > 2 for separate PME ranks)" },
2274         { "-npstring",
2275           FALSE,
2276           etENUM,
2277           { procstring },
2278           "Name of the [TT]$MPIRUN[tt] option that specifies the number of ranks to use ('np', or "
2279           "'n'; use 'none' if there is no such option)" },
2280         { "-ntmpi",
2281           FALSE,
2282           etINT,
2283           { &nthreads },
2284           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)" },
2285         { "-r", FALSE, etINT, { &repeats }, "Repeat each test this often" },
2286         { "-max", FALSE, etREAL, { &maxPMEfraction }, "Max fraction of PME ranks to test with" },
2287         { "-min", FALSE, etREAL, { &minPMEfraction }, "Min fraction of PME ranks to test with" },
2288         { "-npme",
2289           FALSE,
2290           etENUM,
2291           { npmevalues_opt },
2292           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a "
2293           "reasonable subset. "
2294           "Auto neglects -min and -max and chooses reasonable values around a guess for npme "
2295           "derived from the .tpr" },
2296         { "-fix",
2297           FALSE,
2298           etINT,
2299           { &npme_fixed },
2300           "If >= -1, do not vary the number of PME-only ranks, instead use this fixed value and "
2301           "only vary rcoulomb and the PME grid spacing." },
2302         { "-rmax",
2303           FALSE,
2304           etREAL,
2305           { &rmax },
2306           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid "
2307           "downscaling)" },
2308         { "-rmin", FALSE, etREAL, { &rmin }, "If >0, minimal rcoulomb for -ntpr>1" },
2309         { "-scalevdw", FALSE, etBOOL, { &bScaleRvdw }, "Scale rvdw along with rcoulomb" },
2310         { "-ntpr",
2311           FALSE,
2312           etINT,
2313           { &ntprs },
2314           "Number of [REF].tpr[ref] files to benchmark. Create this many files with different "
2315           "rcoulomb scaling factors depending on -rmin and -rmax. "
2316           "If < 1, automatically choose the number of [REF].tpr[ref] files to test" },
2317         { "-steps",
2318           FALSE,
2319           etINT64,
2320           { &bench_nsteps },
2321           "Take timings for this many steps in the benchmark runs" },
2322         { "-resetstep",
2323           FALSE,
2324           etINT,
2325           { &presteps },
2326           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters "
2327           "after this many steps)" },
2328         { "-nsteps",
2329           FALSE,
2330           etINT64,
2331           { &new_sim_nsteps },
2332           "If non-negative, perform this many steps in the real run (overwrites nsteps from "
2333           "[REF].tpr[ref], add [REF].cpt[ref] steps)" },
2334         { "-launch", FALSE, etBOOL, { &bLaunch }, "Launch the real simulation after optimization" },
2335         { "-bench",
2336           FALSE,
2337           etBOOL,
2338           { &bBenchmark },
2339           "Run the benchmarks or just create the input [REF].tpr[ref] files?" },
2340         { "-check",
2341           FALSE,
2342           etBOOL,
2343           { &bCheck },
2344           "Before the benchmark runs, check whether mdrun works in parallel" },
2345         { "-gpu_id",
2346           FALSE,
2347           etSTR,
2348           { &eligible_gpu_ids },
2349           "List of unique GPU device IDs that are eligible for use" },
2350         /******************/
2351         /* mdrun options: */
2352         /******************/
2353         /* We let tune_pme parse and understand these options, because we need to
2354          * prevent that they appear on the mdrun command line for the benchmarks */
2355         { "-append",
2356           FALSE,
2357           etBOOL,
2358           { &bAppendFiles },
2359           "Append to previous output files when continuing from checkpoint instead of adding the "
2360           "simulation part number to all file names (for launch only)" },
2361         { "-cpnum",
2362           FALSE,
2363           etBOOL,
2364           { &bKeepAndNumCPT },
2365           "Keep and number checkpoint files (launch only)" },
2366         { "-deffnm", FALSE, etSTR, { &deffnm }, "Set the default filenames (launch only)" },
2367         { "-resethway",
2368           FALSE,
2369           etBOOL,
2370           { &bResetCountersHalfWay },
2371           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] "
2372           "(launch only)" }
2373     };
2374
2375 #define NFILE asize(fnm)
2376
2377     seconds = gmx_gettime();
2378
2379     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS, NFILE, fnm, asize(pa), pa, asize(desc),
2380                            desc, 0, nullptr, &oenv))
2381     {
2382         return 0;
2383     }
2384
2385     // procstring[0] is used inside two different conditionals further down
2386     GMX_RELEASE_ASSERT(procstring[0] != nullptr, "Options inconsistency; procstring[0] is NULL");
2387
2388     /* Store the remaining unparsed command line entries in a string which
2389      * is then attached to the mdrun command line */
2390     snew(ExtraArgs, 1);
2391     ExtraArgs[0] = '\0';
2392     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2393     {
2394         add_to_string(&ExtraArgs, argv[i]);
2395         add_to_string(&ExtraArgs, " ");
2396     }
2397
2398     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2399     {
2400         bThreads = TRUE;
2401         if (opt2parg_bSet("-npstring", asize(pa), pa))
2402         {
2403             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2404         }
2405
2406         if (nnodes > 1)
2407         {
2408             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2409         }
2410         /* and now we just set this; a bit of an ugly hack*/
2411         nnodes = nthreads;
2412     }
2413     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2414     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2415
2416     /* Automatically set -beo options if -eo is set etc. */
2417     couple_files_options(NFILE, fnm);
2418
2419     /* Construct the command line arguments for benchmark runs
2420      * as well as for the simulation run */
2421     if (bThreads)
2422     {
2423         sprintf(bbuf, " -ntmpi %d ", nthreads);
2424     }
2425     else
2426     {
2427         /* This string will be used for MPI runs and will appear after the
2428          * mpirun command. */
2429         if (std::strcmp(procstring[0], "none") != 0)
2430         {
2431             sprintf(bbuf, " -%s %d ", procstring[0], nnodes);
2432         }
2433         else
2434         {
2435             sprintf(bbuf, " ");
2436         }
2437     }
2438
2439     cmd_np = bbuf;
2440
2441     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2442                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs, deffnm);
2443
2444     /* Prepare to use checkpoint file if requested */
2445     sim_part = 1;
2446     if (opt2bSet("-cpi", NFILE, fnm))
2447     {
2448         const char* filename = opt2fn("-cpi", NFILE, fnm);
2449         int         cpt_sim_part;
2450         read_checkpoint_part_and_step(filename, &cpt_sim_part, &cpt_steps);
2451         if (cpt_sim_part == 0)
2452         {
2453             gmx_fatal(FARGS, "Checkpoint file %s could not be read!", filename);
2454         }
2455         /* tune_pme will run the next part of the simulation */
2456         sim_part = cpt_sim_part + 1;
2457     }
2458
2459     /* Open performance output file and write header info */
2460     fp = gmx_ffopen(opt2fn("-p", NFILE, fnm), "w");
2461
2462     /* Make a quick consistency check of command line parameters */
2463     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax, maxPMEfraction, minPMEfraction,
2464                 npme_fixed, bench_nsteps, fnm, NFILE, sim_part, presteps, asize(pa), pa);
2465
2466     /* Determine the maximum and minimum number of PME nodes to test,
2467      * the actual list of settings is build in do_the_tests(). */
2468     if ((nnodes > 2) && (npme_fixed < -1))
2469     {
2470         if (0 == std::strcmp(npmevalues_opt[0], "auto"))
2471         {
2472             /* Determine the npme range automatically based on the PME:PP load guess */
2473             if (guessPMEratio > 1.0)
2474             {
2475                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2476                 maxPMEnodes = nnodes / 2;
2477                 minPMEnodes = nnodes / 2;
2478             }
2479             else
2480             {
2481                 /* PME : PP load is in the range 0..1, let's test around the guess */
2482                 guessPMEnodes = static_cast<int>(nnodes / (1.0 + 1.0 / guessPMEratio));
2483                 minPMEnodes   = static_cast<int>(std::floor(0.7 * guessPMEnodes));
2484                 maxPMEnodes   = static_cast<int>(std::ceil(1.6 * guessPMEnodes));
2485                 maxPMEnodes   = std::min(maxPMEnodes, nnodes / 2);
2486             }
2487         }
2488         else
2489         {
2490             /* Determine the npme range based on user input */
2491             maxPMEnodes = static_cast<int>(std::floor(maxPMEfraction * nnodes));
2492             minPMEnodes = std::max(static_cast<int>(std::floor(minPMEfraction * nnodes)), 0);
2493             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2494             if (maxPMEnodes != minPMEnodes)
2495             {
2496                 fprintf(stdout, "- %d ", maxPMEnodes);
2497             }
2498             fprintf(stdout,
2499                     "PME-only ranks.\n  Note that the automatic number of PME-only ranks and no "
2500                     "separate PME ranks are always tested.\n");
2501         }
2502     }
2503     else
2504     {
2505         maxPMEnodes = 0;
2506         minPMEnodes = 0;
2507     }
2508
2509     /* Get the commands we need to set up the runs from environment variables */
2510     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2511     if (bBenchmark && repeats > 0)
2512     {
2513         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun, nullptr != eligible_gpu_ids);
2514     }
2515
2516     /* Print some header info to file */
2517     sep_line(fp);
2518     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2519     sep_line(fp);
2520     fprintf(fp, "%s for GROMACS %s\n", output_env_get_program_display_name(oenv), gmx_version());
2521     if (!bThreads)
2522     {
2523         fprintf(fp, "Number of ranks         : %d\n", nnodes);
2524         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2525         if (std::strcmp(procstring[0], "none") != 0)
2526         {
2527             fprintf(fp, "Passing # of ranks via  : -%s\n", procstring[0]);
2528         }
2529         else
2530         {
2531             fprintf(fp, "Not setting number of ranks in system call\n");
2532         }
2533     }
2534     else
2535     {
2536         fprintf(fp, "Number of threads       : %d\n", nnodes);
2537     }
2538
2539     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2540     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2541     fprintf(fp, "Benchmark steps         : ");
2542     fprintf(fp, "%" PRId64, bench_nsteps);
2543     fprintf(fp, "\n");
2544     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2545     if (sim_part > 1)
2546     {
2547         fprintf(fp, "Checkpoint time step    : ");
2548         fprintf(fp, "%" PRId64, cpt_steps);
2549         fprintf(fp, "\n");
2550     }
2551     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2552
2553     if (new_sim_nsteps >= 0)
2554     {
2555         bOverwrite = TRUE;
2556         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2557         fprintf(stderr, "%" PRId64, new_sim_nsteps + cpt_steps);
2558         fprintf(stderr, " steps.\n");
2559         fprintf(fp, "Simulation steps        : ");
2560         fprintf(fp, "%" PRId64, new_sim_nsteps);
2561         fprintf(fp, "\n");
2562     }
2563     if (repeats > 1)
2564     {
2565         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2566     }
2567
2568     if (npme_fixed >= -1)
2569     {
2570         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2571     }
2572
2573     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2574     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2575
2576     /* Allocate memory for the inputinfo struct: */
2577     snew(info, 1);
2578     info->nr_inputfiles = ntprs;
2579     for (i = 0; i < ntprs; i++)
2580     {
2581         snew(info->rcoulomb, ntprs);
2582         snew(info->rvdw, ntprs);
2583         snew(info->rlist, ntprs);
2584         snew(info->nkx, ntprs);
2585         snew(info->nky, ntprs);
2586         snew(info->nkz, ntprs);
2587         snew(info->fsx, ntprs);
2588         snew(info->fsy, ntprs);
2589         snew(info->fsz, ntprs);
2590     }
2591     /* Make alternative tpr files to test: */
2592     snew(tpr_names, ntprs);
2593     for (i = 0; i < ntprs; i++)
2594     {
2595         snew(tpr_names[i], STRLEN);
2596     }
2597
2598     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2599      * different grids could be found. */
2600     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps + presteps, cpt_steps,
2601                         rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2602
2603     /********************************************************************************/
2604     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2605     /********************************************************************************/
2606     snew(perfdata, ntprs);
2607     if (bBenchmark)
2608     {
2609         GMX_RELEASE_ASSERT(npmevalues_opt[0] != nullptr,
2610                            "Options inconsistency; npmevalues_opt[0] is NULL");
2611         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata,
2612                      &pmeentries, repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2613                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck, eligible_gpu_ids);
2614
2615         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime() - seconds) / 60.0);
2616
2617         /* Analyse the results and give a suggestion for optimal settings: */
2618         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2619                                 repeats, info, &best_tpr, &best_npme);
2620
2621         /* Take the best-performing tpr file and enlarge nsteps to original value */
2622         if (bKeepTPR && !bOverwrite)
2623         {
2624             simulation_tpr = opt2fn("-s", NFILE, fnm);
2625         }
2626         else
2627         {
2628             simulation_tpr = opt2fn("-so", NFILE, fnm);
2629             modify_PMEsettings(bOverwrite ? (new_sim_nsteps + cpt_steps) : info->orig_sim_steps,
2630                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2631         }
2632
2633         /* Let's get rid of the temporary benchmark input files */
2634         for (i = 0; i < ntprs; i++)
2635         {
2636             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2637             remove(tpr_names[i]);
2638         }
2639
2640         /* Now start the real simulation if the user requested it ... */
2641         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, cmd_args_launch,
2642                           simulation_tpr, best_npme, eligible_gpu_ids);
2643     }
2644     gmx_ffclose(fp);
2645
2646     /* ... or simply print the performance results to screen: */
2647     if (!bLaunch)
2648     {
2649         finalize(opt2fn("-p", NFILE, fnm));
2650     }
2651
2652     return 0;
2653 }