src/tools/gmx_tune_pme.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #include "statutil.h"
  35 #include "typedefs.h"
  36 #include "smalloc.h"
  37 #include "vec.h"
  38 #include "copyrite.h"
  39 #include "statutil.h"
  40 #include "tpxio.h"
  41 #include "string2.h"
  42 #include "readinp.h"
  43 #include "calcgrid.h"
  44 #include "checkpoint.h"
  45 #include "gmx_ana.h"
  46 #include "names.h"
  47
  48
  49
  50 enum {
  51   ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
  52 };
  53
  54 /* Enum for situations that can occur during log file parsing, the
  55  * corresponding string entries can be found in do_the_tests() in
  56  * const char* ParseLog[] */
  57 enum {
  58     eParselogOK,
  59     eParselogNotFound,
  60     eParselogNoPerfData,
  61     eParselogTerm,
  62     eParselogResetProblem,
  63     eParselogNoDDGrid,
  64     eParselogTPXVersion,
  65     eParselogNotParallel,
  66     eParselogFatal,
  67     eParselogNr
  68 };
  69
  70
  71 typedef struct
  72 {
  73     int  nPMEnodes;       /* number of PME only nodes used in this test */
  74     int  nx, ny, nz;      /* DD grid */
  75     int  guessPME;        /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  76     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  77     double Gcycles_Av;
  78     float *ns_per_day;
  79     float ns_per_day_Av;
  80     float *PME_f_load;    /* PME mesh/force load average*/
  81     float PME_f_load_Av;  /* Average average ;) ... */
  82     char *mdrun_cmd_line; /* Mdrun command line used for this test */
  83 } t_perf;
  84
  85
  86 typedef struct
  87 {
  88     int  nr_inputfiles;         /* The number of tpr and mdp input files */
  89     gmx_large_int_t orig_sim_steps;  /* Number of steps to be done in the real simulation */
  90     real *r_coulomb;            /* The coulomb radii [0...nr_inputfiles] */
  91     real *r_vdw;                /* The vdW radii */
  92     real *rlist;                /* Neighbourlist cutoff radius */
  93     real *rlistlong;
  94     int  *fourier_nx, *fourier_ny, *fourier_nz;
  95     real *fourier_sp;           /* Fourierspacing */
  96
  97     /* Original values as in inputfile: */
  98     real orig_rcoulomb;
  99     real orig_rvdw;
 100     real orig_rlist, orig_rlistlong;
 101     int  orig_nk[DIM];
 102     real orig_fs[DIM];
 103 } t_inputinfo;
 104
 105
 106 static void sep_line(FILE *fp)
 107 {
 108     fprintf(fp, "\n------------------------------------------------------------\n");
 109 }
 110
 111
 112 /* Wrapper for system calls */
 113 static int gmx_system_call(char *command)
 114 {
 115 #ifdef GMX_NO_SYSTEM
 116     gmx_fatal(FARGS,"No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n",command);
 117 #else
 118     return ( system(command) );
 119 #endif
 120 }
 121
 122
 123 /* Check if string starts with substring */
 124 static bool str_starts(const char *string, const char *substring)
 125 {
 126     return ( strncmp(string, substring, strlen(substring)) == 0);
 127 }
 128
 129
 130 static void cleandata(t_perf *perfdata, int test_nr)
 131 {
 132     perfdata->Gcycles[test_nr]    = 0.0;
 133     perfdata->ns_per_day[test_nr] = 0.0;
 134     perfdata->PME_f_load[test_nr] = 0.0;
 135
 136     return;
 137 }
 138
 139
 140 static bool is_equal(real a, real b)
 141 {
 142     real diff, eps=1.0e-6;
 143
 144
 145     diff = a - b;
 146
 147     if (diff < 0.0) diff = -diff;
 148
 149     if (diff < eps)
 150         return TRUE;
 151     else
 152         return FALSE;
 153 }
 154
 155
 156 static void finalize(const char *fn_out)
 157 {
 158     char buf[STRLEN];
 159     FILE *fp;
 160
 161
 162     fp = fopen(fn_out,"r");
 163     fprintf(stdout,"\n\n");
 164
 165     while( fgets(buf,STRLEN-1,fp) != NULL )
 166     {
 167         fprintf(stdout,"%s",buf);
 168     }
 169     fclose(fp);
 170     fprintf(stdout,"\n\n");
 171     thanx(stderr);
 172 }
 173
 174
 175 enum {eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr};
 176
 177 static int parse_logfile(const char *logfile, const char *errfile,
 178         t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 179         int nnodes)
 180 {
 181     FILE  *fp;
 182     char  line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 183     const char matchstrdd[]="Domain decomposition grid";
 184     const char matchstrcr[]="resetting all time and cycle counters";
 185     const char matchstrbal[]="Average PME mesh/force load:";
 186     const char matchstring[]="R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 187     const char errSIG[]="signal, stopping at the next";
 188     int   iFound;
 189     int   procs;
 190     float  dum1,dum2,dum3;
 191     int   npme;
 192     gmx_large_int_t resetsteps=-1;
 193     bool  bFoundResetStr = FALSE;
 194     bool  bResetChecked  = FALSE;
 195
 196
 197     if (!gmx_fexist(logfile))
 198     {
 199         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 200         cleandata(perfdata, test_nr);
 201         return eParselogNotFound;
 202     }
 203
 204     fp = fopen(logfile, "r");
 205     perfdata->PME_f_load[test_nr] = -1.0;
 206     perfdata->guessPME            = -1;
 207
 208     iFound = eFoundNothing;
 209     if (1 == nnodes)
 210         iFound = eFoundDDStr; /* Skip some case statements */
 211
 212     while (fgets(line, STRLEN, fp) != NULL)
 213     {
 214         /* Remove leading spaces */
 215         ltrim(line);
 216
 217         /* Check for TERM and INT signals from user: */
 218         if ( strstr(line, errSIG) != NULL )
 219         {
 220             fclose(fp);
 221             cleandata(perfdata, test_nr);
 222             return eParselogTerm;
 223         }
 224
 225         /* Check whether cycle resetting  worked */
 226         if (presteps > 0 && !bFoundResetStr)
 227         {
 228             if (strstr(line, matchstrcr) != NULL)
 229             {
 230                 sprintf(dumstring, "Step %s", gmx_large_int_pfmt);
 231                 sscanf(line, dumstring, &resetsteps);
 232                 bFoundResetStr = TRUE;
 233                 if (resetsteps == presteps+cpt_steps)
 234                 {
 235                     bResetChecked = TRUE;
 236                 }
 237                 else
 238                 {
 239                     sprintf(dumstring , gmx_large_int_pfmt, resetsteps);
 240                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 241                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 242                                     "         though they were supposed to be reset at step %s!\n",
 243                             dumstring, dumstring2);
 244                 }
 245             }
 246         }
 247
 248         /* Look for strings that appear in a certain order in the log file: */
 249         switch(iFound)
 250         {
 251             case eFoundNothing:
 252                 /* Look for domain decomp grid and separate PME nodes: */
 253                 if (str_starts(line, matchstrdd))
 254                 {
 255                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 256                             &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 257                     if (perfdata->nPMEnodes == -1)
 258                         perfdata->guessPME = npme;
 259                     else if (perfdata->nPMEnodes != npme)
 260                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 261                     iFound = eFoundDDStr;
 262                 }
 263                 /* Catch a few errors that might have occured: */
 264                 else if (str_starts(line, "There is no domain decomposition for"))
 265                 {
 266                     return eParselogNoDDGrid;
 267                 }
 268                 else if (str_starts(line, "reading tpx file"))
 269                 {
 270                     return eParselogTPXVersion;
 271                 }
 272                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 273                 {
 274                     return eParselogNotParallel;
 275                 }
 276                 break;
 277             case eFoundDDStr:
 278                 /* Look for PME mesh/force balance (not necessarily present, though) */
 279                 if (str_starts(line, matchstrbal))
 280                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 281                 /* Look for matchstring */
 282                 if (str_starts(line, matchstring))
 283                     iFound = eFoundAccountingStr;
 284                 break;
 285             case eFoundAccountingStr:
 286                 /* Already found matchstring - look for cycle data */
 287                 if (str_starts(line, "Total  "))
 288                 {
 289                     sscanf(line,"Total %d %lf",&procs,&(perfdata->Gcycles[test_nr]));
 290                     iFound = eFoundCycleStr;
 291                 }
 292                 break;
 293             case eFoundCycleStr:
 294                 /* Already found cycle data - look for remaining performance info and return */
 295                 if (str_starts(line, "Performance:"))
 296                 {
 297                     sscanf(line,"%s %f %f %f %f", dumstring, &dum1, &dum2, &(perfdata->ns_per_day[test_nr]), &dum3);
 298                     fclose(fp);
 299                     if (bResetChecked || presteps == 0)
 300                         return eParselogOK;
 301                     else
 302                         return eParselogResetProblem;
 303                 }
 304                 break;
 305         }
 306     } /* while */
 307
 308     /* Check why there is no performance data in the log file.
 309      * Did a fatal errors occur? */
 310     if (gmx_fexist(errfile))
 311     {
 312         fp = fopen(errfile, "r");
 313         while (fgets(line, STRLEN, fp) != NULL)
 314         {
 315             if ( str_starts(line, "Fatal error:") )
 316             {
 317                 if (fgets(line, STRLEN, fp) != NULL)
 318                     fprintf(stderr, "\nWARNING: A fatal error has occured during this benchmark:\n"
 319                                     "%s\n", line);
 320                 fclose(fp);
 321                 cleandata(perfdata, test_nr);
 322                 return eParselogFatal;
 323             }
 324         }
 325         fclose(fp);
 326     }
 327     else
 328     {
 329         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 330     }
 331
 332     /* Giving up ... we could not find out why there is no performance data in
 333      * the log file. */
 334     fprintf(stdout, "No performance data in log file.\n");
 335     fclose(fp);
 336     cleandata(perfdata, test_nr);
 337
 338     return eParselogNoPerfData;
 339 }
 340
 341
 342 static bool analyze_data(
 343         FILE        *fp,
 344         const char  *fn,
 345         t_perf      **perfdata,
 346         int         nnodes,
 347         int         ntprs,
 348         int         ntests,
 349         int         nrepeats,
 350         t_inputinfo *info,
 351         int         *index_tpr,    /* OUT: Nr of mdp file with best settings */
 352         int         *npme_optimal) /* OUT: Optimal number of PME nodes */
 353 {
 354     int  i,j,k;
 355     int line=0, line_win=-1;
 356     int  k_win=-1, i_win=-1, winPME;
 357     double s=0.0;  /* standard deviation */
 358     t_perf *pd;
 359     char strbuf[STRLEN];
 360     char str_PME_f_load[13];
 361     bool bCanUseOrigTPR;
 362
 363
 364     if (nrepeats > 1)
 365     {
 366         sep_line(fp);
 367         fprintf(fp, "Summary of successful runs:\n");
 368         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 369         if (nnodes > 1)
 370             fprintf(fp, "    DD grid");
 371         fprintf(fp, "\n");
 372     }
 373
 374
 375     for (k=0; k<ntprs; k++)
 376     {
 377         for (i=0; i<ntests; i++)
 378         {
 379             /* Select the right dataset: */
 380             pd = &(perfdata[k][i]);
 381
 382             pd->Gcycles_Av    = 0.0;
 383             pd->PME_f_load_Av = 0.0;
 384             pd->ns_per_day_Av = 0.0;
 385
 386             if (pd->nPMEnodes == -1)
 387                 sprintf(strbuf, "(%3d)", pd->guessPME);
 388             else
 389                 sprintf(strbuf, "     ");
 390
 391             /* Get the average run time of a setting */
 392             for (j=0; j<nrepeats; j++)
 393             {
 394                 pd->Gcycles_Av    += pd->Gcycles[j];
 395                 pd->PME_f_load_Av += pd->PME_f_load[j];
 396             }
 397             pd->Gcycles_Av    /= nrepeats;
 398             pd->PME_f_load_Av /= nrepeats;
 399
 400             for (j=0; j<nrepeats; j++)
 401             {
 402                 if (pd->ns_per_day[j] > 0.0)
 403                     pd->ns_per_day_Av += pd->ns_per_day[j];
 404                 else
 405                 {
 406                     /* Somehow the performance number was not aquired for this run,
 407                      * therefor set the average to some negative value: */
 408                     pd->ns_per_day_Av = -1.0f*nrepeats;
 409                     break;
 410                 }
 411             }
 412             pd->ns_per_day_Av /= nrepeats;
 413
 414             /* Nicer output: */
 415             if (pd->PME_f_load_Av > 0.0)
 416                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 417             else
 418                 sprintf(str_PME_f_load, "%s", "         -  ");
 419
 420
 421             /* We assume we had a successful run if both averages are positive */
 422             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 423             {
 424                 /* Output statistics if repeats were done */
 425                 if (nrepeats > 1)
 426                 {
 427                     /* Calculate the standard deviation */
 428                     s = 0.0;
 429                     for (j=0; j<nrepeats; j++)
 430                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 431                     s /= (nrepeats - 1);
 432                     s = sqrt(s);
 433
 434                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 435                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 436                             pd->ns_per_day_Av, str_PME_f_load);
 437                     if (nnodes > 1)
 438                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 439                     fprintf(fp, "\n");
 440                 }
 441                 /* Store the index of the best run found so far in 'winner': */
 442                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 443                 {
 444                     k_win = k;
 445                     i_win = i;
 446                     line_win = line;
 447                 }
 448                 line++;
 449             }
 450         }
 451     }
 452
 453     if (k_win == -1)
 454         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 455
 456     sep_line(fp);
 457
 458     winPME = perfdata[k_win][i_win].nPMEnodes;
 459     if (winPME == -1)
 460         sprintf(strbuf, "%s", "the automatic number of");
 461     else
 462         sprintf(strbuf, "%d", winPME);
 463     fprintf(fp, "Best performance was achieved with %s PME nodes", strbuf);
 464     if (nrepeats > 1)
 465         fprintf(fp, " (see line %d)", line_win);
 466     fprintf(fp, "\n");
 467
 468     /* Only mention settings if they were modified: */
 469     bCanUseOrigTPR = TRUE;
 470     if ( !is_equal(info->r_coulomb[k_win], info->orig_rcoulomb) )
 471     {
 472         fprintf(fp, "Optimized PME settings:\n"
 473                     "   New Coulomb radius: %f nm (was %f nm)\n",
 474                     info->r_coulomb[k_win], info->orig_rcoulomb);
 475         bCanUseOrigTPR = FALSE;
 476     }
 477
 478     if ( !is_equal(info->r_vdw[k_win], info->orig_rvdw) )
 479     {
 480         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n",
 481                 info->r_vdw[k_win], info->orig_rvdw);
 482         bCanUseOrigTPR = FALSE;
 483     }
 484
 485     if ( ! (info->fourier_nx[k_win]==info->orig_nk[XX] &&
 486             info->fourier_ny[k_win]==info->orig_nk[YY] &&
 487             info->fourier_nz[k_win]==info->orig_nk[ZZ] ) )
 488     {
 489         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n",
 490                 info->fourier_nx[k_win], info->fourier_ny[k_win], info->fourier_nz[k_win],
 491                 info->orig_nk[XX], info->orig_nk[YY], info->orig_nk[ZZ]);
 492         bCanUseOrigTPR = FALSE;
 493     }
 494     if (bCanUseOrigTPR && ntprs > 1)
 495         fprintf(fp, "and original PME settings.\n");
 496
 497     fflush(fp);
 498
 499     /* Return the index of the mdp file that showed the highest performance
 500      * and the optimal number of PME nodes */
 501     *index_tpr    = k_win;
 502     *npme_optimal = winPME;
 503
 504     return bCanUseOrigTPR;
 505 }
 506
 507
 508 /* Get the commands we need to set up the runs from environment variables */
 509 static void get_program_paths(bool bThreads, char *cmd_mpirun[], char cmd_np[],
 510                               char *cmd_mdrun[], int repeats)
 511 {
 512     char *command=NULL;
 513     char *cp;
 514     char *cp2;
 515     char line[STRLEN];
 516     FILE *fp;
 517     const char def_mpirun[] = "mpirun";
 518     const char def_mdrun[]  = "mdrun";
 519     const char filename[]   = "benchtest.log";
 520     const char match_mpi[]  = "NNODES=";
 521     const char match_mdrun[]= "Program: ";
 522     const char empty_mpirun[] = "";
 523     bool  bMdrun = FALSE;
 524     bool  bMPI   = FALSE;
 525
 526
 527     /* Get the commands we need to set up the runs from environment variables */
 528     if (!bThreads)
 529     {
 530         if ( (cp = getenv("MPIRUN")) != NULL)
 531             *cmd_mpirun = strdup(cp);
 532         else
 533             *cmd_mpirun = strdup(def_mpirun);
 534     }
 535     else
 536     {
 537         *cmd_mpirun = strdup(empty_mpirun);
 538     }
 539
 540     if ( (cp = getenv("MDRUN" )) != NULL )
 541         *cmd_mdrun  = strdup(cp);
 542     else
 543         *cmd_mdrun  = strdup(def_mdrun);
 544
 545
 546     /* If no simulations have to be performed, we are done here */
 547     if (repeats <= 0)
 548         return;
 549
 550     /* Run a small test to see whether mpirun + mdrun work  */
 551     fprintf(stdout, "Making sure that mdrun can be executed. ");
 552     if (bThreads)
 553     {
 554         snew(command, strlen(*cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 555         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", *cmd_mdrun, cmd_np, filename);
 556     }
 557     else
 558     {
 559         snew(command, strlen(*cmd_mpirun) + strlen(cmd_np) + strlen(*cmd_mdrun) + strlen(filename) + 50);
 560         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", *cmd_mpirun, cmd_np, *cmd_mdrun, filename);
 561     }
 562     fprintf(stdout, "Trying '%s' ... ", command);
 563     make_backup(filename);
 564     gmx_system_call(command);
 565
 566     /* Check if we find the characteristic string in the output: */
 567     if (!gmx_fexist(filename))
 568         gmx_fatal(FARGS, "Output from test run could not be found.");
 569
 570     fp = fopen(filename, "r");
 571     /* We need to scan the whole output file, since sometimes the queuing system
 572      * also writes stuff to stdout/err */
 573     while ( !feof(fp) )
 574     {
 575         cp2=fgets(line, STRLEN, fp);
 576         if (cp2!=NULL)
 577         {
 578             if ( str_starts(line, match_mdrun) )
 579                 bMdrun = TRUE;
 580             if ( str_starts(line, match_mpi) )
 581                 bMPI = TRUE;
 582         }
 583     }
 584     fclose(fp);
 585
 586     if (bThreads)
 587     {
 588         if (bMPI)
 589         {
 590             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 591                     "(%s)\n"
 592                     "seems to have been compiled with MPI instead.",
 593                     *cmd_mdrun);
 594         }
 595     }
 596     else
 597     {
 598         if (bMdrun && !bMPI)
 599         {
 600             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 601                     "(%s)\n"
 602                     "seems to have been compiled without MPI support.",
 603                     *cmd_mdrun);
 604         }
 605     }
 606
 607     if (!bMdrun)
 608     {
 609         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 610                 filename);
 611     }
 612
 613     fprintf(stdout, "passed.\n");
 614
 615     /* Clean up ... */
 616     remove(filename);
 617     sfree(command);
 618 }
 619
 620
 621 static void launch_simulation(
 622         bool bLaunch,           /* Should the simulation be launched? */
 623         FILE *fp,               /* General log file */
 624         bool bThreads,          /* whether to use threads */
 625         char *cmd_mpirun,       /* Command for mpirun */
 626         char *cmd_np,           /* Switch for -np or -nt or empty */
 627         char *cmd_mdrun,        /* Command for mdrun */
 628         char *args_for_mdrun,   /* Arguments for mdrun */
 629         const char *simulation_tpr,   /* This tpr will be simulated */
 630         int  nnodes,            /* Number of nodes to run on */
 631         int  nPMEnodes)         /* Number of PME nodes to use */
 632 {
 633     char  *command;
 634
 635
 636     /* Make enough space for the system call command,
 637      * (100 extra chars for -npme ... etc. options should suffice): */
 638     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 639
 640     /* Note that the -passall options requires args_for_mdrun to be at the end
 641      * of the command line string */
 642     if (bThreads)
 643     {
 644         sprintf(command, "%s%s-npme %d -s %s %s",
 645                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 646     }
 647     else
 648     {
 649         sprintf(command, "%s%s%s -npme %d -s %s %s",
 650                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 651     }
 652
 653     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch? "Using":"Please use", command);
 654     sep_line(fp);
 655     fflush(fp);
 656
 657     /* Now the real thing! */
 658     if (bLaunch)
 659     {
 660         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 661         sep_line(stdout);
 662         fflush(stdout);
 663         gmx_system_call(command);
 664         thanx(fp);
 665     }
 666 }
 667
 668
 669 static void modify_PMEsettings(
 670         gmx_large_int_t simsteps,  /* Set this value as number of time steps */
 671         const char *fn_best_tpr,   /* tpr file with the best performance */
 672         const char *fn_sim_tpr)    /* name of tpr file to be launched */
 673 {
 674     t_inputrec   *ir;
 675     t_state      state;
 676     gmx_mtop_t   mtop;
 677     char         buf[200];
 678
 679     snew(ir,1);
 680     read_tpx_state(fn_best_tpr,ir,&state,NULL,&mtop);
 681
 682     /* Set nsteps to the right value */
 683     ir->nsteps = simsteps;
 684
 685     /* Write the tpr file which will be launched */
 686     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 687     fprintf(stdout,buf,ir->nsteps);
 688     fflush(stdout);
 689     write_tpx_state(fn_sim_tpr,ir,&state,&mtop);
 690
 691     sfree(ir);
 692 }
 693
 694
 695 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 696
 697 /* Make additional TPR files with more computational load for the
 698  * direct space processors: */
 699 static void make_benchmark_tprs(
 700         const char *fn_sim_tpr,       /* READ : User-provided tpr file */
 701         char *fn_bench_tprs[],  /* WRITE: Names of benchmark tpr files */
 702         gmx_large_int_t benchsteps,  /* Number of time steps for benchmark runs */
 703         gmx_large_int_t statesteps,  /* Step counter in checkpoint file */
 704         real upfac,             /* Scale rcoulomb inbetween downfac and upfac */
 705         real downfac,
 706         int ntprs,              /* No. of TPRs to write, each with a different rcoulomb and fourierspacing */
 707         real fourierspacing,    /* Basic fourierspacing from tpr input file */
 708         t_inputinfo *info,      /* Contains information about mdp file options */
 709         FILE *fp)               /* Write the output here */
 710 {
 711     int          i,j,d;
 712     t_inputrec   *ir;
 713     t_state      state;
 714     gmx_mtop_t   mtop;
 715     real         fac;
 716     real         nlist_buffer; /* Thickness of the buffer regions for PME-switch potentials: */
 717     char         buf[200];
 718     rvec         box_size;
 719     bool         bNote = FALSE;
 720
 721
 722     sprintf(buf, "Making benchmark tpr file%s with %s time steps", ntprs>1? "s":"", gmx_large_int_pfmt);
 723     fprintf(stdout, buf, benchsteps);
 724     if (statesteps > 0)
 725     {
 726         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 727         fprintf(stdout, buf, statesteps);
 728         benchsteps += statesteps;
 729     }
 730     fprintf(stdout, ".\n");
 731
 732
 733     snew(ir,1);
 734     read_tpx_state(fn_sim_tpr,ir,&state,NULL,&mtop);
 735
 736     /* Check if some kind of PME was chosen */
 737     if (EEL_PME(ir->coulombtype) == FALSE)
 738         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 739                 EELTYPE(eelPME));
 740
 741     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 742     if (  (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist) )
 743     {
 744         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 745                 EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 746     }
 747     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 748     else if (ir->rcoulomb > ir->rlist)
 749     {
 750         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 751                 EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 752     }
 753
 754     /* Reduce the number of steps for the benchmarks */
 755     info->orig_sim_steps = ir->nsteps;
 756     ir->nsteps           = benchsteps;
 757
 758     /* Determine length of triclinic box vectors */
 759     for(d=0; d<DIM; d++)
 760     {
 761         box_size[d] = 0;
 762         for(i=0;i<DIM;i++)
 763             box_size[d] += state.box[d][i]*state.box[d][i];
 764         box_size[d] = sqrt(box_size[d]);
 765     }
 766
 767     /* Remember the original values: */
 768     info->orig_rvdw            = ir->rvdw;
 769     info->orig_rcoulomb        = ir->rcoulomb;
 770     info->orig_rlist           = ir->rlist;
 771     info->orig_rlistlong       = ir->rlistlong;
 772     info->orig_nk[XX]          = ir->nkx;
 773     info->orig_nk[YY]          = ir->nky;
 774     info->orig_nk[ZZ]          = ir->nkz;
 775     info->orig_fs[XX]          = box_size[XX]/ir->nkx;  /* fourierspacing in x direction */
 776     info->orig_fs[YY]          = box_size[YY]/ir->nky;
 777     info->orig_fs[ZZ]          = box_size[ZZ]/ir->nkz;
 778
 779     /* For PME-switch potentials, keep the radial distance of the buffer region */
 780     nlist_buffer   = info->orig_rlist    - info->orig_rcoulomb;
 781
 782     /* Print information about settings of which some are potentially modified: */
 783     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 784     fprintf(fp, "   Fourier nkx nky nkz  : %d %d %d\n",
 785             info->orig_nk[XX], info->orig_nk[YY], info->orig_nk[ZZ]);
 786     fprintf(fp, "   rcoulomb             : %f nm\n", info->orig_rcoulomb);
 787     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 788     fprintf(fp, "   rvdw                 : %f nm\n", info->orig_rvdw);
 789     if (EVDW_SWITCHED(ir->vdwtype))
 790         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 791     if (EPME_SWITCHED(ir->coulombtype))
 792         fprintf(fp, "   rlist                : %f nm\n", info->orig_rlist);
 793     if (info->orig_rlistlong != max_cutoff(ir->rvdw,ir->rcoulomb))
 794         fprintf(fp, "   rlistlong            : %f nm\n", info->orig_rlistlong);
 795
 796     /* Print a descriptive line about the tpr settings tested */
 797     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 798     fprintf(fp, " No.   scaling  rcoulomb");
 799     fprintf(fp, "  nkx  nky  nkz");
 800     if (fourierspacing > 0)
 801         fprintf(fp, "   spacing");
 802     if (evdwCUT == ir->vdwtype)
 803         fprintf(fp, "      rvdw");
 804     if (EPME_SWITCHED(ir->coulombtype))
 805         fprintf(fp, "     rlist");
 806     if ( info->orig_rlistlong != max_cutoff(info->orig_rlist,max_cutoff(info->orig_rvdw,info->orig_rcoulomb)) )
 807         fprintf(fp, " rlistlong");
 808     fprintf(fp, "  tpr file\n");
 809
 810     if (ntprs > 1)
 811     {
 812         fprintf(stdout, "Calculating PME grid points on the basis of ");
 813         if (fourierspacing > 0)
 814             fprintf(stdout, "a fourierspacing of %f nm\n", fourierspacing);
 815         else
 816             fprintf(stdout, "original nkx/nky/nkz settings from tpr file\n");
 817     }
 818
 819     /* Loop to create the requested number of tpr input files */
 820     for (j = 0; j < ntprs; j++)
 821     {
 822         /* Rcoulomb scaling factor for this file: */
 823         if (ntprs == 1)
 824             fac = downfac;
 825          else
 826             fac = (upfac-downfac)/(ntprs-1) * j + downfac;
 827         fprintf(stdout, "--- Scaling factor %f ---\n", fac);
 828
 829         /* Scale the Coulomb radius */
 830         ir->rcoulomb = info->orig_rcoulomb*fac;
 831
        /* Adjust other radii since various conditions need to be fulfilled */
 833         if (eelPME == ir->coulombtype)
 834         {
 835             /* plain PME, rcoulomb must be equal to rlist */
 836             ir->rlist = ir->rcoulomb;
 837         }
 838         else
 839         {
 840             /* rlist must be >= rcoulomb, we keep the size of the buffer region */
 841             ir->rlist = ir->rcoulomb + nlist_buffer;
 842         }
 843
 844         if (evdwCUT == ir->vdwtype)
 845         {
 846             /* For vdw cutoff, rvdw >= rlist */
 847             ir->rvdw = max(info->orig_rvdw, ir->rlist);
 848         }
 849
 850         ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 851
 852         /* Try to reduce the number of reciprocal grid points in a smart way */
 853         /* Did the user supply a value for fourierspacing on the command line? */
 854         if (fourierspacing > 0)
 855         {
 856             info->fourier_sp[j] = fourierspacing*fac;
 857             /* Calculate the optimal grid dimensions */
 858             ir->nkx = 0;
 859             ir->nky = 0;
 860             ir->nkz = 0;
 861             calc_grid(stdout,state.box,info->fourier_sp[j],&(ir->nkx),&(ir->nky),&(ir->nkz),1);
 862             /* Check consistency */
 863             if (0 == j)
 864                 if ((ir->nkx != info->orig_nk[XX]) || (ir->nky != info->orig_nk[YY]) || (ir->nkz != info->orig_nk[ZZ]))
 865                 {
 866                     fprintf(stderr, "WARNING: Original grid was %dx%dx%d. The fourierspacing of %f nm does not reproduce the grid\n"
 867                                     "         found in the tpr input file! Will use the new settings.\n",
 868                                     info->orig_nk[XX],info->orig_nk[YY],info->orig_nk[ZZ],fourierspacing);
 869                     bNote = TRUE;
 870                 }
 871         }
 872         else
 873         {
 874             if (0 == j)
 875             {
 876                 /* Print out fourierspacing from input tpr */
 877                 fprintf(stdout, "Input file fourier grid is %dx%dx%d\n",
 878                         info->orig_nk[XX], info->orig_nk[YY], info->orig_nk[ZZ]);
 879             }
 880             /* Reconstruct fourierspacing for each dimension from the input file */
 881             ir->nkx=0;
 882             calc_grid(stdout,state.box,info->orig_fs[XX]*fac,&(ir->nkx),&(ir->nky),&(ir->nkz),1);
 883             ir->nky=0;
 884             calc_grid(stdout,state.box,info->orig_fs[YY]*fac,&(ir->nkx),&(ir->nky),&(ir->nkz),1);
 885             ir->nkz=0;
 886             calc_grid(stdout,state.box,info->orig_fs[ZZ]*fac,&(ir->nkx),&(ir->nky),&(ir->nkz),1);
 887         }
 888
 889         /* Save modified radii and fourier grid components for later output: */
 890         info->r_coulomb[j]        = ir->rcoulomb;
 891         info->r_vdw[j]            = ir->rvdw;
 892         info->fourier_nx[j]       = ir->nkx;
 893         info->fourier_ny[j]       = ir->nky;
 894         info->fourier_nz[j]       = ir->nkz;
 895         info->rlist[j]            = ir->rlist;
 896         info->rlistlong[j]        = ir->rlistlong;
 897
 898         /* Write the benchmark tpr file */
 899         strncpy(fn_bench_tprs[j],fn_sim_tpr,strlen(fn_sim_tpr)-strlen(".tpr"));
 900         sprintf(buf, "_bench%.2d.tpr", j);
 901         strcat(fn_bench_tprs[j], buf);
 902         fprintf(stdout,"Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
 903         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
 904         fprintf(stdout,", scaling factor %f\n", fac);
 905         write_tpx_state(fn_bench_tprs[j],ir,&state,&mtop);
 906
 907         /* Write information about modified tpr settings to log file */
 908         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
 909         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
 910         if (fourierspacing > 0)
 911             fprintf(fp, "%9f ", info->fourier_sp[j]);
 912         if (evdwCUT == ir->vdwtype)
 913             fprintf(fp, "%10f", ir->rvdw);
 914         if (EPME_SWITCHED(ir->coulombtype))
 915             fprintf(fp, "%10f", ir->rlist);
 916         if ( info->orig_rlistlong != max_cutoff(info->orig_rlist,max_cutoff(info->orig_rvdw,info->orig_rcoulomb)) )
 917             fprintf(fp, "%10f", ir->rlistlong);
 918         fprintf(fp, "  %-14s\n",fn_bench_tprs[j]);
 919
 920         /* Make it clear to the user that some additional settings were modified */
 921         if (   !is_equal(ir->rvdw           , info->orig_rvdw)
 922             || !is_equal(ir->rlistlong      , info->orig_rlistlong) )
 923         {
 924             bNote = TRUE;
 925         }
 926     }
 927     if (bNote)
 928         fprintf(fp, "\nNote that in addition to rcoulomb and the fourier grid\n"
 929                     "also other input settings were changed (see table above).\n"
 930                     "Please check if the modified settings are appropriate.\n");
 931     fflush(stdout);
 932     fflush(fp);
 933     sfree(ir);
 934 }
 935
 936
 937 /* Whether these files are written depends on tpr (or mdp) settings,
 938  * not on mdrun command line options! */
 939 static bool tpr_triggers_file(const char *opt)
 940 {
 941     if ( (0 == strcmp(opt, "-pf"))
 942       || (0 == strcmp(opt, "-px")) )
 943         return TRUE;
 944     else
 945         return FALSE;
 946 }
 947
 948
 949 /* Rename the files we want to keep to some meaningful filename and
 950  * delete the rest */
 951 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
 952                     int nPMEnodes, int nr, bool bKeepStderr)
 953 {
 954     char numstring[STRLEN];
 955     char newfilename[STRLEN];
 956     const char *fn=NULL;
 957     int i;
 958     const char *opt;
 959
 960
 961     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
 962
 963     for (i=0; i<nfile; i++)
 964     {
 965         opt = (char *)fnm[i].opt;
 966         if ( strcmp(opt, "-p") == 0 )
 967         {
 968             /* do nothing; keep this file */
 969             ;
 970         }
 971         else if (strcmp(opt, "-bg") == 0)
 972         {
 973             /* Give the log file a nice name so one can later see which parameters were used */
 974             numstring[0] = '\0';
 975             if (nr > 0)
 976                 sprintf(numstring, "_%d", nr);
 977             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg",nfile,fnm), k, nnodes, nPMEnodes, numstring);
 978             if (gmx_fexist(opt2fn("-bg",nfile,fnm)))
 979             {
 980                 fprintf(stdout, "renaming log file to %s\n", newfilename);
 981                 make_backup(newfilename);
 982                 rename(opt2fn("-bg",nfile,fnm), newfilename);
 983             }
 984         }
 985         else if (strcmp(opt, "-err") == 0)
 986         {
 987             /* This file contains the output of stderr. We want to keep it in
 988              * cases where there have been problems. */
 989             fn = opt2fn(opt, nfile, fnm);
 990             numstring[0] = '\0';
 991             if (nr > 0)
 992                 sprintf(numstring, "_%d", nr);
 993             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
 994             if (gmx_fexist(fn))
 995             {
 996                 if (bKeepStderr)
 997                 {
 998                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
 999                     make_backup(newfilename);
1000                     rename(fn, newfilename);
1001                 }
1002                 else
1003                 {
1004                     fprintf(stdout, "Deleting %s\n", fn);
1005                     remove(fn);
1006                 }
1007             }
1008         }
1009         /* Delete the files which are created for each benchmark run: (options -b*) */
1010         else if ( ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt,nfile,fnm) || !is_optional(&fnm[i])) )
1011                   || tpr_triggers_file(opt) )
1012         {
1013             fn = opt2fn(opt, nfile, fnm);
1014             if (gmx_fexist(fn))
1015             {
1016                 fprintf(stdout, "Deleting %s\n", fn);
1017                 remove(fn);
1018             }
1019         }
1020     }
1021 }
1022
1023
/* Returns the largest common factor of n1 and n2.
 *
 * Euclid's algorithm is used, so the number of iterations grows only
 * logarithmically with the size of the arguments.
 * If one of the arguments is zero or negative, 0 is returned to signal
 * that no common factor was determined. */
static int largest_common_factor(int n1, int n2)
{
    int remainder;

    if (n1 <= 0 || n2 <= 0)
    {
        return 0;
    }

    /* Replace (n1,n2) by (n2, n1 mod n2) until nothing is left over */
    while (n2 != 0)
    {
        remainder = n1 % n2;
        n1        = n2;
        n2        = remainder;
    }

    return n1;
}
1039
/* Ways of choosing the -npme values that get benchmarked */
enum {
    eNpmeAuto,     /* first entry; presumably "decide from the node count" - TODO confirm */
    eNpmeAll,      /* accept any common factor of npp and npme (min. factor 1)  */
    eNpmeReduced,  /* require a common factor of at least 2                      */
    eNpmeSubset,   /* require a common factor of at least the cube root of npp  */
    eNpmeNr        /* last entry, equals the number of entries above             */
};
1041
1042 /* Create a list of numbers of PME nodes to test */
1043 static void make_npme_list(
1044         const char *npmevalues_opt,  /* Make a complete list with all
1045                            * possibilities or a short list that keeps only
1046                            * reasonable numbers of PME nodes                  */
1047         int *nentries,    /* Number of entries we put in the nPMEnodes list   */
1048         int *nPMEnodes[], /* Each entry contains the value for -npme          */
1049         int nnodes,       /* Total number of nodes to do the tests on         */
1050         int minPMEnodes,  /* Minimum number of PME nodes                      */
1051         int maxPMEnodes)  /* Maximum number of PME nodes                      */
1052 {
1053     int i,npme,npp;
1054     int min_factor=1;     /* We request that npp and npme have this minimal
1055                            * largest common factor (depends on npp)           */
1056     int nlistmax;         /* Max. list size                                   */
1057     int nlist;            /* Actual number of entries in list                 */
1058     int eNPME;
1059
1060
1061     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1062     if ( 0 == strcmp(npmevalues_opt, "all") )
1063     {
1064         eNPME = eNpmeAll;
1065     }
1066     else if ( 0 == strcmp(npmevalues_opt, "subset") )
1067     {
1068         eNPME = eNpmeSubset;
1069     }
1070     else if ( 0 == strcmp(npmevalues_opt, "auto") )
1071     {
1072         if (nnodes <= 64)
1073             eNPME = eNpmeAll;
1074         else if (nnodes < 128)
1075             eNPME = eNpmeReduced;
1076         else
1077             eNPME = eNpmeSubset;
1078     }
1079     else
1080     {
1081         gmx_fatal(FARGS, "Unknown option for -npme in make_npme_list");
1082     }
1083
1084     /* Calculate how many entries we could possibly have (in case of -npme all) */
1085     if (nnodes > 2)
1086     {
1087         nlistmax = maxPMEnodes - minPMEnodes + 3;
1088         if (0 == minPMEnodes)
1089             nlistmax--;
1090     }
1091     else
1092         nlistmax = 1;
1093
1094     /* Now make the actual list which is at most of size nlist */
1095     snew(*nPMEnodes, nlistmax);
1096     nlist = 0; /* start counting again, now the real entries in the list */
1097     for (i = 0; i < nlistmax - 2; i++)
1098     {
1099         npme = maxPMEnodes - i;
1100         npp  = nnodes-npme;
1101         switch (eNPME)
1102         {
1103             case eNpmeAll:
1104                 min_factor = 1;
1105                 break;
1106             case eNpmeReduced:
1107                 min_factor = 2;
1108                 break;
1109             case eNpmeSubset:
1110                 /* For 2d PME we want a common largest factor of at least the cube
1111                  * root of the number of PP nodes */
1112                 min_factor = (int) pow(npp, 1.0/3.0);
1113                 break;
1114             default:
1115                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1116                 break;
1117         }
1118         if (largest_common_factor(npp, npme) >= min_factor)
1119         {
1120             (*nPMEnodes)[nlist] = npme;
1121             nlist++;
1122         }
1123     }
1124     /* We always test 0 PME nodes and the automatic number */
1125     *nentries = nlist + 2;
1126     (*nPMEnodes)[nlist  ] =  0;
1127     (*nPMEnodes)[nlist+1] = -1;
1128
1129     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1130     for (i=0; i<*nentries-1; i++)
1131         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1132     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1133 }
1134
1135
1136 /* Allocate memory to store the performance data */
1137 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1138 {
1139     int i, j, k;
1140
1141
1142     for (k=0; k<ntprs; k++)
1143     {
1144         snew(perfdata[k], datasets);
1145         for (i=0; i<datasets; i++)
1146         {
1147             for (j=0; j<repeats; j++)
1148             {
1149                 snew(perfdata[k][i].Gcycles   , repeats);
1150                 snew(perfdata[k][i].ns_per_day, repeats);
1151                 snew(perfdata[k][i].PME_f_load, repeats);
1152             }
1153         }
1154     }
1155 }
1156
1157
1158 static void do_the_tests(
1159         FILE *fp,                   /* General g_tune_pme output file         */
1160         char **tpr_names,           /* Filenames of the input files to test   */
1161         int maxPMEnodes,            /* Max fraction of nodes to use for PME   */
1162         int minPMEnodes,            /* Min fraction of nodes to use for PME   */
1163         const char *npmevalues_opt, /* Which -npme values should be tested    */
1164         t_perf **perfdata,          /* Here the performace data is stored     */
1165         int *pmeentries,            /* Entries in the nPMEnodes list          */
1166         int repeats,                /* Repeat each test this often            */
1167         int nnodes,                 /* Total number of nodes = nPP + nPME     */
1168         int nr_tprs,                /* Total number of tpr files to test      */
1169         bool bThreads,              /* Threads or MPI?                        */
1170         char *cmd_mpirun,           /* mpirun command string                  */
1171         char *cmd_np,               /* "-np", "-n", whatever mpirun needs     */
1172         char *cmd_mdrun,            /* mdrun command string                   */
1173         char *cmd_args_bench,       /* arguments for mdrun in a string        */
1174         const t_filenm *fnm,        /* List of filenames from command line    */
1175         int nfile,                  /* Number of files specified on the cmdl. */
1176         int sim_part,               /* For checkpointing                      */
1177         int presteps,               /* DLB equilibration steps, is checked    */
1178         gmx_large_int_t cpt_steps)  /* Time step counter in the checkpoint    */
1179 {
1180     int     i,nr,k,ret,count=0,totaltests;
1181     int     *nPMEnodes=NULL;
1182     t_perf  *pd=NULL;
1183     int     cmdline_length;
1184     char    *command, *cmd_stub;
1185     char    buf[STRLEN];
1186     bool    bResetProblem=FALSE;
1187
1188
1189     /* This string array corresponds to the eParselog enum type at the start
1190      * of this file */
1191     const char* ParseLog[] = {"OK.",
1192                               "Logfile not found!",
1193                               "No timings, logfile truncated?",
1194                               "Run was terminated.",
1195                               "Counters were not reset properly.",
1196                               "No DD grid found for these settings.",
1197                               "TPX version conflict!",
1198                               "mdrun was not started in parallel!",
1199                               "A fatal error occured!" };
1200     char    str_PME_f_load[13];
1201
1202
1203     /* Allocate space for the mdrun command line. 100 extra characters should
1204        be more than enough for the -npme etcetera arguments */
1205     cmdline_length =  strlen(cmd_mpirun)
1206                     + strlen(cmd_np)
1207                     + strlen(cmd_mdrun)
1208                     + strlen(cmd_args_bench)
1209                     + strlen(tpr_names[0]) + 100;
1210     snew(command , cmdline_length);
1211     snew(cmd_stub, cmdline_length);
1212
1213     /* Construct the part of the command line that stays the same for all tests: */
1214     if (bThreads)
1215     {
1216         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1217     }
1218     else
1219     {
1220         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1221     }
1222
1223     /* Create a list of numbers of PME nodes to test */
1224     make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1225                    nnodes, minPMEnodes, maxPMEnodes);
1226
1227     if (0 == repeats)
1228     {
1229         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1230         fclose(fp);
1231         finalize(opt2fn("-p", nfile, fnm));
1232         exit(0);
1233     }
1234
1235     /* Allocate one dataset for each tpr input file: */
1236     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1237
1238     /*****************************************/
1239     /* Main loop over all tpr files to test: */
1240     /*****************************************/
1241     totaltests = nr_tprs*(*pmeentries)*repeats;
1242     for (k=0; k<nr_tprs;k++)
1243     {
1244         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1245         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1246         /* Loop over various numbers of PME nodes: */
1247         for (i = 0; i < *pmeentries; i++)
1248         {
1249             pd = &perfdata[k][i];
1250
1251             /* Loop over the repeats for each scenario: */
1252             for (nr = 0; nr < repeats; nr++)
1253             {
1254                 pd->nPMEnodes = nPMEnodes[i];
1255
1256                 /* Add -npme and -s to the command line and save it. Note that
1257                  * the -passall (if set) options requires cmd_args_bench to be
1258                  * at the end of the command line string */
1259                 snew(pd->mdrun_cmd_line, cmdline_length);
1260                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1261                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1262
1263                 /* Do a benchmark simulation: */
1264                 if (repeats > 1)
1265                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1266                 else
1267                     buf[0]='\0';
1268                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1269                         (100.0*count)/totaltests,
1270                         k+1, nr_tprs, i+1, *pmeentries, buf);
1271                 make_backup(opt2fn("-err",nfile,fnm));
1272                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err",nfile,fnm));
1273                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1274                 gmx_system_call(command);
1275
1276                 /* Collect the performance data from the log file; also check stderr
1277                  * for fatal errors */
1278                 ret = parse_logfile(opt2fn("-bg",nfile,fnm), opt2fn("-err",nfile,fnm),
1279                         pd, nr, presteps, cpt_steps, nnodes);
1280                 if ((presteps > 0) && (ret == eParselogResetProblem))
1281                     bResetProblem = TRUE;
1282
1283                 if (-1 == pd->nPMEnodes)
1284                     sprintf(buf, "(%3d)", pd->guessPME);
1285                 else
1286                     sprintf(buf, "     ");
1287
1288                 /* Nicer output */
1289                 if (pd->PME_f_load[nr] > 0.0)
1290                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1291                 else
1292                     sprintf(str_PME_f_load, "%s", "         -  ");
1293
1294                 /* Write the data we got to disk */
1295                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1296                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1297                 if (! (ret==eParselogOK || ret==eParselogNoDDGrid || ret==eParselogNotFound) )
1298                     fprintf(fp, " Check %s file for problems.", ret==eParselogFatal? "err":"log");
1299                 fprintf(fp, "\n");
1300                 fflush(fp);
1301                 count++;
1302
1303                 /* Do some cleaning up and delete the files we do not need any more */
1304                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret==eParselogFatal);
1305
1306                 /* If the first run with this number of processors already failed, do not try again: */
1307                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1308                 {
1309                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1310                     count += repeats-(nr+1);
1311                     break;
1312                 }
1313             } /* end of repeats loop */
1314         } /* end of -npme loop */
1315     } /* end of tpr file loop */
1316     if (bResetProblem)
1317     {
1318         sep_line(fp);
1319         fprintf(fp, "WARNING: The cycle and time step counters could not be reset\n"
1320                     "properly. The reason could be that mpirun did not manage to\n"
1321                     "export the environment variable GMX_RESET_COUNTER. You might\n"
1322                     "have to give a special switch to mpirun for that.\n"
1323                     "Alternatively, you can manually set GMX_RESET_COUNTER to the\n"
1324                     "value normally provided by -presteps.");
1325         sep_line(fp);
1326     }
1327     sfree(command);
1328     sfree(cmd_stub);
1329 }
1330
1331
1332 static void check_input(
1333         int nnodes,
1334         int repeats,
1335         int *ntprs,
1336         real *upfac,
1337         real *downfac,
1338         real maxPMEfraction,
1339         real minPMEfraction,
1340         real fourierspacing,
1341         gmx_large_int_t bench_nsteps,
1342         const t_filenm *fnm,
1343         int nfile,
1344         int sim_part,
1345         int presteps,
1346         int npargs,
1347         t_pargs *pa)
1348 {
1349     /* Make sure the input file exists */
1350     if (!gmx_fexist(opt2fn("-s",nfile,fnm)))
1351         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s",nfile,fnm));
1352
1353     /* Make sure that the checkpoint file is not overwritten by the benchmark runs */
1354     if ( (0 == strcmp(opt2fn("-cpi",nfile,fnm), opt2fn("-cpo",nfile,fnm)) ) && (sim_part > 1) )
1355         gmx_fatal(FARGS, "Checkpoint input and output file must not be identical,\nbecause then the input file might change during the benchmarks.");
1356
1357     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1358     if (repeats < 0)
1359         gmx_fatal(FARGS, "Number of repeats < 0!");
1360
1361     /* Check number of nodes */
1362     if (nnodes < 1)
1363         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1364
1365     /* Automatically choose -ntpr if not set */
1366     if (*ntprs < 1)
1367     {
1368         if (nnodes < 16)
1369             *ntprs = 1;
1370         else
1371             *ntprs = 3;
1372         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs==1?"":"s");
1373     }
1374     else
1375     {
1376         if (1 == *ntprs)
1377             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1378     }
1379
1380     if ( is_equal(*downfac,*upfac) && (*ntprs > 1) )
1381     {
1382         fprintf(stderr, "WARNING: Resetting -ntpr to 1 since both scaling factors are the same.\n"
1383                         "Please choose upfac unequal to downfac to test various PME grid settings\n");
1384         *ntprs = 1;
1385     }
1386
1387     /* Check whether max and min fraction are within required values */
1388     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1389         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1390     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1391         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1392     if (maxPMEfraction < minPMEfraction)
1393         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1394
1395     /* Check whether the number of steps - if it was set - has a reasonable value */
1396     if (bench_nsteps < 0)
1397         gmx_fatal(FARGS, "Number of steps must be positive.");
1398
1399     if (bench_nsteps > 10000 || bench_nsteps < 100)
1400     {
1401         fprintf(stderr, "WARNING: steps=");
1402         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1403         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100)? "few" : "many");
1404     }
1405
1406     if (presteps < 0)
1407     {
1408         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1409     }
1410
1411     if (*upfac <= 0.0 || *downfac <= 0.0 || *downfac > *upfac)
1412         gmx_fatal(FARGS, "Both scaling factors must be larger than zero and upper\n"
1413                          "scaling limit (%f) must be larger than lower limit (%f).",
1414                          *upfac, *downfac);
1415
1416     if (*downfac < 0.75 || *upfac > 1.5)
1417         fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1418
1419     if (fourierspacing < 0)
1420         gmx_fatal(FARGS, "Please choose a positive value for fourierspacing.");
1421
1422     /* Make shure that the scaling factor options are compatible with the number of tprs */
1423     if ( (1 == *ntprs) && ( opt2parg_bSet("-upfac",npargs,pa) || opt2parg_bSet("-downfac",npargs,pa) ) )
1424     {
1425         if (opt2parg_bSet("-upfac",npargs,pa) && opt2parg_bSet("-downfac",npargs,pa) && !is_equal(*upfac,*downfac))
1426         {
1427             gmx_fatal(FARGS, "Please specify -ntpr > 1 for both scaling factors to take effect.\n"
1428                              "(upfac=%f, downfac=%f)\n", *upfac, *downfac);
1429         }
1430         if (opt2parg_bSet("-upfac",npargs,pa))
1431             *downfac = *upfac;
1432         if (opt2parg_bSet("-downfac",npargs,pa))
1433             *upfac = *downfac;
1434         if (!is_equal(*upfac, 1.0))
1435         {
1436             fprintf(stderr, "WARNING: Using a scaling factor of %f with -ntpr 1, thus not testing the original tpr settings.\n",
1437                     *upfac);
1438         }
1439     }
1440 }
1441
1442
1443 /* Returns TRUE when "opt" is a switch for g_tune_pme itself */
1444 static bool is_main_switch(char *opt)
1445 {
1446     if ( (0 == strcmp(opt,"-s"        ))
1447       || (0 == strcmp(opt,"-p"        ))
1448       || (0 == strcmp(opt,"-launch"   ))
1449       || (0 == strcmp(opt,"-r"        ))
1450       || (0 == strcmp(opt,"-ntpr"     ))
1451       || (0 == strcmp(opt,"-max"      ))
1452       || (0 == strcmp(opt,"-min"      ))
1453       || (0 == strcmp(opt,"-upfac"    ))
1454       || (0 == strcmp(opt,"-downfac"  ))
1455       || (0 == strcmp(opt,"-four"     ))
1456       || (0 == strcmp(opt,"-steps"    ))
1457       || (0 == strcmp(opt,"-simsteps" ))
1458       || (0 == strcmp(opt,"-resetstep"))
1459       || (0 == strcmp(opt,"-so"       ))
1460       || (0 == strcmp(opt,"-npstring" ))
1461       || (0 == strcmp(opt,"-npme"     ))
1462       || (0 == strcmp(opt,"-passall"  )) )
1463     return TRUE;
1464
1465     return FALSE;
1466 }
1467
1468
1469 /* Returns TRUE when "opt" is needed at launch time */
1470 static bool is_launch_option(char *opt, bool bSet)
1471 {
1472     if (bSet)
1473         return TRUE;
1474     else
1475         return FALSE;
1476 }
1477
1478
1479 /* Returns TRUE when "opt" is needed at launch time */
1480 static bool is_launch_file(char *opt, bool bSet)
1481 {
1482     /* We need all options that were set on the command line
1483      * and that do not start with -b */
1484     if (0 == strncmp(opt,"-b", 2))
1485         return FALSE;
1486
1487     if (bSet)
1488         return TRUE;
1489     else
1490         return FALSE;
1491 }
1492
1493
1494 /* Returns TRUE when "opt" gives an option needed for the benchmarks runs */
1495 static bool is_bench_option(char *opt, bool bSet)
1496 {
1497     /* If option is set, we might need it for the benchmarks.
1498      * This includes -cpi */
1499     if (bSet)
1500     {
1501         if ( (0 == strcmp(opt, "-append" ))
1502           || (0 == strcmp(opt, "-maxh"   ))
1503           || (0 == strcmp(opt, "-deffnm" ))
1504           || (0 == strcmp(opt, "-resethway")) )
1505             return FALSE;
1506         else
1507             return TRUE;
1508     }
1509     else
1510         return FALSE;
1511 }
1512
1513
1514 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1515 static bool is_bench_file(char *opt, bool bSet, bool bOptional, bool bIsOutput)
1516 {
1517     /* All options starting with "-b" are for _b_enchmark files exclusively */
1518     if (0 == strncmp(opt,"-b", 2))
1519     {
1520         if (!bOptional || bSet)
1521             return TRUE;
1522         else
1523             return FALSE;
1524     }
1525     else
1526     {
1527         if (bIsOutput)
1528             return FALSE;
1529         else
1530             if (bSet) /* These are additional input files like -cpi -ei */
1531                 return TRUE;
1532             else
1533                 return FALSE;
1534     }
1535 }
1536
1537
/* Appends 'buf' to the string '*str'. The memory of '*str' is resized with
 * srenew to hold both strings plus the terminating '\0', so '*str' may point
 * to a different address afterwards. */
static void add_to_string(char **str, char *buf)
{
    int newsize;


    newsize  = strlen(*str);
    newsize += strlen(buf);
    newsize += 1;               /* room for the terminating '\0' */

    srenew(*str, newsize);
    strcat(*str, buf);
}
1548
1549
1550 /* Create the command line for the benchmark as well as for the real run */
1551 static void create_command_line_snippets(
1552         bool     bThreads,
1553         int      presteps,
1554         int      nfile,
1555         t_filenm fnm[],
1556         int      npargs,
1557         t_pargs  *pa,
1558         const char *procstring,      /* How to pass the number of processors to $MPIRUN */
1559         char     *cmd_np[],          /* Actual command line snippet, e.g. '-np <N>' */
1560         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1561         char     *cmd_args_launch[], /* command line arguments for simulation run */
1562         char     extra_args[])       /* Add this to the end of the command line */
1563 {
1564     int        i;
1565     char       *opt;
1566     const char *name;
1567     char       *np_or_nt;
1568 #define BUFLENGTH 255
1569     char       buf[BUFLENGTH];
1570     char       strbuf[BUFLENGTH];
1571     char       strbuf2[BUFLENGTH];
1572
1573
1574     if (bThreads)
1575         np_or_nt=strdup("-nt");
1576     else
1577         np_or_nt=strdup("-np");
1578
1579     /* strlen needs at least '\0' as a string: */
1580     snew(*cmd_args_bench ,1);
1581     snew(*cmd_args_launch,1);
1582     *cmd_args_launch[0]='\0';
1583     *cmd_args_bench[0] ='\0';
1584
1585
1586     /*******************************************/
1587     /* 1. Process other command line arguments */
1588     /*******************************************/
1589     for (i=0; i<npargs; i++)
1590     {
1591         /* What command line switch are we currently processing: */
1592         opt = (char *)pa[i].option;
1593         /* Skip options not meant for mdrun */
1594         if (!is_main_switch(opt))
1595         {
1596             /* Print it to a string buffer, strip away trailing whitespaces that pa_val also returns: */
1597             sprintf(strbuf2, "%s", pa_val(&pa[i],buf,BUFLENGTH));
1598             rtrim(strbuf2);
1599             sprintf(strbuf, "%s %s ", opt, strbuf2);
1600             /* We need the -np (or -nt) switch in a separate buffer - whether or not it was set! */
1601             if (0 == strcmp(opt,np_or_nt))
1602             {
1603                 if (strcmp(procstring, "none")==0 && !bThreads)
1604                 {
1605                     /* Omit -np <N> entirely */
1606                     snew(*cmd_np, 2);
1607                     sprintf(*cmd_np, " ");
1608                 }
1609                 else
1610                 {
1611                     /* This is the normal case with -np <N> */
1612                     snew(*cmd_np, strlen(procstring)+strlen(strbuf2)+4);
1613                     sprintf(*cmd_np, " %s %s ", bThreads? "-nt" : procstring, strbuf2);
1614                 }
1615             }
1616             else
1617             {
1618                 if (is_bench_option(opt,pa[i].bSet))
1619                     add_to_string(cmd_args_bench, strbuf);
1620
1621                 if (is_launch_option(opt,pa[i].bSet))
1622                     add_to_string(cmd_args_launch, strbuf);
1623             }
1624         }
1625     }
1626     if (presteps > 0)
1627     {
1628         /* Add equilibration steps to benchmark options */
1629         sprintf(strbuf, "-resetstep %d ", presteps);
1630         add_to_string(cmd_args_bench, strbuf);
1631     }
1632
1633     /********************/
1634     /* 2. Process files */
1635     /********************/
1636     for (i=0; i<nfile; i++)
1637     {
1638         opt  = (char *)fnm[i].opt;
1639         name = opt2fn(opt,nfile,fnm);
1640
1641         /* Strbuf contains the options, now let's sort out where we need that */
1642         sprintf(strbuf, "%s %s ", opt, name);
1643
1644         /* Skip options not meant for mdrun */
1645         if (!is_main_switch(opt))
1646         {
1647
1648             if ( is_bench_file(opt, opt2bSet(opt,nfile,fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1649             {
1650                 /* All options starting with -b* need the 'b' removed,
1651                  * therefore overwrite strbuf */
1652                 if (0 == strncmp(opt, "-b", 2))
1653                     sprintf(strbuf, "-%s %s ", &opt[2], name);
1654
1655                 add_to_string(cmd_args_bench, strbuf);
1656             }
1657
1658             if ( is_launch_file(opt,opt2bSet(opt,nfile,fnm)) )
1659                 add_to_string(cmd_args_launch, strbuf);
1660         }
1661     }
1662
1663     add_to_string(cmd_args_bench , extra_args);
1664     add_to_string(cmd_args_launch, extra_args);
1665 #undef BUFLENGTH
1666 }
1667
1668
1669 /* Set option opt */
1670 static void setopt(const char *opt,int nfile,t_filenm fnm[])
1671 {
1672   int i;
1673
1674   for(i=0; (i<nfile); i++)
1675     if (strcmp(opt,fnm[i].opt)==0)
1676       fnm[i].flag |= ffSET;
1677 }
1678
1679
1680 static void couple_files_options(int nfile, t_filenm fnm[])
1681 {
1682     int i;
1683     bool bSet,bBench;
1684     char *opt;
1685     char buf[20];
1686
1687
1688     for (i=0; i<nfile; i++)
1689     {
1690         opt  = (char *)fnm[i].opt;
1691         bSet = ((fnm[i].flag & ffSET) != 0);
1692         bBench = (0 == strncmp(opt,"-b", 2));
1693
1694         /* Check optional files */
1695         /* If e.g. -eo is set, then -beo also needs to be set */
1696         if (is_optional(&fnm[i]) && bSet && !bBench)
1697         {
1698             sprintf(buf, "-b%s", &opt[1]);
1699             setopt(buf,nfile,fnm);
1700         }
1701         /* If -beo is set, then -eo also needs to be! */
1702         if (is_optional(&fnm[i]) && bSet && bBench)
1703         {
1704             sprintf(buf, "-%s", &opt[2]);
1705             setopt(buf,nfile,fnm);
1706         }
1707     }
1708 }
1709
1710
/* Returns the current time in seconds as a double.
 *
 * With HAVE_GETTIMEOFDAY the value is built from gettimeofday() and includes
 * the microsecond part; otherwise it is the whole-second result of time().
 */
static double gettime(void)
{
#ifdef HAVE_GETTIMEOFDAY
    struct timeval t;
    double seconds;

    /* The timezone argument is obsolete: POSIX leaves the behaviour
     * unspecified when it is not a null pointer, so pass NULL */
    gettimeofday(&t,NULL);

    seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;

    return seconds;
#else
    double  seconds;

    seconds = (double) time(NULL);

    return seconds;
#endif
}
1731
1732
1733 #define BENCHSTEPS (1000)
1734
1735 int gmx_tune_pme(int argc,char *argv[])
1736 {
1737     const char *desc[] = {
1738             "For a given number [TT]-np[tt] or [TT]-nt[tt] of processors/threads, this program systematically",
1739             "times mdrun with various numbers of PME-only nodes and determines",
1740             "which setting is fastest. It will also test whether performance can",
1741             "be enhanced by shifting load from the reciprocal to the real space",
1742             "part of the Ewald sum. ",
1743             "Simply pass your [TT].tpr[tt] file to g_tune_pme together with other options",
1744             "for mdrun as needed.[PAR]",
1745             "Which executables are used can be set in the environment variables",
1746             "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
1747             "will be used as defaults. Note that for certain MPI frameworks you",
1748             "need to provide a machine- or hostfile. This can also be passed",
1749             "via the MPIRUN variable, e.g.",
1750             "'export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"'[PAR]",
1751             "Please call g_tune_pme with the normal options you would pass to",
1752             "mdrun and add [TT]-np[tt] for the number of processors to perform the",
1753             "tests on, or [TT]-nt[tt] for the number of threads. You can also add [TT]-r[tt]",
1754             "to repeat each test several times to get better statistics. [PAR]",
1755             "g_tune_pme can test various real space / reciprocal space workloads",
1756             "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
1757             "written with enlarged cutoffs and smaller fourier grids respectively.",
1758             "Typically, the first test (no. 0) will be with the settings from the input",
1759             "[TT].tpr[tt] file; the last test (no. [TT]ntpr[tt]) will have cutoffs multiplied",
1760             "by (and at the same time fourier grid dimensions divided by) the scaling",
1761             "factor [TT]-fac[tt] (default 1.2). The remaining [TT].tpr[tt] files will have equally",
1762             "spaced values inbetween these extremes. Note that you can set [TT]-ntpr[tt] to 1",
1763             "if you just want to find the optimal number of PME-only nodes; in that case",
1764             "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
1765             "For the benchmark runs, the default of 1000 time steps should suffice for most",
1766             "MD systems. The dynamic load balancing needs about 100 time steps",
1767             "to adapt to local load imbalances, therefore the time step counters",
1768             "are by default reset after 100 steps. For large systems",
1769             "(>1M atoms) you may have to set [TT]-resetstep[tt] to a higher value.",
1770             "From the 'DD' load imbalance entries in the md.log output file you",
1771             "can tell after how many steps the load is sufficiently balanced.[PAR]"
1772             "Example call: [TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
1773             "After calling mdrun several times, detailed performance information",
1774             "is available in the output file perf.out. ",
1775             "Note that during the benchmarks a couple of temporary files are written",
1776             "(options -b*), these will be automatically deleted after each test.[PAR]",
1777             "If you want the simulation to be started automatically with the",
1778             "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
1779     };
1780
1781     int        nnodes =1;
1782     int        repeats=2;
1783     int        pmeentries=0; /* How many values for -npme do we actually test for each tpr file */
1784     real       maxPMEfraction=0.50;
1785     real       minPMEfraction=0.25;
1786     int        maxPMEnodes, minPMEnodes;
1787     real       downfac=1.0,upfac=1.2;
1788     int        ntprs=0;
1789     real       fs=0.0;                    /* 0 indicates: not set by the user */
1790     gmx_large_int_t bench_nsteps=BENCHSTEPS;
1791     gmx_large_int_t new_sim_nsteps=-1;   /* -1 indicates: not set by the user */
1792     gmx_large_int_t cpt_steps=0;         /* Step counter in .cpt input file   */
1793     int        presteps=100;    /* Do a full cycle reset after presteps steps */
1794     bool       bOverwrite=FALSE, bKeepTPR;
1795     bool       bLaunch=FALSE;
1796     bool       bPassAll=FALSE;
1797     char       *ExtraArgs=NULL;
1798     char       **tpr_names=NULL;
1799     const char *simulation_tpr=NULL;
1800     int        best_npme, best_tpr;
1801     int        sim_part = 1;     /* For benchmarks with checkpoint files */
1802
1803     /* Default program names if nothing else is found */
1804     char        *cmd_mpirun=NULL, *cmd_mdrun=NULL;
1805     char        *cmd_args_bench, *cmd_args_launch;
1806     char        *cmd_np=NULL;
1807
1808     t_perf      **perfdata=NULL;
1809     t_inputinfo *info;
1810     int         i;
1811     FILE        *fp;
1812     t_commrec   *cr;
1813
1814     /* Print out how long the tuning took */
1815     double      seconds;
1816
1817     static t_filenm fnm[] = {
1818       /* g_tune_pme */
1819       { efOUT, "-p",      "perf",     ffWRITE },
1820       { efLOG, "-err",    "errors",   ffWRITE },
1821       { efTPX, "-so",     "tuned",    ffWRITE },
1822       /* mdrun: */
1823       { efTPX, NULL,      NULL,       ffREAD },
1824       { efTRN, "-o",      NULL,       ffWRITE },
1825       { efXTC, "-x",      NULL,       ffOPTWR },
1826       { efCPT, "-cpi",    NULL,       ffOPTRD },
1827       { efCPT, "-cpo",    NULL,       ffOPTWR },
1828       { efSTO, "-c",      "confout",  ffWRITE },
1829       { efEDR, "-e",      "ener",     ffWRITE },
1830       { efLOG, "-g",      "md",       ffWRITE },
1831       { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
1832       { efXVG, "-field",  "field",    ffOPTWR },
1833       { efXVG, "-table",  "table",    ffOPTRD },
1834       { efXVG, "-tablep", "tablep",   ffOPTRD },
1835       { efXVG, "-tableb", "table",    ffOPTRD },
1836       { efTRX, "-rerun",  "rerun",    ffOPTRD },
1837       { efXVG, "-tpi",    "tpi",      ffOPTWR },
1838       { efXVG, "-tpid",   "tpidist",  ffOPTWR },
1839       { efEDI, "-ei",     "sam",      ffOPTRD },
1840       { efEDO, "-eo",     "sam",      ffOPTWR },
1841       { efGCT, "-j",      "wham",     ffOPTRD },
1842       { efGCT, "-jo",     "bam",      ffOPTWR },
1843       { efXVG, "-ffout",  "gct",      ffOPTWR },
1844       { efXVG, "-devout", "deviatie", ffOPTWR },
1845       { efXVG, "-runav",  "runaver",  ffOPTWR },
1846       { efXVG, "-px",     "pullx",    ffOPTWR },
1847       { efXVG, "-pf",     "pullf",    ffOPTWR },
1848       { efMTX, "-mtx",    "nm",       ffOPTWR },
1849       { efNDX, "-dn",     "dipole",   ffOPTWR },
1850       /* Output files that are deleted after each benchmark run */
1851       { efTRN, "-bo",     "bench",    ffWRITE },
1852       { efXTC, "-bx",     "bench",    ffWRITE },
1853       { efCPT, "-bcpo",   "bench",    ffWRITE },
1854       { efSTO, "-bc",     "bench",    ffWRITE },
1855       { efEDR, "-be",     "bench",    ffWRITE },
1856       { efLOG, "-bg",     "bench",    ffWRITE },
1857       { efEDO, "-beo",    "bench",    ffOPTWR },
1858       { efXVG, "-bdhdl",  "benchdhdl",ffOPTWR },
1859       { efXVG, "-bfield", "benchfld" ,ffOPTWR },
1860       { efXVG, "-btpi",   "benchtpi", ffOPTWR },
1861       { efXVG, "-btpid",  "benchtpid",ffOPTWR },
1862       { efGCT, "-bjo",    "bench",    ffOPTWR },
1863       { efXVG, "-bffout", "benchgct", ffOPTWR },
1864       { efXVG, "-bdevout","benchdev", ffOPTWR },
1865       { efXVG, "-brunav", "benchrnav",ffOPTWR },
1866       { efXVG, "-bpx",    "benchpx",  ffOPTWR },
1867       { efXVG, "-bpf",    "benchpf",  ffOPTWR },
1868       { efMTX, "-bmtx",   "benchn",   ffOPTWR },
1869       { efNDX, "-bdn",    "bench",    ffOPTWR }
1870     };
1871
1872     /* Command line options of mdrun */
1873     bool bDDBondCheck = TRUE;
1874     bool bDDBondComm  = TRUE;
1875     bool bVerbose     = FALSE;
1876     bool bCompact     = TRUE;
1877     bool bSepPot      = FALSE;
1878     bool bRerunVSite  = FALSE;
1879     bool bIonize      = FALSE;
1880     bool bConfout     = TRUE;
1881     bool bReproducible = FALSE;
1882     bool bThreads     = FALSE;
1883
1884     int  nmultisim=0;
1885     int  nstglobalcomm=-1;
1886     int  repl_ex_nst=0;
1887     int  repl_ex_seed=-1;
1888     int  nstepout=100;
1889     int  nthreads=1;
1890
1891     const char *ddno_opt[ddnoNR+1] =
1892       { NULL, "interleave", "pp_pme", "cartesian", NULL };
1893     const char *dddlb_opt[] =
1894       { NULL, "auto", "no", "yes", NULL };
1895     const char *procstring[] =
1896       { NULL, "-np", "-n", "none", NULL };
1897     const char *npmevalues_opt[] =
1898       { NULL, "auto", "all", "subset", NULL };
1899     real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1;
1900     char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL;
1901     char *deffnm=NULL;
1902 #define STD_CPT_PERIOD (15.0)
1903     real cpt_period=STD_CPT_PERIOD,max_hours=-1;
1904     bool bAppendFiles=TRUE;
1905     bool bResetCountersHalfWay=FALSE;
1906     output_env_t oenv=NULL;
1907
1908     t_pargs pa[] = {
1909       /***********************/
1910       /* g_tune_pme options: */
1911       /***********************/
1912       { "-np",       FALSE, etINT,  {&nnodes},
1913         "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
1914       { "-npstring", FALSE, etENUM, {procstring},
1915         "Specify the number of processors to $MPIRUN using this string"},
1916       { "-passall",  FALSE, etBOOL, {&bPassAll},
1917         "HIDDENPut arguments unknown to mdrun at the end of the command line. Can e.g. be used for debugging purposes. "},
1918       { "-nt",       FALSE, etINT,  {&nthreads},
1919         "Number of threads to run the tests on (turns MPI & mpirun off)"},
1920       { "-r",        FALSE, etINT,  {&repeats},
1921         "Repeat each test this often" },
1922       { "-max",      FALSE, etREAL, {&maxPMEfraction},
1923         "Max fraction of PME nodes to test with" },
1924       { "-min",      FALSE, etREAL, {&minPMEfraction},
1925         "Min fraction of PME nodes to test with" },
1926       { "-npme",     FALSE, etENUM, {npmevalues_opt},
1927         "Benchmark all possible values for -npme or just the subset that is expected to perform well"},
1928       { "-upfac",    FALSE, etREAL, {&upfac},
1929         "Upper limit for rcoulomb scaling factor (Note that rcoulomb upscaling results in fourier grid downscaling)" },
1930       { "-downfac",  FALSE, etREAL, {&downfac},
1931         "Lower limit for rcoulomb scaling factor" },
1932       { "-ntpr",     FALSE, etINT,  {&ntprs},
1933         "Number of tpr files to benchmark. Create these many files with scaling factors ranging from 1.0 to fac. If < 1, automatically choose the number of tpr files to test" },
1934       { "-four",     FALSE, etREAL, {&fs},
1935         "Use this fourierspacing value instead of the grid found in the tpr input file. (Spacing applies to a scaling factor of 1.0 if multiple tpr files are written)" },
1936       { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
1937         "Take timings for these many steps in the benchmark runs" },
1938       { "-resetstep",FALSE, etINT,  {&presteps},
1939         "Let dlb equilibrate these many steps before timings are taken (reset cycle counters after these many steps)" },
1940       { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
1941         "If non-negative, perform these many steps in the real run (overwrite nsteps from tpr, add cpt steps)" },
1942       { "-launch",   FALSE, etBOOL, {&bLaunch},
1943         "Lauch the real simulation after optimization" },
1944       /******************/
1945       /* mdrun options: */
1946       /******************/
1947       { "-deffnm",    FALSE, etSTR, {&deffnm},
1948           "Set the default filename for all file options at launch time" },
1949       { "-ddorder",   FALSE, etENUM, {ddno_opt},
1950         "DD node order" },
1951       { "-ddcheck",   FALSE, etBOOL, {&bDDBondCheck},
1952         "Check for all bonded interactions with DD" },
1953       { "-ddbondcomm",FALSE, etBOOL, {&bDDBondComm},
1954         "HIDDENUse special bonded atom communication when -rdd > cut-off" },
1955       { "-rdd",       FALSE, etREAL, {&rdd},
1956         "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
1957       { "-rcon",      FALSE, etREAL, {&rconstr},
1958         "Maximum distance for P-LINCS (nm), 0 is estimate" },
1959       { "-dlb",       FALSE, etENUM, {dddlb_opt},
1960         "Dynamic load balancing (with DD)" },
1961       { "-dds",       FALSE, etREAL, {&dlb_scale},
1962         "Minimum allowed dlb scaling of the DD cell size" },
1963       { "-ddcsx",     FALSE, etSTR,  {&ddcsx},
1964         "HIDDENThe DD cell sizes in x" },
1965       { "-ddcsy",     FALSE, etSTR,  {&ddcsy},
1966         "HIDDENThe DD cell sizes in y" },
1967       { "-ddcsz",     FALSE, etSTR,  {&ddcsz},
1968         "HIDDENThe DD cell sizes in z" },
1969       { "-gcom",      FALSE, etINT,  {&nstglobalcomm},
1970         "Global communication frequency" },
1971       { "-v",         FALSE, etBOOL, {&bVerbose},
1972         "Be loud and noisy" },
1973       { "-compact",   FALSE, etBOOL, {&bCompact},
1974         "Write a compact log file" },
1975       { "-seppot",    FALSE, etBOOL, {&bSepPot},
1976         "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
1977       { "-pforce",    FALSE, etREAL, {&pforce},
1978         "Print all forces larger than this (kJ/mol nm)" },
1979       { "-reprod",    FALSE, etBOOL, {&bReproducible},
1980         "Try to avoid optimizations that affect binary reproducibility" },
1981       { "-cpt",       FALSE, etREAL, {&cpt_period},
1982         "Checkpoint interval (minutes)" },
1983       { "-append",    FALSE, etBOOL, {&bAppendFiles},
1984         "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
1985       { "-maxh",      FALSE, etREAL, {&max_hours},
1986         "Terminate after 0.99 times this time (hours)" },
1987       { "-multi",     FALSE, etINT,  {&nmultisim},
1988         "Do multiple simulations in parallel" },
1989       { "-replex",    FALSE, etINT,  {&repl_ex_nst},
1990         "Attempt replica exchange every # steps" },
1991       { "-reseed",    FALSE, etINT,  {&repl_ex_seed},
1992         "Seed for replica exchange, -1 is generate a seed" },
1993       { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
1994         "HIDDENRecalculate virtual site coordinates with -rerun" },
1995       { "-ionize",    FALSE, etBOOL, {&bIonize},
1996         "Do a simulation including the effect of an X-Ray bombardment on your system" },
1997       { "-confout",   FALSE, etBOOL, {&bConfout},
1998         "HIDDENWrite the last configuration with -c and force checkpointing at the last step" },
1999       { "-stepout",   FALSE, etINT,  {&nstepout},
2000         "HIDDENFrequency of writing the remaining runtime" },
2001       { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2002         "HIDDENReset the cycle counters after half the number of steps or halfway -maxh (launch only)" }
2003     };
2004
2005
2006 #define NFILE asize(fnm)
2007
2008     CopyRight(stderr,argv[0]);
2009
2010     seconds = gettime();
2011
2012     parse_common_args(&argc,argv,PCA_NOEXIT_ON_ARGS,
2013                       NFILE,fnm,asize(pa),pa,asize(desc),desc,
2014                       0,NULL,&oenv);
2015
2016     /* Store the remaining unparsed command line entries in a string */
2017     snew(ExtraArgs, 1);
2018     ExtraArgs[0] = '\0';
2019     for (i=1; i<argc; i++) /* argc will now be 1 if everything was understood */
2020     {
2021         add_to_string(&ExtraArgs, argv[i]);
2022         add_to_string(&ExtraArgs, " ");
2023     }
2024     if ( !bPassAll && (ExtraArgs[0] != '\0') )
2025     {
2026         fprintf(stderr, "\nWARNING: The following arguments you provided have no effect:\n"
2027                         "%s\n"
2028                         "Use the -passall option to force them to appear on the command lines\n"
2029                         "for the benchmark simulations%s.\n\n",
2030                         ExtraArgs, bLaunch? " and at launch time" : "");
2031     }
2032
2033     if (opt2parg_bSet("-nt",asize(pa),pa))
2034     {
2035         bThreads=TRUE;
2036         if (opt2parg_bSet("-npstring",asize(pa),pa))
2037             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2038
2039         if (nnodes > 1)
2040             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2041         /* and now we just set this; a bit of an ugly hack*/
2042         nnodes=nthreads;
2043     }
2044     /* Automatically set -beo options if -eo is set etc. */
2045     couple_files_options(NFILE,fnm);
2046
2047     /* Construct the command line arguments for benchmark runs
2048      * as well as for the simulation run
2049      */
2050     create_command_line_snippets(bThreads,presteps,NFILE,fnm,asize(pa),pa,procstring[0],
2051                                  &cmd_np, &cmd_args_bench, &cmd_args_launch,
2052                                  bPassAll? ExtraArgs : (char *)"");
2053
2054     /* Read in checkpoint file if requested */
2055     sim_part = 1;
2056     if(opt2bSet("-cpi",NFILE,fnm))
2057     {
2058         snew(cr,1);
2059         cr->duty=DUTY_PP; /* makes the following routine happy */
2060         read_checkpoint_simulation_part(opt2fn("-cpi",NFILE,fnm),
2061                                         &sim_part,&cpt_steps,cr,
2062                                         FALSE,NULL,NULL);
2063         sfree(cr);
2064         sim_part++;
2065         /* sim_part will now be 1 if no checkpoint file was found */
2066         if (sim_part<=1)
2067             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi",
2068                                                                      NFILE,
2069                                                                      fnm));
2070     }
2071
2072     /* Open performance output file and write header info */
2073     fp = ffopen(opt2fn("-p",NFILE,fnm),"w");
2074
2075     /* Make a quick consistency check of command line parameters */
2076     check_input(nnodes, repeats, &ntprs, &upfac, &downfac, maxPMEfraction,
2077                 minPMEfraction, fs, bench_nsteps, fnm, NFILE, sim_part, presteps,
2078                 asize(pa),pa);
2079
2080     /* Determine max and min number of PME nodes to test: */
2081     if (nnodes > 2)
2082     {
2083         maxPMEnodes = floor(maxPMEfraction*nnodes);
2084         minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2085         fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2086         if (maxPMEnodes != minPMEnodes)
2087             fprintf(stdout, "- %d ", maxPMEnodes);
2088         fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2089     }
2090     else
2091     {
2092         maxPMEnodes = 0;
2093         minPMEnodes = 0;
2094     }
2095
2096     /* Get the commands we need to set up the runs from environment variables */
2097     get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats);
2098
2099     /* Print some header info to file */
2100     sep_line(fp);
2101     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2102     sep_line(fp);
2103     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(),GromacsVersion());
2104     if (!bThreads)
2105     {
2106         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2107         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2108         if ( strcmp(procstring[0], "none") != 0)
2109             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2110         else
2111             fprintf(fp, "Not setting number of nodes in system call\n");
2112     }
2113     else
2114         fprintf(fp, "Number of threads       : %d\n", nnodes);
2115
2116     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2117     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2118     fprintf(fp, "Benchmark steps         : ");
2119     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2120     fprintf(fp, "\n");
2121     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2122     if (sim_part > 1)
2123     {
2124         fprintf(fp, "Checkpoint time step    : ");
2125         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2126         fprintf(fp, "\n");
2127     }
2128     if (bLaunch)
2129         fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2130     if (!bPassAll && ExtraArgs[0] != '\0')
2131         fprintf(fp, "Unused arguments        : %s\n", ExtraArgs);
2132     if (new_sim_nsteps >= 0)
2133     {
2134         bOverwrite = TRUE;
2135         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so",NFILE,fnm));
2136         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2137         fprintf(stderr, " steps.\n");
2138         fprintf(fp, "Simulation steps        : ");
2139         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2140         fprintf(fp, "\n");
2141     }
2142     if (repeats > 1)
2143         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2144
2145     if (fs > 0.0)
2146     {
2147         fprintf(fp, "Requested grid spacing  : %f (tpr file will be changed accordingly)\n", fs);
2148         fprintf(fp, "                          This will be the grid spacing at a scaling factor of 1.0\n");
2149     }
2150
2151     fprintf(fp, "Input file              : %s\n", opt2fn("-s",NFILE,fnm));
2152
2153     /* Allocate memory for the inputinfo struct: */
2154     snew(info, 1);
2155     info->nr_inputfiles = ntprs;
2156     for (i=0; i<ntprs; i++)
2157     {
2158         snew(info->r_coulomb , ntprs);
2159         snew(info->r_vdw     , ntprs);
2160         snew(info->rlist     , ntprs);
2161         snew(info->rlistlong , ntprs);
2162         snew(info->fourier_nx, ntprs);
2163         snew(info->fourier_ny, ntprs);
2164         snew(info->fourier_nz, ntprs);
2165         snew(info->fourier_sp, ntprs);
2166     }
2167     /* Make alternative tpr files to test: */
2168     snew(tpr_names, ntprs);
2169     for (i=0; i<ntprs; i++)
2170         snew(tpr_names[i], STRLEN);
2171
2172     make_benchmark_tprs(opt2fn("-s",NFILE,fnm), tpr_names, bench_nsteps+presteps,
2173             cpt_steps, upfac, downfac, ntprs, fs, info, fp);
2174
2175
2176     /********************************************************************************/
2177     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2178     /********************************************************************************/
2179     snew(perfdata, ntprs);
2180     do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npmevalues_opt[0], perfdata, &pmeentries,
2181                  repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2182                  cmd_args_bench, fnm, NFILE, sim_part, presteps, cpt_steps);
2183
2184     fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
2185
2186     /* Analyse the results and give a suggestion for optimal settings: */
2187     bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2188                             repeats, info, &best_tpr, &best_npme);
2189
2190     /* Take the best-performing tpr file and enlarge nsteps to original value */
2191     if ( bKeepTPR && !bOverwrite && !(fs > 0.0) )
2192     {
2193         simulation_tpr = opt2fn("-s",NFILE,fnm);
2194     }
2195     else
2196     {
2197         simulation_tpr = opt2fn("-so",NFILE,fnm);
2198         modify_PMEsettings(bOverwrite? (new_sim_nsteps+cpt_steps) :
2199                            info->orig_sim_steps, tpr_names[best_tpr],
2200                            simulation_tpr);
2201     }
2202
2203     /* Now start the real simulation if the user requested it ... */
2204     launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2205                       cmd_args_launch, simulation_tpr, nnodes, best_npme);
2206     ffclose(fp);
2207
2208     /* ... or simply print the performance results to screen: */
2209     if (!bLaunch)
2210         finalize(opt2fn("-p", NFILE, fnm));
2211
2212     return 0;
2213 }