updated several things related to the OpenMP thread count
[gromacs.git] / src / kernel / runner.c
blob b7549d9a442e62ac11ea82718a89a4ab8744b2ec
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 3.2.0
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef __linux
#define _GNU_SOURCE
#include <sched.h>
#include <sys/syscall.h>
#endif
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "typedefs.h"
#include "smalloc.h"
#include "sysstuff.h"
#include "statutil.h"
#include "mdrun.h"
#include "md_logging.h"
#include "md_support.h"
#include "network.h"
#include "pull.h"
#include "names.h"
#include "disre.h"
#include "orires.h"
#include "pme.h"
#include "mdatoms.h"
#include "repl_ex.h"
#include "qmmm.h"
#include "mpelogging.h"
#include "domdec.h"
#include "partdec.h"
#include "coulomb.h"
#include "constr.h"
#include "mvdata.h"
#include "checkpoint.h"
#include "mtop_util.h"
#include "sighandler.h"
#include "tpxio.h"
#include "txtdump.h"
#include "gmx_detect_hardware.h"
#include "gmx_omp_nthreads.h"
#include "pull_rotation.h"
#include "calc_verletbuf.h"
#include "../mdlib/nbnxn_search.h"
#include "../mdlib/nbnxn_consts.h"
#include "gmx_fatal_collective.h"
#include "membed.h"
#include "md_openmm.h"
#include "gmx_omp.h"

#ifdef GMX_LIB_MPI
#include <mpi.h>
#endif
#ifdef GMX_THREAD_MPI
#include "tmpi.h"
#endif

#ifdef GMX_FAHCORE
#include "corewrap.h"
#endif

#ifdef GMX_OPENMM
#include "md_openmm.h"
#endif

#include "gpu_utils.h"
#include "nbnxn_cuda_data_mgmt.h"
typedef struct {
    gmx_integrator_t *func;
} gmx_intp_t;

/* The array should match the eI array in include/types/enums.h */
#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}};
#else
const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
#endif

gmx_large_int_t deform_init_init_step_tpx;
matrix          deform_init_box_tpx;
#ifdef GMX_THREAD_MPI
tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
#endif
#ifdef GMX_THREAD_MPI
struct mdrunner_arglist
{
    gmx_hw_opt_t *hw_opt;
    FILE *fplog;
    t_commrec *cr;
    int nfile;
    const t_filenm *fnm;
    output_env_t oenv;
    gmx_bool bVerbose;
    gmx_bool bCompact;
    int nstglobalcomm;
    ivec ddxyz;
    int dd_node_order;
    real rdd;
    real rconstr;
    const char *dddlb_opt;
    real dlb_scale;
    const char *ddcsx;
    const char *ddcsy;
    const char *ddcsz;
    const char *nbpu_opt;
    int nsteps_cmdline;
    int nstepout;
    int resetstep;
    int nmultisim;
    int repl_ex_nst;
    int repl_ex_nex;
    int repl_ex_seed;
    real pforce;
    real cpt_period;
    real max_hours;
    const char *deviceOptions;
    unsigned long Flags;
    int ret; /* return value */
};
/* The function used for spawning threads. Extracts the mdrunner()
   arguments from its one argument and calls mdrunner(), after making
   a commrec. */
static void mdrunner_start_fn(void *arg)
{
    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
    struct mdrunner_arglist mc = *mda; /* copy the arg list to make sure
                                          that it's thread-local. This doesn't
                                          copy pointed-to items, of course,
                                          but those are all const. */
    t_commrec *cr;  /* we need a local version of this */
    FILE *fplog = NULL;
    t_filenm *fnm;

    fnm = dup_tfn(mc.nfile, mc.fnm);

    cr = init_par_threads(mc.cr);

    if (MASTER(cr))
    {
        fplog = mc.fplog;
    }

    mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
                        mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
                        mc.ddxyz, mc.dd_node_order, mc.rdd,
                        mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
                        mc.ddcsx, mc.ddcsy, mc.ddcsz,
                        mc.nbpu_opt,
                        mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
                        mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
                        mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
}
/* called by mdrunner() to start a specific number of threads (including
   the main thread) for thread-parallel runs. This in turn calls mdrunner()
   for each thread.
   All options besides nthreads are the same as for mdrunner(). */
static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
                                         FILE *fplog, t_commrec *cr, int nfile,
                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
                                         gmx_bool bCompact, int nstglobalcomm,
                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
                                         const char *dddlb_opt, real dlb_scale,
                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
                                         const char *nbpu_opt,
                                         int nsteps_cmdline, int nstepout, int resetstep,
                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
                                         real pforce, real cpt_period, real max_hours,
                                         const char *deviceOptions, unsigned long Flags)
{
    int ret;
    struct mdrunner_arglist *mda;
    t_commrec *crn; /* the new commrec */
    t_filenm *fnmn;

    /* first check whether we even need to start tMPI */
    if (hw_opt->nthreads_tmpi < 2)
    {
        return cr;
    }

    /* a few small, one-time, almost unavoidable memory leaks: */
    snew(mda, 1);
    fnmn = dup_tfn(nfile, fnm);

    /* fill the data structure to pass as void pointer to thread start fn */
    mda->hw_opt = hw_opt;
    mda->fplog = fplog;
    mda->cr = cr;
    mda->nfile = nfile;
    mda->fnm = fnmn;
    mda->oenv = oenv;
    mda->bVerbose = bVerbose;
    mda->bCompact = bCompact;
    mda->nstglobalcomm = nstglobalcomm;
    mda->ddxyz[XX] = ddxyz[XX];
    mda->ddxyz[YY] = ddxyz[YY];
    mda->ddxyz[ZZ] = ddxyz[ZZ];
    mda->dd_node_order = dd_node_order;
    mda->rdd = rdd;
    mda->rconstr = rconstr;
    mda->dddlb_opt = dddlb_opt;
    mda->dlb_scale = dlb_scale;
    mda->ddcsx = ddcsx;
    mda->ddcsy = ddcsy;
    mda->ddcsz = ddcsz;
    mda->nbpu_opt = nbpu_opt;
    mda->nsteps_cmdline = nsteps_cmdline;
    mda->nstepout = nstepout;
    mda->resetstep = resetstep;
    mda->nmultisim = nmultisim;
    mda->repl_ex_nst = repl_ex_nst;
    mda->repl_ex_nex = repl_ex_nex;
    mda->repl_ex_seed = repl_ex_seed;
    mda->pforce = pforce;
    mda->cpt_period = cpt_period;
    mda->max_hours = max_hours;
    mda->deviceOptions = deviceOptions;
    mda->Flags = Flags;

    fprintf(stderr, "Starting %d tMPI threads\n", hw_opt->nthreads_tmpi);
    fflush(stderr);
    /* now spawn new threads that start mdrunner_start_fn(), while
       the main thread returns */
    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
                       (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
                       mdrunner_start_fn, (void*)(mda));
    if (ret != TMPI_SUCCESS)
    {
        return NULL;
    }

    /* make a new comm_rec to reflect the new situation */
    crn = init_par_threads(cr);
    return crn;
}
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t *hw_opt,
                                        int nthreads_tot,
                                        int ngpu)
{
    int nthreads_tmpi;

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nthreads_tmpi = ngpu;
        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
        {
            nthreads_tmpi = nthreads_tot;
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe; when we do, we issue a warning later */
        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
    }
    else
    {
        /* TODO choose nthreads_omp based on hardware topology
           when we have a hardware topology detection library */
        /* In general, when running up to 4 threads, OpenMP should be faster.
         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
         * even on two CPUs it's usually faster (but with many OpenMP threads
         * it could be faster not to use HT, currently we always use HT).
         * On Nehalem/Westmere we want to avoid running 16 threads over
         * two CPUs with HT, so we need a limit<16; thus we use 12.
         * A reasonable limit for Intel Sandy and Ivy bridge,
         * not knowing the topology, is 16 threads.
         */
        const int nthreads_omp_always_faster             = 4;
        const int nthreads_omp_always_faster_Nehalem     = 12;
        const int nthreads_omp_always_faster_SandyBridge = 16;
        const int first_model_Nehalem     = 0x1A;
        const int first_model_SandyBridge = 0x2A;
        gmx_bool  bIntel_Family6;

        bIntel_Family6 =
            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
             gmx_cpuid_family(hwinfo->cpuid_info) == 6);

        if (nthreads_tot <= nthreads_omp_always_faster ||
            (bIntel_Family6 &&
             ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
              (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
        {
            /* Use pure OpenMP parallelization */
            nthreads_tmpi = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nthreads_tmpi = nthreads_tot;
        }
    }

    return nthreads_tmpi;
}
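/* A worked sketch of the division above (illustrative numbers, derived
 * from the code rather than taken from measurements):
 * - 2 GPUs, 16 hardware threads: nthreads_tmpi = 2 (one rank per GPU).
 * - no GPU, -ntomp 4, 16 threads total: nthreads_tmpi = max(1, 16/4) = 4.
 * - no GPU, no -ntomp, 16 threads on Intel family 6 with model >= 0x2A
 *   (Sandy Bridge or newer): 16 <= 16, so pure OpenMP, nthreads_tmpi = 1.
 * - no GPU, no -ntomp, 24 threads: above all the limits, so OpenMP is not
 *   used and nthreads_tmpi = 24.
 */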
/* Get the number of threads to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
                            gmx_hw_opt_t *hw_opt,
                            t_inputrec *inputrec, gmx_mtop_t *mtop,
                            const t_commrec *cr,
                            FILE *fplog)
{
    int nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
    int min_atoms_per_mpi_thread;
    char *env;
    char sbuf[STRLEN];
    gmx_bool bCanUseGPU;

    if (hw_opt->nthreads_tmpi > 0)
    {
        /* Trivial, return right away */
        return hw_opt->nthreads_tmpi;
    }

    nthreads_hw = hwinfo->nthreads_hw_avail;

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
    if (bCanUseGPU)
    {
        ngpu = hwinfo->gpu_info.ncuda_dev_use;
    }
    else
    {
        ngpu = 0;
    }

    nthreads_tmpi =
        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Steps are divided over the nodes instead of splitting the atoms */
        min_atoms_per_mpi_thread = 0;
    }
    else
    {
        if (bCanUseGPU)
        {
            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
        }
        else
        {
            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
        }
    }

    /* Check if an algorithm does not support parallel simulation. */
    if (nthreads_tmpi != 1 &&
        ( inputrec->eI == eiLBFGS ||
          inputrec->coulombtype == eelEWALD ) )
    {
        nthreads_tmpi = 1;

        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
        {
            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
        }
    }
    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
    {
        /* the thread number was chosen automatically, but there are too many
           threads (too few atoms per thread) */
        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);

        /* Avoid partial use of Hyper-Threading */
        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
        {
            nthreads_new = nthreads_hw/2;
        }

        /* Avoid large prime numbers in the thread count */
        if (nthreads_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nthreads_new)
            {
                fac *= 2;
            }

            nthreads_new = (nthreads_new/fac)*fac;
        }
        else
        {
            /* Avoid 5 */
            if (nthreads_new == 5)
            {
                nthreads_new = 4;
            }
        }
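        /* Example of the rounding just above (a sketch): with
         * nthreads_new = 7, fac stays 2 (3*2*2 = 12 > 7), so 7 -> (7/2)*2 = 6;
         * 11 -> 10; with 13, fac becomes 4 (12 <= 13), so 13 -> (13/4)*4 = 12.
         * The result is always a multiple of fac in [3*fac, 6*fac), i.e.
         * counts like 6, 8, 10, 12, 16, 20, ... with no large prime factors.
         */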
        nthreads_tmpi = nthreads_new;

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nthreads_tmpi;
}
#endif /* GMX_THREAD_MPI */


/* Environment variable for setting nstlist */
static const char*  NSTLIST_ENVVAR         = "GMX_NSTLIST";
/* Try to increase nstlist when using a GPU with nstlist less than this */
static const int    NSTLIST_GPU_ENOUGH     = 20;
/* Increase nstlist until the non-bonded cost increases more than this factor */
static const float  NBNXN_GPU_LIST_OK_FAC  = 1.25;
/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
static const float  NBNXN_GPU_LIST_MAX_FAC = 1.40;
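/* Usage sketch (assuming a POSIX shell): the automatic tuning below can be
 * overridden by setting the environment variable, e.g.
 *     GMX_NSTLIST=40 mdrun ...
 * in which case increase_nstlist() reads this value instead of scanning
 * the built-in candidates 20, 25, 40, 50.
 */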
/* Try to increase nstlist when running on a GPU */
static void increase_nstlist(FILE *fp, t_commrec *cr,
                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
{
    char *env;
    int  nstlist_orig, nstlist_prev;
    verletbuf_list_setup_t ls;
    real rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
    int  i;
    t_state state_tmp;
    gmx_bool bBox, bDD, bCont;
    const char *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
    const char *vbd_err  = "Cannot increase nstlist for GPU run because verlet-buffer-drift is not set or used";
    const char *box_err  = "Cannot increase nstlist for GPU run because the box is too small";
    const char *dd_err   = "Cannot increase nstlist for GPU run because of domain decomposition limitations";
    char buf[STRLEN];

    /* The alternative, larger nstlist values to try when switching */
    const int nstl[] = { 20, 25, 40, 50 };
#define NNSTL  sizeof(nstl)/sizeof(nstl[0])

    env = getenv(NSTLIST_ENVVAR);
    if (env == NULL)
    {
        if (fp != NULL)
        {
            fprintf(fp, nstl_fmt, ir->nstlist);
        }
    }

    if (ir->verletbuf_drift == 0)
    {
        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
    }

    if (ir->verletbuf_drift < 0)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", vbd_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", vbd_err);
        }

        return;
    }

    nstlist_orig = ir->nstlist;
    if (env != NULL)
    {
        sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", buf);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", buf);
        }
        sscanf(env, "%d", &ir->nstlist);
    }

    verletbuf_get_list_setup(TRUE, &ls);

    /* Allow rlist to make the list double the size of the cut-off sphere */
    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
    rlist_ok  = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
    rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
    if (debug)
    {
        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
                rlist_inc, rlist_max);
    }
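    /* Reasoning behind the cube-root factors above (a sketch): the pair
     * list cost grows roughly with the list volume, i.e. with
     * (r + rlist_inc)^3, so allowing the non-bonded cost to grow by a
     * factor F corresponds to scaling the effective radius by F^(1/3).
     * E.g. with F = 1.25 the radius may grow by about 7.7 percent.
     */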
    i = 0;
    nstlist_prev = nstlist_orig;
    rlist_prev   = ir->rlist;
    do
    {
        if (env == NULL)
        {
            ir->nstlist = nstl[i];
        }

        /* Set the pair-list buffer size in ir */
        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
                                NULL, &rlist_new);

        /* Does rlist fit in the box? */
        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
        bDD  = TRUE;
        if (bBox && DOMAINDECOMP(cr))
        {
            /* Check if rlist fits in the domain decomposition */
            if (inputrec2nboundeddim(ir) < DIM)
            {
                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
            }
            copy_mat(box, state_tmp.box);
            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
        }

        bCont = FALSE;

        if (env == NULL)
        {
            if (bBox && bDD && rlist_new <= rlist_max)
            {
                /* Increase nstlist */
                nstlist_prev = ir->nstlist;
                rlist_prev   = rlist_new;
                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
            }
            else
            {
                /* Stick with the previous nstlist */
                ir->nstlist = nstlist_prev;
                rlist_new   = rlist_prev;
                bBox = TRUE;
                bDD  = TRUE;
            }
        }

        i++;
    }
    while (bCont);

    if (!bBox || !bDD)
    {
        gmx_warning(!bBox ? box_err : dd_err);
        if (fp != NULL)
        {
            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
        }
        ir->nstlist = nstlist_orig;
    }
    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
    {
        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
                nstlist_orig, ir->nstlist,
                ir->rlist, rlist_new);
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n\n", buf);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n\n", buf);
        }
        ir->rlist     = rlist_new;
        ir->rlistlong = rlist_new;
    }
}
static void prepare_verlet_scheme(FILE *fplog,
                                  gmx_hw_info_t *hwinfo,
                                  t_commrec *cr,
                                  gmx_hw_opt_t *hw_opt,
                                  const char *nbpu_opt,
                                  t_inputrec *ir,
                                  const gmx_mtop_t *mtop,
                                  matrix box,
                                  gmx_bool *bUseGPU)
{
    /* Here we only check for GPU usage on the MPI master process,
     * as here we don't know how many GPUs we will use yet.
     * We check for a GPU on all processes later.
     */
    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);

    if (ir->verletbuf_drift > 0)
    {
        /* Update the Verlet buffer size for the current run setup */
        verletbuf_list_setup_t ls;
        real rlist_new;

        /* Here we assume CPU acceleration is on. But as currently
         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
         * and 4x2 gives a larger buffer than 4x4, this is ok.
         */
        verletbuf_get_list_setup(*bUseGPU, &ls);

        calc_verlet_buffer_size(mtop, det(box), ir,
                                ir->verletbuf_drift, &ls,
                                NULL, &rlist_new);
        if (rlist_new != ir->rlist)
        {
            if (fplog != NULL)
            {
                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
                        ir->rlist, rlist_new,
                        ls.cluster_size_i, ls.cluster_size_j);
            }
            ir->rlist     = rlist_new;
            ir->rlistlong = rlist_new;
        }
    }

    /* With GPU or emulation we should check nstlist for performance */
    if ((EI_DYNAMICS(ir->eI) &&
         *bUseGPU &&
         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
        getenv(NSTLIST_ENVVAR) != NULL)
    {
        /* Choose a better nstlist */
        increase_nstlist(fplog, cr, ir, mtop, box);
    }
}
static void convert_to_verlet_scheme(FILE *fplog,
                                     t_inputrec *ir,
                                     gmx_mtop_t *mtop, real box_vol)
{
    char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";

    md_print_warn(NULL, fplog, "%s\n", conv_mesg);

    ir->cutoff_scheme   = ecutsVERLET;
    ir->verletbuf_drift = 0.005;

    if (ir->rcoulomb != ir->rvdw)
    {
        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
    }

    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
    {
        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
    }
    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
    {
        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");

        if (EVDW_SWITCHED(ir->vdwtype))
        {
            ir->vdwtype = evdwCUT;
        }
        if (EEL_SWITCHED(ir->coulombtype))
        {
            if (EEL_FULL(ir->coulombtype))
            {
                /* With full electrostatics only PME can be switched */
                ir->coulombtype = eelPME;
            }
            else
            {
                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
                ir->coulombtype = eelRF;
                ir->epsilon_rf  = 0.0;
            }
        }

        /* We set the target energy drift to a small number.
         * Note that this is only for testing. For production the user
         * should think about this and set the mdp options.
         */
        ir->verletbuf_drift = 1e-4;
    }

    if (inputrec2nboundeddim(ir) != 3)
    {
        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
    }

    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
    {
        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
    }

    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
    {
        verletbuf_list_setup_t ls;

        verletbuf_get_list_setup(FALSE, &ls);
        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls,
                                NULL, &ir->rlist);
    }
    else
    {
        ir->verletbuf_drift = -1;
        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
    }

    gmx_mtop_remove_chargegroups(mtop);
}
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU affinity is set by default.
   But the default assignment does not work (well) when only some ranks
   have threads, which causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
*/
static void set_cpu_affinity(FILE *fplog,
                             const t_commrec *cr,
                             gmx_hw_opt_t *hw_opt,
                             int nthreads_pme,
                             const gmx_hw_info_t *hwinfo,
                             const t_inputrec *inputrec)
{
#if defined GMX_THREAD_MPI
    /* With the number of TMPI threads equal to the number of cores
     * we already pinned in thread-MPI, so don't pin again here.
     */
    if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
    {
        return;
    }
#endif

#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
    if (hw_opt->bThreadPinning)
    {
        int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
        int offset;
        char *env;

        /* threads on this MPI process or TMPI thread */
        if (cr->duty & DUTY_PP)
        {
            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
        }
        else
        {
            nthread_local = gmx_omp_nthreads_get(emntPME);
        }

        /* map the current process to cores */
        thread = 0;
        nthread_node = nthread_local;
#ifdef GMX_MPI
        if (PAR(cr) || MULTISIM(cr))
        {
            /* We need to determine a scan of the thread counts in this
             * compute node.
             */
            MPI_Comm comm_intra;

            MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->nodeid_intra,
                           &comm_intra);
            MPI_Scan(&nthread_local, &thread, 1, MPI_INT, MPI_SUM, comm_intra);
            /* MPI_Scan is inclusive, but here we need exclusive */
            thread -= nthread_local;
            /* Get the total number of threads on this physical node */
            MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
            MPI_Comm_free(&comm_intra);
        }
#endif

        offset = 0;
        if (hw_opt->core_pinning_offset > 0)
        {
            offset = hw_opt->core_pinning_offset;
            if (SIMMASTER(cr))
            {
                fprintf(stderr, "Applying core pinning offset %d\n", offset);
            }
            if (fplog)
            {
                fprintf(fplog, "Applying core pinning offset %d\n", offset);
            }
        }

        /* With Intel Hyper-Threading enabled, we want to pin consecutive
         * threads to physical cores when using more threads than physical
         * cores or when the user requests so.
         */
        nthread_hw_max = hwinfo->nthreads_hw_avail;
        nphyscore = -1;
        if (hw_opt->bPinHyperthreading ||
            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
        {
            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
            {
                /* We print to stderr on all processes, as we might have
                 * different settings on different physical nodes.
                 */
                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
                {
                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
                                  "but non-Intel CPU detected (vendor: %s)\n",
                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
                }
                else
                {
                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
                                  "but the CPU detected does not have Intel Hyper-Threading support "
                                  "(or it is turned off)\n");
                }
            }
            nphyscore = nthread_hw_max/2;

            if (SIMMASTER(cr))
            {
                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
                        nphyscore);
            }
            if (fplog)
            {
                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
                        nphyscore);
            }
        }

        /* set the per-thread affinity */
#pragma omp parallel firstprivate(thread) num_threads(nthread_local)
        {
            cpu_set_t mask;
            int core;

            CPU_ZERO(&mask);
            thread += gmx_omp_get_thread_num();
            if (nphyscore <= 0)
            {
                core = offset + thread;
            }
            else
            {
                /* Lock pairs of threads to the same hyperthreaded core */
                core = offset + thread/2 + (thread % 2)*nphyscore;
            }
            CPU_SET(core, &mask);
            sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
        }
    }
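    /* Mapping example for the Hyper-Threading case above (a sketch): with
     * offset 0 and nphyscore = 8, threads 0,1 map to cores 0,8, threads
     * 2,3 to cores 1,9, and so on, so each consecutive pair of software
     * threads lands on the two hardware threads of one physical core
     * (assuming the common Linux enumeration in which logical CPUs k and
     * k+nphyscore share a physical core).
     */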
#endif /* __linux */
#endif /* GMX_OPENMP */
}
static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
                                    int cutoff_scheme)
{
    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);

#ifndef GMX_THREAD_MPI
    if (hw_opt->nthreads_tot > 0)
    {
        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI, but Gromacs was compiled without thread-MPI");
    }
    if (hw_opt->nthreads_tmpi > 0)
    {
        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI, but Gromacs was compiled without thread-MPI");
    }
#endif

    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
    {
        /* We have the same number of OpenMP threads for PP and PME processes,
         * thus we can perform several consistency checks.
         */
        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
        }

        if (hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp <= 0)
        {
            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
        }
    }

#ifndef GMX_OPENMP
    if (hw_opt->nthreads_omp > 1)
    {
        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
    }
#endif

    if (cutoff_scheme == ecutsGROUP)
    {
        /* We only have OpenMP support for PME only nodes */
        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
                      ecutscheme_names[cutoff_scheme],
                      ecutscheme_names[ecutsVERLET]);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
    {
        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
    }

    if (hw_opt->nthreads_tot == 1)
    {
        hw_opt->nthreads_tmpi = 1;

        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    if (debug)
    {
        fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
                hw_opt->nthreads_tot,
                hw_opt->nthreads_tmpi,
                hw_opt->nthreads_omp,
                hw_opt->nthreads_omp_pme,
                hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
    }
}
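/* Examples of how the checks above play out (a sketch, with no -ntomp_pme
 * given so that the consistency block applies):
 * - "-nt 12 -ntmpi 3 -ntomp 4" passes: 3*4 == 12 and both divide 12.
 * - "-nt 12 -ntmpi 5" is fatal: 12 is not divisible by 5.
 * - "-nt 12 -ntmpi 3" with no -ntomp derives nthreads_omp = 12/3 = 4.
 * The flags -nt/-ntmpi/-ntomp/-ntomp_pme are the mdrun options that fill
 * nthreads_tot/nthreads_tmpi/nthreads_omp/nthreads_omp_pme, as referenced
 * in the fatal-error messages above.
 */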
/* Override the value in inputrec with the value passed on the command line (if any) */
static void override_nsteps_cmdline(FILE *fplog,
                                    int nsteps_cmdline,
                                    t_inputrec *ir,
                                    const t_commrec *cr)
{
    assert(ir);
    assert(cr);

    /* override with anything else than the default -2 */
    if (nsteps_cmdline > -2)
    {
        char stmp[STRLEN];

        ir->nsteps = nsteps_cmdline;
        if (EI_DYNAMICS(ir->eI))
        {
            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
        }
        else
        {
            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
                    nsteps_cmdline);
        }

        md_print_warn(cr, fplog, "%s\n", stmp);
    }
}
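/* Usage sketch (assuming this reaches mdrun as its -nsteps option): running
 *     mdrun -nsteps 500
 * replaces ir->nsteps from the tpr file with 500 and prints the note above;
 * a value of -1 is accepted here (it is > -2) and requests an open-ended
 * run, while the default of -2 leaves the tpr value untouched.
 */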
/* Data structure set by SIMMASTER which needs to be passed to all nodes
 * before the other nodes have read the tpx file and called gmx_detect_hardware.
 */
typedef struct {
    int      cutoff_scheme; /* The cutoff-scheme from t_inputrec  */
    gmx_bool bUseGPU;       /* Use GPU or GPU emulation           */
} master_inf_t;
int mdrunner(gmx_hw_opt_t *hw_opt,
             FILE *fplog, t_commrec *cr, int nfile,
             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
             gmx_bool bCompact, int nstglobalcomm,
             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
             const char *dddlb_opt, real dlb_scale,
             const char *ddcsx, const char *ddcsy, const char *ddcsz,
             const char *nbpu_opt,
             int nsteps_cmdline, int nstepout, int resetstep,
             int nmultisim, int repl_ex_nst, int repl_ex_nex,
             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
             const char *deviceOptions, unsigned long Flags)
{
    gmx_bool        bForceUseGPU, bTryUseGPU;
    double          nodetime = 0, realtime;
    t_inputrec      *inputrec;
    t_state         *state = NULL;
    matrix          box;
    gmx_ddbox_t     ddbox = {0};
    int             npme_major, npme_minor;
    real            tmpr1, tmpr2;
    t_nrnb          *nrnb;
    gmx_mtop_t      *mtop = NULL;
    t_mdatoms       *mdatoms = NULL;
    t_forcerec      *fr = NULL;
    t_fcdata        *fcd = NULL;
    real            ewaldcoeff = 0;
    gmx_pme_t       *pmedata = NULL;
    gmx_vsite_t     *vsite = NULL;
    gmx_constr_t    constr;
    int             i, m, nChargePerturbed = -1, status, nalloc;
    char            *gro;
    gmx_wallcycle_t wcycle;
    gmx_bool        bReadRNG, bReadEkin;
    int             list;
    gmx_runtime_t   runtime;
    int             rc;
    gmx_large_int_t reset_counters;
    gmx_edsam_t     ed = NULL;
    t_commrec       *cr_old = cr;
    int             nthreads_pme = 1;
    int             nthreads_pp  = 1;
    gmx_membed_t    membed = NULL;
    gmx_hw_info_t   *hwinfo = NULL;
    master_inf_t    minf = {-1, FALSE};

    /* CAUTION: threads may be started later on in this function, so
       cr doesn't reflect the final parallel state right now */
    snew(inputrec, 1);
    snew(mtop, 1);

    if (Flags & MD_APPENDFILES)
    {
        fplog = NULL;
    }

    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
    snew(state, 1);
    if (SIMMASTER(cr))
    {
        /* Read (nearly) all data required for the simulation */
        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);

        if (inputrec->cutoff_scheme != ecutsVERLET &&
            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
        {
            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
        }

        /* Detect hardware, gather information. With tMPI only thread 0 does it
         * and after threads are started broadcasts hwinfo around. */
        snew(hwinfo, 1);
        gmx_detect_hardware(fplog, hwinfo, cr,
                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);

        minf.cutoff_scheme = inputrec->cutoff_scheme;
        minf.bUseGPU       = FALSE;

        if (inputrec->cutoff_scheme == ecutsVERLET)
        {
            prepare_verlet_scheme(fplog, hwinfo, cr, hw_opt, nbpu_opt,
                                  inputrec, mtop, state->box,
                                  &minf.bUseGPU);
        }
        else if (hwinfo->bCanUseGPU)
        {
            md_print_warn(cr, fplog,
                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
                          "      (for quick performance testing you can use the -testverlet option)\n");

            if (bForceUseGPU)
            {
                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
            }
        }
    }
#ifndef GMX_THREAD_MPI
    if (PAR(cr))
    {
        gmx_bcast_sim(sizeof(minf), &minf, cr);
    }
#endif
    if (minf.bUseGPU && cr->npmenodes == -1)
    {
        /* Don't automatically use PME-only nodes with GPUs */
        cr->npmenodes = 0;
    }
#ifdef GMX_THREAD_MPI
    /* With thread-MPI inputrec is only set here on the master thread */
    if (SIMMASTER(cr))
#endif
    {
        check_and_update_hw_opt(hw_opt, minf.cutoff_scheme);

#ifdef GMX_THREAD_MPI
        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
        {
            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
        }
#endif

        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
            cr->npmenodes <= 0)
        {
            gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
        }
    }

    /* Check for externally set OpenMP affinity and turn off internal
     * pinning if any is found. We need to do this check early to tell
     * thread-MPI whether it should do pinning when spawning threads.
     */
    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);

#ifdef GMX_THREAD_MPI
    if (SIMMASTER(cr))
    {
        /* NOW the threads will be started: */
        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 hw_opt,
                                                 inputrec, mtop,
                                                 cr, fplog);
        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
        {
            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
        }

        if (hw_opt->nthreads_tmpi > 1)
        {
            /* now start the threads. */
            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
                                        oenv, bVerbose, bCompact, nstglobalcomm,
                                        ddxyz, dd_node_order, rdd, rconstr,
                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                        nbpu_opt,
                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
                                        cpt_period, max_hours, deviceOptions,
                                        Flags);
            /* the main thread continues here with a new cr. We don't deallocate
               the old cr because other threads may still be reading it. */
            if (cr == NULL)
            {
                gmx_comm("Failed to spawn threads");
            }
        }
    }
#endif
    /* END OF CAUTION: cr is now reliable */

    /* g_membed initialisation *
     * Because we change the mtop, init_membed is called before the init_parallel *
     * (in case we ever want to make it run in parallel) */
    if (opt2bSet("-membed", nfile, fnm))
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "Initializing membed");
        }
        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
    }

    if (PAR(cr))
    {
        /* now broadcast everything to the non-master nodes/threads: */
        init_parallel(fplog, cr, inputrec, mtop);

        /* This check needs to happen after get_nthreads_mpi() */
        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
        {
            gmx_fatal_collective(FARGS, cr, NULL,
                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
        }
    }
    if (fplog != NULL)
    {
        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
    }

#if defined GMX_THREAD_MPI
    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
     * to the other threads -- slightly uncool, but works fine, just need to
     * make sure that the data doesn't get freed twice. */
    if (cr->nnodes > 1)
    {
        if (!SIMMASTER(cr))
        {
            snew(hwinfo, 1);
        }
        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
    }
#else
    if (PAR(cr) && !SIMMASTER(cr))
    {
        /* now we have inputrec on all nodes, can run the detection */
        /* TODO: perhaps it's better to propagate within a node instead? */
        snew(hwinfo, 1);
        gmx_detect_hardware(fplog, hwinfo, cr,
                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
    }
#endif

    /* now make sure the state is initialized and propagated */
    set_state_entries(state, inputrec, cr->nnodes);

    /* remove when vv and rerun works correctly! */
    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
    {
        gmx_fatal(FARGS,
                  "Currently can't do velocity verlet with rerun in parallel.");
    }

    /* A parallel command line option consistency check that we can
       only do after any threads have started. */
    if (!PAR(cr) &&
        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
    {
        gmx_fatal(FARGS,
                  "The -dd or -npme option requests a parallel simulation, "
#ifndef GMX_MPI
                  "but %s was compiled without threads or MPI enabled"
#else
#ifdef GMX_THREAD_MPI
                  "but the number of threads (option -nt) is 1"
#else
                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
#endif
#endif
                  , ShortProgram()
            );
    }

    if ((Flags & MD_RERUN) &&
        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
    {
        gmx_fatal(FARGS, "The .mdp file specified an energy minimization or normal mode algorithm, and these are not compatible with mdrun -rerun");
    }

    if (can_use_allvsall(inputrec, mtop, TRUE, cr, fplog) && PAR(cr))
    {
        /* All-vs-all loops do not work with domain decomposition */
        Flags |= MD_PARTDEC;
    }

    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
    {
        if (cr->npmenodes > 0)
        {
            if (!EEL_PME(inputrec->coulombtype))
            {
                gmx_fatal_collective(FARGS, cr, NULL,
                                     "PME nodes are requested, but the system does not use PME electrostatics");
            }
            if (Flags & MD_PARTDEC)
            {
                gmx_fatal_collective(FARGS, cr, NULL,
                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
            }
        }

        cr->npmenodes = 0;
    }

#ifdef GMX_FAHCORE
    fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
#endif
    /* NMR restraints must be initialized before load_checkpoint,
     * since with time averaging the history is added to t_state.
     * For proper consistency check we therefore need to extend
     * t_state here.
     * So the PME-only nodes (if present) will also initialize
     * the distance restraints.
     */
    snew(fcd, 1);

    /* This needs to be called before read_checkpoint to extend the state */
    init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state);

    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
    {
        if (PAR(cr) && !(Flags & MD_PARTDEC))
        {
            gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
        }
        /* Orientation restraints */
        if (MASTER(cr))
        {
            init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
                        state);
        }
    }

    if (DEFORM(*inputrec))
    {
        /* Store the deform reference box before reading the checkpoint */
        if (SIMMASTER(cr))
        {
            copy_mat(state->box, box);
        }
        if (PAR(cr))
        {
            gmx_bcast(sizeof(box), box, cr);
        }
        /* Because we do not have the update struct available yet
         * in which the reference values should be stored,
         * we store them temporarily in static variables.
         * This should be thread safe, since they are only written once
         * and with identical values.
         */
#ifdef GMX_THREAD_MPI
        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
#endif
        deform_init_init_step_tpx = inputrec->init_step;
        copy_mat(box, deform_init_box_tpx);
#ifdef GMX_THREAD_MPI
        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
#endif
    }

    if (opt2bSet("-cpi", nfile, fnm))
    {
        /* Check if checkpoint file exists before doing continuation.
         * This way we can use identical input options for the first and subsequent runs...
         */
        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr))
        {
            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
                            cr, Flags & MD_PARTDEC, ddxyz,
                            inputrec, state, &bReadRNG, &bReadEkin,
                            (Flags & MD_APPENDFILES),
                            (Flags & MD_APPENDFILESSET));

            if (bReadRNG)
            {
                Flags |= MD_READ_RNG;
            }
            if (bReadEkin)
            {
                Flags |= MD_READ_EKIN;
            }
        }
    }

    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
#ifdef GMX_THREAD_MPI
        /* With thread MPI only the master node/thread exists in mdrun.c,
         * therefore non-master nodes need to open the "seppot" log file here.
         */
        || (!MASTER(cr) && (Flags & MD_SEPPOT))
#endif
        )
    {
        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
                     Flags, &fplog);
    }

    /* override nsteps with value from cmdline */
    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);

    if (SIMMASTER(cr))
    {
        copy_mat(state->box, box);
    }

    if (PAR(cr))
    {
        gmx_bcast(sizeof(box), box, cr);
    }
    /* Essential dynamics */
    if (opt2bSet("-ei", nfile, fnm))
    {
        /* Open input and output files, allocate space for ED data structure */
        ed = ed_open(nfile, fnm, Flags, cr);
    }

    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
                     EI_TPI(inputrec->eI) ||
                     inputrec->eI == eiNM))
    {
        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
                                           dddlb_opt, dlb_scale,
                                           ddcsx, ddcsy, ddcsz,
                                           mtop, inputrec,
                                           box, state->x,
                                           &ddbox, &npme_major, &npme_minor);

        make_dd_communicators(fplog, cr, dd_node_order);

        /* Set overallocation to avoid frequent reallocation of arrays */
        set_over_alloc_dd(TRUE);
    }
    else
    {
        /* PME, if used, is done on all nodes with 1D decomposition */
        cr->npmenodes = 0;
        cr->duty = (DUTY_PP | DUTY_PME);
        npme_major = 1;
        npme_minor = 1;
        if (!EI_TPI(inputrec->eI))
        {
            npme_major = cr->nnodes;
        }

        if (inputrec->ePBC == epbcSCREW)
        {
            gmx_fatal(FARGS,
                      "pbc=%s is only implemented with domain decomposition",
                      epbc_names[inputrec->ePBC]);
        }
    }

    if (PAR(cr))
    {
        /* After possible communicator splitting in make_dd_communicators.
         * we can set up the intra/inter node communication.
         */
        gmx_setup_nodecomm(fplog, cr);
    }

    /* Initialize per-node process ID and counters. */
    gmx_init_intra_counters(cr);

#ifdef GMX_MPI
    md_print_info(cr, fplog, "Using %d MPI %s\n",
                  cr->nnodes,
#ifdef GMX_THREAD_MPI
                  cr->nnodes == 1 ? "thread" : "threads"
#else
                  cr->nnodes == 1 ? "process" : "processes"
#endif
        );
#endif

    gmx_omp_nthreads_init(fplog, cr,
                          hwinfo->nthreads_hw_avail,
                          hw_opt->nthreads_omp,
                          hw_opt->nthreads_omp_pme,
                          (cr->duty & DUTY_PP) == 0,
                          inputrec->cutoff_scheme == ecutsVERLET);

    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
    /* getting number of PP/PME threads
       PME: env variable should be read only on one node to make sure it is
       identical everywhere;
     */
    /* TODO nthreads_pp is only used for pinning threads.
     * This is a temporary solution until we have a hw topology library.
     */
    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    nthreads_pme = gmx_omp_nthreads_get(emntPME);

    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);

    if (PAR(cr))
    {
        /* Master synchronizes its value of reset_counters with all nodes
         * including PME only nodes */
        reset_counters = wcycle_get_reset_counters(wcycle);
        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
        wcycle_set_reset_counters(wcycle, reset_counters);
    }

    snew(nrnb, 1);
    if (cr->duty & DUTY_PP)
    {
        /* For domain decomposition we allocate dynamically
         * in dd_partition_system.
         */
        if (DOMAINDECOMP(cr))
        {
            bcast_state_setup(cr, state);
        }
        else
        {
            if (PAR(cr))
            {
                bcast_state(cr, state, TRUE);
            }
        }

        /* Initiate forcerecord */
        fr = mk_forcerec();
        fr->hwinfo = hwinfo;
        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, FALSE,
                      opt2fn("-table", nfile, fnm),
                      opt2fn("-tabletf", nfile, fnm),
                      opt2fn("-tablep", nfile, fnm),
                      opt2fn("-tableb", nfile, fnm),
                      nbpu_opt,
                      FALSE, pforce);

        /* version for PCA_NOT_READ_NODE (see md.c) */
        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
          "nofile","nofile","nofile","nofile",FALSE,pforce);
         */
        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);

        /* Initialize QM-MM */
        if (fr->bQMMM)
        {
            init_QMMMrec(cr, box, mtop, inputrec, fr);
        }

        /* Initialize the mdatoms structure.
         * mdatoms is not filled with atom data,
         * as this can not be done now with domain decomposition.
         */
        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);

        /* Initialize the virtual site communication */
        vsite = init_vsite(mtop, cr, FALSE);

        calc_shifts(box, fr->shift_vec);

        /* With periodic molecules the charge groups should be whole at start up
         * and the virtual sites should not be far from their proper positions.
         */
        if (!inputrec->bContinuation && MASTER(cr) &&
            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
        {
            /* Make molecules whole at start of run */
            if (fr->ePBC != epbcNONE)
            {
                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
            }
            if (vsite)
            {
                /* Correct initial vsite positions are required
                 * for the initial distribution in the domain decomposition
                 * and for the initial shell prediction.
                 */
                construct_vsites_mtop(fplog, vsite, mtop, state->x);
            }
        }

        if (EEL_PME(fr->eeltype))
        {
            ewaldcoeff = fr->ewaldcoeff;
            pmedata    = &fr->pmedata;
        }
        else
        {
            pmedata = NULL;
        }
    }
    else
    {
        /* This is a PME only node */

        /* We don't need the state */
        done_state(state);

        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
        snew(pmedata, 1);
    }

    /* Set the CPU affinity */
    set_cpu_affinity(fplog, cr, hw_opt, nthreads_pme, hwinfo, inputrec);

    /* Initiate PME if necessary,
     * either on all nodes or on dedicated PME nodes only. */
    if (EEL_PME(inputrec->coulombtype))
    {
        if (mdatoms)
        {
            nChargePerturbed = mdatoms->nChargePerturbed;
        }
        if (cr->npmenodes > 0)
        {
            /* The PME only nodes need to know nChargePerturbed */
            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
        }

        if (cr->duty & DUTY_PME)
        {
            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
                                  mtop ? mtop->natoms : 0, nChargePerturbed,
                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
            if (status != 0)
            {
                gmx_fatal(FARGS, "Error %d initializing PME", status);
            }
        }
    }
    if (integrator[inputrec->eI].func == do_md
#ifdef GMX_OPENMM
        ||
        integrator[inputrec->eI].func == do_md_openmm
#endif
        )
    {
        /* Turn on signal handling on all nodes */
        /*
         * (A user signal from the PME nodes (if any)
         * is communicated to the PP nodes.
         */
        signal_handler_install();
    }

    if (cr->duty & DUTY_PP)
    {
        if (inputrec->ePull != epullNO)
        {
            /* Initialize pull code */
            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
        }

        if (inputrec->bRot)
        {
            /* Initialize enforced rotation code */
            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
                     bVerbose, Flags);
        }

        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);

        if (DOMAINDECOMP(cr))
        {
            dd_init_bondeds(fplog, cr->dd, mtop, vsite, constr, inputrec,
                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);

            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, fr, &ddbox);

            setup_dd_grid(fplog, cr->dd);
        }

        /* Now do whatever the user wants us to do (how flexible...) */
        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
                                      oenv, bVerbose, bCompact,
                                      nstglobalcomm,
                                      vsite, constr,
                                      nstepout, inputrec, mtop,
                                      fcd, state,
                                      mdatoms, nrnb, wcycle, ed, fr,
                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
                                      membed,
                                      cpt_period, max_hours,
                                      deviceOptions,
                                      Flags,
                                      &runtime);

        if (inputrec->ePull != epullNO)
        {
            finish_pull(fplog, inputrec->pull);
        }

        if (inputrec->bRot)
        {
            finish_rot(fplog, inputrec->rot);
        }
    }
    else
    {
        /* do PME only */
        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, FALSE, inputrec);
    }
    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
    {
        /* Some timing stats */
        if (SIMMASTER(cr))
        {
            if (runtime.proc == 0)
            {
                runtime.proc = runtime.real;
            }
        }
        else
        {
            runtime.real = 0;
        }
    }

    wallcycle_stop(wcycle, ewcRUN);

    /* Finish up, write some stuff
     * if rerunMD, don't write last frame again
     */
    finish_run(fplog, cr, ftp2fn(efSTO, nfile, fnm),
               inputrec, nrnb, wcycle, &runtime,
               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
               nthreads_pp,
               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));

    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
    {
        char gpu_err_str[STRLEN];

        /* free GPU memory and uninitialize GPU (by destroying the context) */
        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);

        if (!free_gpu(gpu_err_str))
        {
            gmx_warning("On node %d failed to free GPU #%d: %s",
                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
        }
    }

    if (opt2bSet("-membed", nfile, fnm))
    {
        sfree(membed);
    }

#ifdef GMX_THREAD_MPI
    if (PAR(cr) && SIMMASTER(cr))
#endif
    {
        gmx_hardware_info_free(hwinfo);
    }

    /* Does what it says */
    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime);

    /* Close logfile already here if we were appending to it */
    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_close(fplog);
    }

    rc = (int)gmx_get_stop_condition();

#ifdef GMX_THREAD_MPI
    /* we need to join all threads. The sub-threads join when they
       exit this function, but the master thread needs to be told to
       wait for that. */
    if (PAR(cr) && MASTER(cr))
    {
        tMPI_Finalize();
    }
#endif

    return rc;
}