2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_nbnxm
46 #include "pairlist_tuning.h"
55 #include "gromacs/domdec/domdec.h"
56 #include "gromacs/hardware/cpuinfo.h"
57 #include "gromacs/math/vec.h"
58 #include "gromacs/mdlib/calc_verletbuf.h"
59 #include "gromacs/mdtypes/commrec.h"
60 #include "gromacs/mdtypes/inputrec.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/state.h"
63 #include "gromacs/pbcutil/pbc.h"
64 #include "gromacs/topology/topology.h"
65 #include "gromacs/utility/cstringutil.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/gmxassert.h"
68 #include "gromacs/utility/logger.h"
69 #include "gromacs/utility/strconvert.h"
70 #include "gromacs/utility/stringutil.h"
72 #include "nbnxm_geometry.h"
73 #include "pairlistsets.h"
75 /*! \brief Returns if we can (heuristically) change nstlist and rlist
77 * \param [in] ir The input parameter record
79 static bool supportsDynamicPairlistGenerationInterval(const t_inputrec
& ir
)
81 return ir
.cutoff_scheme
== ecutsVERLET
&& EI_DYNAMICS(ir
.eI
)
82 && !(EI_MD(ir
.eI
) && ir
.etc
== etcNO
) && ir
.verletbuf_tol
> 0;
85 /*! \brief Cost of non-bonded kernels
87 * We determine the extra cost of the non-bonded kernels compared to
88 * a reference nstlist value of 10 (which is the default in grompp).
90 static const int nbnxnReferenceNstlist
= 10;
91 //! The values to try when switching
92 const int nstlist_try
[] = { 20, 25, 40, 50, 80, 100 };
93 //! Number of elements in the neighborsearch list trials.
94 #define NNSTL (sizeof(nstlist_try) / sizeof(nstlist_try[0]))
95 /* Increase nstlist until the size of the pair-list increased by
96 * \p c_nbnxnListSizeFactor??? or more, but never more than
97 * \p c_nbnxnListSizeFactor??? + \p c_nbnxnListSizeFactorMargin.
98 * Since we have dynamic pair list pruning, the force kernel cost depends
99 * only very weakly on nstlist. It depends strongly on nstlistPrune.
100 * Increasing nstlist mainly affects the cost of the pair search (down due
101 * to lower frequency, up due to larger list) and the list pruning kernel.
102 * We increase nstlist conservatively with regard to kernel performance.
103 * In serial the search cost is not high and thus we don't gain much by
104 * increasing nstlist a lot. In parallel the MPI and CPU-GPU communication
105 * volume as well as the communication buffer preparation and reduction time
106 * increase quickly with rlist and thus nslist. Therefore we should avoid
107 * large nstlist, even if that also reduces the domain decomposition cost.
108 * With GPUs we perform the dynamic pruning in a rolling fashion and this
109 * overlaps with the update on the CPU, which allows even larger nstlist.
111 // CPU: pair-search is a factor ~1.5 slower than the non-bonded kernel.
112 //! Target pair-list size increase ratio for CPU
113 static const float c_nbnxnListSizeFactorCpu
= 1.25;
114 // Intel KNL: pair-search is a factor ~2-3 slower than the non-bonded kernel.
115 //! Target pair-list size increase ratio for Intel KNL
116 static const float c_nbnxnListSizeFactorIntelXeonPhi
= 1.4;
117 // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel.
118 //! Target pair-list size increase ratio for GPU
119 static const float c_nbnxnListSizeFactorGPU
= 1.4;
120 //! Never increase the size of the pair-list more than the factor above plus this margin
121 static const float c_nbnxnListSizeFactorMargin
= 0.1;
123 void increaseNstlist(FILE* fp
,
127 const gmx_mtop_t
* mtop
,
129 bool useOrEmulateGpuForNonbondeds
,
130 const gmx::CpuInfo
& cpuinfo
)
132 if (!EI_DYNAMICS(ir
->eI
))
134 /* Can only increase nstlist with dynamics */
138 float listfac_ok
, listfac_max
;
139 int nstlist_orig
, nstlist_prev
;
140 real rlist_inc
, rlist_ok
, rlist_max
;
141 real rlist_new
, rlist_prev
;
142 size_t nstlist_ind
= 0;
143 gmx_bool bBox
, bDD
, bCont
;
144 const char* nstl_gpu
=
145 "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe "
146 "optimum depends on your CPU and GPU resources.\nYou might want to try several "
148 const char* nve_err
= "Can not increase nstlist because an NVE ensemble is used";
149 const char* vbd_err
=
150 "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
151 const char* box_err
= "Can not increase nstlist because the box is too small";
152 const char* dd_err
= "Can not increase nstlist because of domain decomposition limitations";
155 if (nstlist_cmdline
<= 0)
157 if (ir
->nstlist
== 1)
159 /* The user probably set nstlist=1 for a reason,
160 * don't mess with the settings.
165 /* With a GPU and fixed nstlist suggest tuning nstlist */
166 if (fp
!= nullptr && useOrEmulateGpuForNonbondeds
&& ir
->nstlist
< nstlist_try
[0]
167 && !supportsDynamicPairlistGenerationInterval(*ir
))
169 fprintf(fp
, nstl_gpu
, ir
->nstlist
);
173 while (nstlist_ind
< NNSTL
&& ir
->nstlist
>= nstlist_try
[nstlist_ind
])
177 if (nstlist_ind
== NNSTL
)
179 /* There are no larger nstlist value to try */
184 if (EI_MD(ir
->eI
) && ir
->etc
== etcNO
)
188 fprintf(stderr
, "%s\n", nve_err
);
192 fprintf(fp
, "%s\n", nve_err
);
198 if (ir
->verletbuf_tol
== 0 && useOrEmulateGpuForNonbondeds
)
201 "You are using an old tpr file with a GPU, please generate a new tpr file with "
202 "an up to date version of grompp");
205 if (ir
->verletbuf_tol
< 0)
209 fprintf(stderr
, "%s\n", vbd_err
);
213 fprintf(fp
, "%s\n", vbd_err
);
219 GMX_RELEASE_ASSERT(supportsDynamicPairlistGenerationInterval(*ir
),
220 "In all cases that do not support dynamic nstlist, we should have returned "
221 "with an appropriate message above");
223 if (useOrEmulateGpuForNonbondeds
)
225 listfac_ok
= c_nbnxnListSizeFactorGPU
;
227 else if (cpuinfo
.brandString().find("Xeon Phi") != std::string::npos
)
229 listfac_ok
= c_nbnxnListSizeFactorIntelXeonPhi
;
233 listfac_ok
= c_nbnxnListSizeFactorCpu
;
235 listfac_max
= listfac_ok
+ c_nbnxnListSizeFactorMargin
;
237 nstlist_orig
= ir
->nstlist
;
238 if (nstlist_cmdline
> 0)
242 sprintf(buf
, "Getting nstlist=%d from command line option", nstlist_cmdline
);
244 ir
->nstlist
= nstlist_cmdline
;
247 ListSetupType listType
=
248 (useOrEmulateGpuForNonbondeds
? ListSetupType::Gpu
: ListSetupType::CpuSimdWhenSupported
);
249 VerletbufListSetup listSetup
= verletbufGetSafeListSetup(listType
);
251 /* Allow rlist to make the list a given factor larger than the list
252 * would be with the reference value for nstlist (10).
254 nstlist_prev
= ir
->nstlist
;
255 ir
->nstlist
= nbnxnReferenceNstlist
;
256 const real rlistWithReferenceNstlist
=
257 calcVerletBufferSize(*mtop
, det(box
), *ir
, ir
->nstlist
, ir
->nstlist
- 1, -1, listSetup
);
258 ir
->nstlist
= nstlist_prev
;
260 /* Determine the pair list size increase due to zero interactions */
261 rlist_inc
= nbnxn_get_rlist_effective_inc(listSetup
.cluster_size_j
, mtop
->natoms
/ det(box
));
262 rlist_ok
= (rlistWithReferenceNstlist
+ rlist_inc
) * std::cbrt(listfac_ok
) - rlist_inc
;
263 rlist_max
= (rlistWithReferenceNstlist
+ rlist_inc
) * std::cbrt(listfac_max
) - rlist_inc
;
266 fprintf(debug
, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n", rlist_inc
,
267 rlist_ok
, rlist_max
);
270 nstlist_prev
= nstlist_orig
;
271 rlist_prev
= ir
->rlist
;
274 if (nstlist_cmdline
<= 0)
276 ir
->nstlist
= nstlist_try
[nstlist_ind
];
279 /* Set the pair-list buffer size in ir */
280 rlist_new
= calcVerletBufferSize(*mtop
, det(box
), *ir
, ir
->nstlist
, ir
->nstlist
- 1, -1, listSetup
);
282 /* Does rlist fit in the box? */
283 bBox
= (gmx::square(rlist_new
) < max_cutoff2(ir
->pbcType
, box
));
285 if (bBox
&& DOMAINDECOMP(cr
))
287 /* Check if rlist fits in the domain decomposition */
288 if (inputrec2nboundeddim(ir
) < DIM
)
291 "Changing nstlist with domain decomposition and unbounded dimensions is "
292 "not implemented yet");
294 bDD
= change_dd_cutoff(cr
, box
, gmx::ArrayRef
<const gmx::RVec
>(), rlist_new
);
299 fprintf(debug
, "nstlist %d rlist %.3f bBox %s bDD %s\n", ir
->nstlist
, rlist_new
,
300 gmx::boolToString(bBox
), gmx::boolToString(bDD
));
305 if (nstlist_cmdline
<= 0)
307 if (bBox
&& bDD
&& rlist_new
<= rlist_max
)
309 /* Increase nstlist */
310 nstlist_prev
= ir
->nstlist
;
311 rlist_prev
= rlist_new
;
312 bCont
= (nstlist_ind
+ 1 < NNSTL
&& rlist_new
< rlist_ok
);
316 /* Stick with the previous nstlist */
317 ir
->nstlist
= nstlist_prev
;
318 rlist_new
= rlist_prev
;
329 gmx_warning("%s", !bBox
? box_err
: dd_err
);
332 fprintf(fp
, "\n%s\n", !bBox
? box_err
: dd_err
);
334 ir
->nstlist
= nstlist_orig
;
336 else if (ir
->nstlist
!= nstlist_orig
|| rlist_new
!= ir
->rlist
)
338 sprintf(buf
, "Changing nstlist from %d to %d, rlist from %g to %g", nstlist_orig
,
339 ir
->nstlist
, ir
->rlist
, rlist_new
);
342 fprintf(stderr
, "%s\n\n", buf
);
346 fprintf(fp
, "%s\n\n", buf
);
348 ir
->rlist
= rlist_new
;
352 /*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
354 * Ideally we should auto-tune this value.
355 * Not considering overheads, 1 would be the ideal value. But 2 seems
356 * a reasonable compromise that reduces GPU kernel launch overheads and
357 * also avoids inefficiency on large GPUs when pruning small lists.
358 * Because with domain decomposition we alternate local/non-local pruning
359 * at even/odd steps, which gives a period of 2, this value currenly needs
360 * to be 2, which is indirectly asserted when the GPU pruning is dispatched
361 * during the force evaluation.
363 static const int c_nbnxnGpuRollingListPruningInterval
= 2;
365 /*! \brief The minimum nstlist for dynamic pair list pruning.
367 * In most cases going lower than 4 will lead to a too high pruning cost.
368 * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
370 static const int c_nbnxnDynamicListPruningMinLifetime
= 4;
372 /*! \brief Set the dynamic pairlist pruning parameters in \p ic
374 * \param[in] ir The input parameter record
375 * \param[in] mtop The global topology
376 * \param[in] box The unit cell
377 * \param[in] useGpuList Tells if we are using a GPU type pairlist
378 * \param[in] listSetup The nbnxn pair list setup
379 * \param[in] userSetNstlistPrune The user set ic->nstlistPrune (using an env.var.)
380 * \param[in] ic The nonbonded interactions constants
381 * \param[in,out] listParams The list setup parameters
383 static void setDynamicPairlistPruningParameters(const t_inputrec
* ir
,
384 const gmx_mtop_t
* mtop
,
386 const bool useGpuList
,
387 const VerletbufListSetup
& listSetup
,
388 const bool userSetNstlistPrune
,
389 const interaction_const_t
* ic
,
390 PairlistParams
* listParams
)
392 listParams
->lifetime
= ir
->nstlist
- 1;
394 /* When nstlistPrune was set by the user, we need to execute one loop
395 * iteration to determine rlistInner.
396 * Otherwise we compute rlistInner and increase nstlist as long as
397 * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
399 const real interactionCutoff
= std::max(ic
->rcoulomb
, ic
->rvdw
);
400 int tunedNstlistPrune
= listParams
->nstlistPrune
;
403 /* Dynamic pruning on the GPU is performed on the list for
404 * the next step on the coordinates of the current step,
405 * so the list lifetime is nstlistPrune (not the usual nstlist-1).
407 int listLifetime
= tunedNstlistPrune
- (useGpuList
? 0 : 1);
408 listParams
->nstlistPrune
= tunedNstlistPrune
;
409 listParams
->rlistInner
= calcVerletBufferSize(*mtop
, det(box
), *ir
, tunedNstlistPrune
,
410 listLifetime
, -1, listSetup
);
412 /* On the GPU we apply the dynamic pruning in a rolling fashion
413 * every c_nbnxnGpuRollingListPruningInterval steps,
414 * so keep nstlistPrune a multiple of the interval.
416 tunedNstlistPrune
+= useGpuList
? c_nbnxnGpuRollingListPruningInterval
: 1;
417 } while (!userSetNstlistPrune
&& tunedNstlistPrune
< ir
->nstlist
418 && listParams
->rlistInner
== interactionCutoff
);
420 if (userSetNstlistPrune
)
422 listParams
->useDynamicPruning
= true;
426 /* Determine the pair list size increase due to zero interactions */
427 real rlistInc
= nbnxn_get_rlist_effective_inc(listSetup
.cluster_size_j
, mtop
->natoms
/ det(box
));
429 /* Dynamic pruning is only useful when the inner list is smaller than
430 * the outer. The factor 0.99 ensures at least 3% list size reduction.
432 * With dynamic pruning on the CPU we prune after updating,
433 * so nstlistPrune=nstlist-1 would add useless extra work.
434 * With the GPU there will probably be more overhead than gain
435 * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
436 * Note that in such cases the first sub-condition is likely also false.
438 listParams
->useDynamicPruning
=
439 (listParams
->rlistInner
+ rlistInc
< 0.99 * (listParams
->rlistOuter
+ rlistInc
)
440 && listParams
->nstlistPrune
< listParams
->lifetime
);
443 if (!listParams
->useDynamicPruning
)
445 /* These parameters should not be used, but set them to useful values */
446 listParams
->nstlistPrune
= -1;
447 listParams
->rlistInner
= listParams
->rlistOuter
;
451 /*! \brief Returns a string describing the setup of a single pair-list
453 * \param[in] listName Short name of the list, can be ""
454 * \param[in] nstList The list update interval in steps
455 * \param[in] nstListForSpacing Update interval for setting the number characters for printing \p nstList
456 * \param[in] rList List cut-off radius
457 * \param[in] interactionCutoff The interaction cut-off, use for printing the list buffer size
459 static std::string
formatListSetup(const std::string
& listName
,
461 int nstListForSpacing
,
463 real interactionCutoff
)
465 std::string listSetup
= " ";
466 if (!listName
.empty())
468 listSetup
+= listName
+ " list: ";
470 listSetup
+= "updated every ";
471 // Make the shortest int format string that fits nstListForSpacing
472 std::string nstListFormat
=
473 "%" + gmx::formatString("%zu", gmx::formatString("%d", nstListForSpacing
).size()) + "d";
474 listSetup
+= gmx::formatString(nstListFormat
.c_str(), nstList
);
475 listSetup
+= gmx::formatString(" steps, buffer %.3f nm, rlist %.3f nm\n",
476 rList
- interactionCutoff
, rList
);
481 void setupDynamicPairlistPruning(const gmx::MDLogger
& mdlog
,
482 const t_inputrec
* ir
,
483 const gmx_mtop_t
* mtop
,
485 const interaction_const_t
* ic
,
486 PairlistParams
* listParams
)
488 GMX_RELEASE_ASSERT(listParams
->rlistOuter
> 0, "With the nbnxn setup rlist should be > 0");
490 /* Initialize the parameters to no dynamic list pruning */
491 listParams
->useDynamicPruning
= false;
493 const VerletbufListSetup ls
= { IClusterSizePerListType
[listParams
->pairlistType
],
494 JClusterSizePerListType
[listParams
->pairlistType
] };
496 /* Currently emulation mode does not support dual pair-lists */
497 const bool useGpuList
= (listParams
->pairlistType
== PairlistType::HierarchicalNxN
);
499 if (supportsDynamicPairlistGenerationInterval(*ir
) && getenv("GMX_DISABLE_DYNAMICPRUNING") == nullptr)
501 /* Note that nstlistPrune can have any value independently of nstlist.
502 * Actually applying rolling pruning is only useful when
503 * nstlistPrune < nstlist -1
505 char* env
= getenv("GMX_NSTLIST_DYNAMICPRUNING");
506 bool userSetNstlistPrune
= (env
!= nullptr);
508 if (userSetNstlistPrune
)
511 listParams
->nstlistPrune
= strtol(env
, &end
, 10);
512 if (!end
|| (*end
!= 0)
513 || !(listParams
->nstlistPrune
> 0 && listParams
->nstlistPrune
< ir
->nstlist
))
516 "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 "
523 static_assert(c_nbnxnDynamicListPruningMinLifetime
% c_nbnxnGpuRollingListPruningInterval
== 0,
524 "c_nbnxnDynamicListPruningMinLifetime sets the starting value for "
525 "nstlistPrune, which should be divisible by the rolling pruning interval "
526 "for efficiency reasons.");
528 // TODO: Use auto-tuning to determine nstlistPrune
529 listParams
->nstlistPrune
= c_nbnxnDynamicListPruningMinLifetime
;
532 setDynamicPairlistPruningParameters(ir
, mtop
, box
, useGpuList
, ls
, userSetNstlistPrune
, ic
,
535 if (listParams
->useDynamicPruning
&& useGpuList
)
537 /* Note that we can round down here. This makes the effective
538 * rolling pruning interval slightly shorter than nstlistTune,
539 * thus giving correct results, but a slightly lower efficiency.
541 GMX_RELEASE_ASSERT(listParams
->nstlistPrune
>= c_nbnxnGpuRollingListPruningInterval
, ("With dynamic list pruning on GPUs pruning frequency must be at least as large as the rolling pruning interval ("
542 + std::to_string(c_nbnxnGpuRollingListPruningInterval
)
545 listParams
->numRollingPruningParts
=
546 listParams
->nstlistPrune
/ c_nbnxnGpuRollingListPruningInterval
;
550 listParams
->numRollingPruningParts
= 1;
556 const real interactionCutoff
= std::max(ic
->rcoulomb
, ic
->rvdw
);
557 if (listParams
->useDynamicPruning
)
559 mesg
+= gmx::formatString(
560 "Using a dual %dx%d pair-list setup updated with dynamic%s pruning:\n", ls
.cluster_size_i
,
561 ls
.cluster_size_j
, listParams
->numRollingPruningParts
> 1 ? ", rolling" : "");
562 mesg
+= formatListSetup("outer", ir
->nstlist
, ir
->nstlist
, listParams
->rlistOuter
, interactionCutoff
);
563 mesg
+= formatListSetup("inner", listParams
->nstlistPrune
, ir
->nstlist
,
564 listParams
->rlistInner
, interactionCutoff
);
568 mesg
+= gmx::formatString("Using a %dx%d pair-list setup:\n", ls
.cluster_size_i
, ls
.cluster_size_j
);
569 mesg
+= formatListSetup("", ir
->nstlist
, ir
->nstlist
, listParams
->rlistOuter
, interactionCutoff
);
571 if (supportsDynamicPairlistGenerationInterval(*ir
))
573 const VerletbufListSetup listSetup1x1
= { 1, 1 };
574 const real rlistOuter
= calcVerletBufferSize(*mtop
, det(box
), *ir
, ir
->nstlist
,
575 ir
->nstlist
- 1, -1, listSetup1x1
);
576 real rlistInner
= rlistOuter
;
577 if (listParams
->useDynamicPruning
)
579 int listLifeTime
= listParams
->nstlistPrune
- (useGpuList
? 0 : 1);
580 rlistInner
= calcVerletBufferSize(*mtop
, det(box
), *ir
, listParams
->nstlistPrune
,
581 listLifeTime
, -1, listSetup1x1
);
584 mesg
+= gmx::formatString(
585 "At tolerance %g kJ/mol/ps per atom, equivalent classical 1x1 list would be:\n",
587 if (listParams
->useDynamicPruning
)
589 mesg
+= formatListSetup("outer", ir
->nstlist
, ir
->nstlist
, rlistOuter
, interactionCutoff
);
590 mesg
+= formatListSetup("inner", listParams
->nstlistPrune
, ir
->nstlist
, rlistInner
,
595 mesg
+= formatListSetup("", ir
->nstlist
, ir
->nstlist
, rlistOuter
, interactionCutoff
);
599 GMX_LOG(mdlog
.info
).asParagraph().appendText(mesg
);