/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed_forces
 */
#include "gmxpre.h"

#include "manage_threading.h"

#include "config.h"

#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>

#include <memory>
#include <string>

#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "listed_internal.h"
#include "utilities.h"
/*! \brief Struct for passing all data required for a function type */
typedef struct {
    const t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int            ftype; /**< the function type index */
    int            nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the numType bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
static void divide_bondeds_by_locality(bonded_threading_t *bt,
                                       int                 numType,
                                       const ilist_data_t *ild)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(numType <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < numType; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;

        /* The start bound for thread 0 is 0 for all interactions */
        ind[f] = 0;

        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }
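
    /* A small worked example for the sum above (hypothetical numbers):
     * for a type with nat = 3 (e.g. angles), each iatoms entry takes
     * nat + 1 = 4 ints (the parameter index plus 3 atom indices), so
     * il->nr = 400 means 100 interactions, contributing 400/4*3 = 300
     * atoms to nat_tot.
     */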
    nat_sum = 0;

    /* Loop over the end bounds of the nthreads threads to determine
     * which interactions threads 0 to nthreads-1 shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*numType.
     * This code runs single-threaded (it is difficult to parallelize
     * over threads), so the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
     * is cheap and this is done only at DD repartitioning, the cost should
     * be negligible. At high thread count many other parts of the code
     * scale the same way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= bt->nthreads; t++)
    {
        int nat_thread;
        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/bt->nthreads;
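
        /* Continuing the hypothetical example above: with nat_tot = 300 and
         * 4 threads, the cumulative targets computed here are 75, 150, 225
         * and 300 atoms for t = 1..4, i.e. a quarter of the atom work each.
         */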
        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < numType; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < numType);
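
            /* Hypothetical illustration of the selection above: if the next
             * unassigned bond (nat = 2) starts at atom 8 while the next
             * unassigned angle (nat = 3) starts at atom 5, the angle type is
             * selected, keeping the atom range each thread touches compact.
             */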
            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above in later iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }

        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < numType; f++)
        {
            bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
        }
    }

    for (f = 0; f < numType; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}
//! Return whether function type \p ftype in \p idef has perturbed interactions
static bool ftypeHasPerturbedEntries(const t_idef &idef,
                                     int           ftype)
{
    GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
               "Perturbed interactions should be sorted here");

    const t_ilist &ilist = idef.il[ftype];

    return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
}
//! Divides bonded interactions over threads and GPU
static void divide_bondeds_over_threads(bonded_threading_t *bt,
                                        bool                useGpuForBondeds,
                                        const t_idef       &idef)
{
    ilist_data_t ild[F_NRE];

    GMX_ASSERT(bt->nthreads > 0, "Must have positive number of threads");
    const int numThreads = bt->nthreads;

    bt->haveBondeds      = false;
    int    numType       = 0;
    size_t fTypeGpuIndex = 0;
    for (int fType = 0; fType < F_NRE; fType++)
    {
        if (!ftype_is_bonded_potential(fType))
        {
            continue;
        }

        const t_ilist &il                     = idef.il[fType];
        int            nrToAssignToCpuThreads = il.nr;

        if (useGpuForBondeds &&
            fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
            gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
        {
            fTypeGpuIndex++;

            /* Perturbation is not implemented in the GPU bonded kernels.
             * But instead of doing all on the CPU, we could do only
             * the actually perturbed interactions on the CPU.
             */
            if (!ftypeHasPerturbedEntries(idef, fType))
            {
                /* We will assign this interaction type to the GPU */
                nrToAssignToCpuThreads = 0;
            }
        }
        if (nrToAssignToCpuThreads > 0)
        {
            bt->haveBondeds = true;
        }

        if (nrToAssignToCpuThreads == 0)
        {
            /* No interactions, avoid all the integer math below */
            for (int t = 0; t <= numThreads; t++)
            {
                bt->workDivision.setBound(fType, t, 0);
            }
        }
        else if (numThreads <= bt->max_nthread_uniform || fType == F_DISRES)
        {
            /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */

            const int stride = 1 + NRAL(fType);

            for (int t = 0; t <= numThreads; t++)
            {
                /* Divide equally over the threads */
                int nr_t = (((nrToAssignToCpuThreads/stride)*t)/numThreads)*stride;
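
                /* Worked example for the formula above (hypothetical
                 * numbers): for bonds, stride = 3, so il.nr = 30 means
                 * 10 bonds; with 4 threads the bounds for t = 0..4 are
                 * 0, 6, 15, 21 and 30, i.e. 2, 3, 2 and 3 whole bonds
                 * per thread.
                 */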
                if (fType == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
                     */
                    while (nr_t > 0 && nr_t < nrToAssignToCpuThreads &&
                           idef.iparams[il.iatoms[nr_t]].disres.label ==
                           idef.iparams[il.iatoms[nr_t - stride]].disres.label)
                    {
                        nr_t += stride;
                    }
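
                    /* For example (hypothetical labels): with restraint
                     * labels 0, 0, 1, 1, 1, 2 and a tentative boundary
                     * inside the run of label-1 pairs, the loop above moves
                     * the boundary forward until all label-1 pairs end up
                     * on one thread.
                     */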
                }

                bt->workDivision.setBound(fType, t, nr_t);
            }
        }
        else
        {
            /* Add this fType to the list to be distributed */
            int nat            = NRAL(fType);
            ild[numType].ftype = fType;
            ild[numType].il    = &il;
            ild[numType].nat   = nat;

            /* The first index for the thread division is always 0 */
            bt->workDivision.setBound(fType, 0, 0);

            numType++;
        }
    }

    if (numType > 0)
    {
        divide_bondeds_by_locality(bt, numType, ild);
    }
    if (debug)
    {
        int f;

        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef.il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < numThreads; t++)
                {
                    fprintf(debug, " %4d",
                            (bt->workDivision.bound(f, t + 1) -
                             bt->workDivision.bound(f, t))/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int                       natoms,
                           f_thread_t               *f_thread,
                           const t_idef             &idef,
                           int                       thread,
                           const bonded_threading_t &bondedThreading)
{
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS,
                  "For the error message below we assume these two are equal.");

    if (bondedThreading.nthreads > BITMASK_SIZE)
    {
#pragma omp master
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS passed to CMake.",
                  bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
    }
    GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
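
    /* Example of the ceiling division above, assuming
     * reduction_block_bits = 5 as set in listed_internal.h (i.e. 32-atom
     * blocks): natoms = 100 gives nblock = (100 + 31) >> 5 = 4.
     */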
    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask, f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        // NOTE: It seems f_thread->f does not need to be aligned
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f, f_thread->block_nalloc*reduction_block_size, 128);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }
    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef.il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;

                int nb0 = bondedThreading.workDivision.bound(ftype, thread);
                int nb1 = bondedThreading.workDivision.bound(ftype, thread + 1);
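
                /* The loops below set, for each atom of each assigned
                 * interaction, this thread's bit in the mask of block
                 * atom >> reduction_block_bits; e.g. (hypothetical, with
                 * 32-atom blocks) an angle over atoms 70, 71, 72 sets the
                 * bit in mask[70 >> 5] = mask[2].
                 */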
                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef.il[ftype].iatoms[i + a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }
    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
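
/* A minimal usage sketch of this module (hypothetical caller; the
 * authoritative declarations are in manage_threading.h):
 *
 *   bonded_threading_t *bt = init_bonded_threading(fplog, numEnergyGroups);
 *   setup_bonded_threading(bt, numAtoms, useGpuForBondeds, idef);
 *   // ... compute listed forces and reduce the per-thread buffers ...
 *   tear_down_bonded_threading(bt);
 */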
void setup_bonded_threading(bonded_threading_t *bt,
                            int                 numAtoms,
                            bool                useGpuForBondeds,
                            const t_idef       &idef)
{
    assert(bt->nthreads >= 1);

    /* Divide the bonded interactions over the threads */
    divide_bondeds_over_threads(bt, useGpuForBondeds, idef);

    if (!bt->haveBondeds)
    {
        /* We don't have bondeds, so there is nothing to reduce */
        return;
    }

    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            calc_bonded_reduction_mask(numAtoms, bt->f_t[t].get(),
                                       idef, t, *bt);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;

    /* Ensure we have sufficient space for all blocks */
    if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
    {
        bt->block_index.resize(nblock_tot);
    }
    if (static_cast<size_t>(nblock_tot) > bt->mask.size())
    {
        bt->mask.resize(nblock_tot);
    }
    bt->nblock_used = 0;
    int ctot        = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 0; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t]->mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }
        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
                fprintf(debug, "block %d flags %s count %d\n",
                        b, to_hex_string(*mask).c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/static_cast<double>(numAtoms),
                ctot/static_cast<double>(bt->nblock_used));
    }
}
void tear_down_bonded_threading(bonded_threading_t *bt)
{
    delete bt;
}
f_thread_t::f_thread_t(int numEnergyGroups) :
    grpp(numEnergyGroups)
{
    snew(fshift, SHIFTS);
}

f_thread_t::~f_thread_t()
{
    sfree(mask);
    sfree(fshift);
    sfree(block_index);
    sfree_aligned(f);
}
bonded_threading_t::bonded_threading_t(const int numThreads,
                                       const int numEnergyGroups) :
    nthreads(numThreads),
    nblock_used(0),
    haveBondeds(false),
    workDivision(nthreads),
    foreignLambdaWorkDivision(1)
{
    f_t.resize(numThreads);
#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int t = 0; t < nthreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            f_t[t] = std::make_unique<f_thread_t>(numEnergyGroups);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
bonded_threading_t *init_bonded_threading(FILE      *fplog,
                                          const int  nenergrp)
{
    /* These thread-local data structures are used for bondeds only.
     *
     * Note that we also use these structures when running single-threaded.
     * This is because the bonded force buffer uses type rvec4, whereas
     * the normal force buffer uses type rvec. This leads to a little
     * reduction overhead, but the speed gain in the bonded calculations
     * of doing transposeScatterIncr/DecrU with alignment 4 instead of 3
     * is much larger than the reduction overhead.
     */
    bonded_threading_t *bt = new bonded_threading_t(gmx_omp_nthreads_get(emntBonded),
                                                    nenergrp);
    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
    const int max_nthread_uniform = 4;
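
    /* This default can be overridden at run time through the environment
     * variable read below, e.g. (hypothetical value):
     *   GMX_BONDED_NTHREAD_UNIFORM=8 gmx mdrun ...
     */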
    char *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
    {
        sscanf(ptr, "%d", &bt->max_nthread_uniform);
        if (fplog != nullptr)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->max_nthread_uniform);
        }
    }
    else
    {
        bt->max_nthread_uniform = max_nthread_uniform;
    }

    return bt;
}