/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed_forces
 */
#include "gmxpre.h"

#include "manage_threading.h"

#include "config.h"

#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>

#include <memory>
#include <string>

#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "listed_internal.h"
#include "utilities.h"
/*! \brief Struct for passing all data required for a function type */
typedef struct {
    const t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int            ftype; /**< the function type index */
    int            nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the numType bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
static void divide_bondeds_by_locality(bonded_threading_t *bt,
                                       int                 numType,
                                       const ilist_data_t *ild)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(numType <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < numType; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;

        /* The start bound for thread 0 is 0 for all interactions */
        ind[f] = 0;

        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }
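
    /* A small worked example for the sum above (hypothetical numbers):
     * for a type with nat = 3 (e.g. angles), each iatoms entry takes
     * nat + 1 = 4 ints (the parameter index plus 3 atom indices), so
     * il->nr = 400 means 100 interactions, contributing 400/4*3 = 300
     * atoms to nat_tot.
     */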
    nat_sum = 0;

    /* Loop over the end bounds of the nthreads threads to determine
     * which interactions threads 0 to nthreads-1 shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*numType.
     * This code runs single-threaded (it is difficult to parallelize
     * over threads), so the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
     * is cheap and this is done only at DD repartitioning, the cost should
     * be negligible. At high thread count many other parts of the code
     * scale the same way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= bt->nthreads; t++)
    {
        int nat_thread;
        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/bt->nthreads;
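
        /* Continuing the hypothetical example above: with nat_tot = 300 and
         * 4 threads, the cumulative targets computed here are 75, 150, 225
         * and 300 atoms for t = 1..4, i.e. a quarter of the atom work each.
         */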
        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < numType; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < numType);
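
            /* Hypothetical illustration of the selection above: if the next
             * unassigned bond (nat = 2) starts at atom 8 while the next
             * unassigned angle (nat = 3) starts at atom 5, the angle type is
             * selected, keeping the atom range each thread touches compact.
             */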
            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above in later iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }

        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < numType; f++)
        {
            bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
        }
    }

    for (f = 0; f < numType; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}
//! Return whether function type \p ftype in \p idef has perturbed interactions
static bool ftypeHasPerturbedEntries(const t_idef &idef,
                                     int           ftype)
{
    GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
               "Perturbed interactions should be sorted here");

    const t_ilist &ilist = idef.il[ftype];

    return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
}
//! Divides bonded interactions over threads and GPU
static void divide_bondeds_over_threads(bonded_threading_t *bt,
                                        bool                useGpuForBondeds,
                                        const t_idef       &idef)
{
    ilist_data_t ild[F_NRE];

    GMX_ASSERT(bt->nthreads > 0, "Must have positive number of threads");
    const int numThreads = bt->nthreads;

    bt->haveBondeds      = false;
    int    numType       = 0;
    size_t fTypeGpuIndex = 0;
    for (int fType = 0; fType < F_NRE; fType++)
    {
        if (!ftype_is_bonded_potential(fType))
        {
            continue;
        }

        const t_ilist &il                     = idef.il[fType];
        int            nrToAssignToCpuThreads = il.nr;

        if (useGpuForBondeds &&
            fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
            gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
        {
            fTypeGpuIndex++;

            /* Perturbation is not implemented in the GPU bonded kernels.
             * But instead of doing all on the CPU, we could do only
             * the actually perturbed interactions on the CPU.
             */
            if (!ftypeHasPerturbedEntries(idef, fType))
            {
                /* We will assign this interaction type to the GPU */
                nrToAssignToCpuThreads = 0;
            }
        }
        if (nrToAssignToCpuThreads > 0)
        {
            bt->haveBondeds = true;
        }

        if (nrToAssignToCpuThreads == 0)
        {
            /* No interactions, avoid all the integer math below */
            for (int t = 0; t <= numThreads; t++)
            {
                bt->workDivision.setBound(fType, t, 0);
            }
        }
        else if (numThreads <= bt->max_nthread_uniform || fType == F_DISRES)
        {
            /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */

            const int stride = 1 + NRAL(fType);

            for (int t = 0; t <= numThreads; t++)
            {
                /* Divide equally over the threads */
                int nr_t = (((nrToAssignToCpuThreads/stride)*t)/numThreads)*stride;
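
                /* Worked example for the formula above (hypothetical
                 * numbers): for bonds, stride = 3, so il.nr = 30 means
                 * 10 bonds; with 4 threads the bounds for t = 0..4 are
                 * 0, 6, 15, 21 and 30, i.e. 2, 3, 2 and 3 whole bonds
                 * per thread.
                 */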
                if (fType == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
                     */
                    while (nr_t > 0 && nr_t < nrToAssignToCpuThreads &&
                           idef.iparams[il.iatoms[nr_t]].disres.label ==
                           idef.iparams[il.iatoms[nr_t - stride]].disres.label)
                    {
                        nr_t += stride;
                    }
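
                    /* For example (hypothetical labels): with restraint
                     * labels 0, 0, 1, 1, 1, 2 and a tentative boundary
                     * inside the run of label-1 pairs, the loop above moves
                     * the boundary forward until all label-1 pairs end up
                     * on one thread.
                     */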
                }

                bt->workDivision.setBound(fType, t, nr_t);
            }
        }
        else
        {
            /* Add this fType to the list to be distributed */
            int nat            = NRAL(fType);
            ild[numType].ftype = fType;
            ild[numType].il    = &il;
            ild[numType].nat   = nat;

            /* The first index for the thread division is always 0 */
            bt->workDivision.setBound(fType, 0, 0);

            numType++;
        }
    }

    if (numType > 0)
    {
        divide_bondeds_by_locality(bt, numType, ild);
    }
    if (debug)
    {
        int f;

        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef.il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < numThreads; t++)
                {
                    fprintf(debug, " %4d",
                            (bt->workDivision.bound(f, t + 1) -
                             bt->workDivision.bound(f, t))/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int                       natoms,
                           f_thread_t               *f_thread,
                           const t_idef             &idef,
                           int                       thread,
                           const bonded_threading_t &bondedThreading)
{
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS,
                  "For the error message below we assume these two are equal.");

    if (bondedThreading.nthreads > BITMASK_SIZE)
    {
#pragma omp master
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS passed to CMake.",
                  bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
    }
    GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
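
    /* Example of the ceiling division above, assuming
     * reduction_block_bits = 5 as set in listed_internal.h (i.e. 32-atom
     * blocks): natoms = 100 gives nblock = (100 + 31) >> 5 = 4.
     */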
    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask, f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        // NOTE: It seems f_thread->f does not need to be aligned
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f, f_thread->block_nalloc*reduction_block_size, 128);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }
    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef.il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;

                int nb0 = bondedThreading.workDivision.bound(ftype, thread);
                int nb1 = bondedThreading.workDivision.bound(ftype, thread + 1);
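
                /* The loops below set, for each atom of each assigned
                 * interaction, this thread's bit in the mask of block
                 * atom >> reduction_block_bits; e.g. (hypothetical, with
                 * 32-atom blocks) an angle over atoms 70, 71, 72 sets the
                 * bit in mask[70 >> 5] = mask[2].
                 */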
                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef.il[ftype].iatoms[i + a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }
    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
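
/* A minimal usage sketch of this module (hypothetical caller; the
 * authoritative declarations are in manage_threading.h):
 *
 *   bonded_threading_t *bt = init_bonded_threading(fplog, numEnergyGroups);
 *   setup_bonded_threading(bt, numAtoms, useGpuForBondeds, idef);
 *   // ... compute listed forces and reduce the per-thread buffers ...
 *   tear_down_bonded_threading(bt);
 */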
void setup_bonded_threading(bonded_threading_t *bt,
                            int                 numAtoms,
                            bool                useGpuForBondeds,
                            const t_idef       &idef)
{
    assert(bt->nthreads >= 1);

    /* Divide the bonded interactions over the threads */
    divide_bondeds_over_threads(bt, useGpuForBondeds, idef);

    if (!bt->haveBondeds)
    {
        /* We don't have bondeds, so there is nothing to reduce */
        return;
    }

    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            calc_bonded_reduction_mask(numAtoms, bt->f_t[t].get(),
                                       idef, t, *bt);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;

    /* Ensure we have sufficient space for all blocks */
    if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
    {
        bt->block_index.resize(nblock_tot);
    }
    if (static_cast<size_t>(nblock_tot) > bt->mask.size())
    {
        bt->mask.resize(nblock_tot);
    }
    bt->nblock_used = 0;
    int ctot        = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 0; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t]->mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }
        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
                fprintf(debug, "block %d flags %s count %d\n",
                        b, to_hex_string(*mask).c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/static_cast<double>(numAtoms),
                ctot/static_cast<double>(bt->nblock_used));
    }
}
void tear_down_bonded_threading(bonded_threading_t *bt)
{
    delete bt;
}
f_thread_t::f_thread_t(int numEnergyGroups) :
    grpp(numEnergyGroups)
{
    snew(fshift, SHIFTS);
}

f_thread_t::~f_thread_t()
{
    sfree(mask);
    sfree(fshift);
    sfree(block_index);
    sfree_aligned(f);
}
bonded_threading_t::bonded_threading_t(const int numThreads,
                                       const int numEnergyGroups) :
    nthreads(numThreads),
    nblock_used(0),
    haveBondeds(false),
    workDivision(nthreads),
    foreignLambdaWorkDivision(1)
{
    f_t.resize(numThreads);
#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int t = 0; t < nthreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            f_t[t] = std::make_unique<f_thread_t>(numEnergyGroups);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
bonded_threading_t *init_bonded_threading(FILE      *fplog,
                                          const int  nenergrp)
{
    /* These thread-local data structures are used for bondeds only.
     *
     * Note that we also use these structures when running single-threaded.
     * This is because the bonded force buffer uses type rvec4, whereas
     * the normal force buffer uses type rvec. This leads to a little
     * reduction overhead, but the speed gain in the bonded calculations
     * of doing transposeScatterIncr/DecrU with alignment 4 instead of 3
     * is much larger than the reduction overhead.
     */
    bonded_threading_t *bt = new bonded_threading_t(gmx_omp_nthreads_get(emntBonded),
                                                    nenergrp);
    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
    const int max_nthread_uniform = 4;
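
    /* This default can be overridden at run time through the environment
     * variable read below, e.g. (hypothetical value):
     *   GMX_BONDED_NTHREAD_UNIFORM=8 gmx mdrun ...
     */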
    char *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
    {
        sscanf(ptr, "%d", &bt->max_nthread_uniform);
        if (fplog != nullptr)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->max_nthread_uniform);
        }
    }
    else
    {
        bt->max_nthread_uniform = max_nthread_uniform;
    }

    return bt;
}