/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed-forces
 */
#include "gmxpre.h"

#include "manage-threading.h"

#include "config.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>

#include <algorithm>

#include "gromacs/listed-forces/listed-forces.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"

#include "listed-internal.h"
/*! \brief struct for passing all data required for a function type */
typedef struct {
    int      ftype; /**< the function type index */
    t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int      nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the ntype bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
static void divide_bondeds_by_locality(int                 ntype,
                                       const ilist_data_t *ild,
                                       int                 nthread,
                                       t_idef             *idef)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(ntype <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < ntype; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
        /* The start bound for thread 0 is 0 for all interactions */
        ind[f]    = 0;
        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }

    nat_sum = 0;
    /* Loop over the end bounds of the nthread threads to determine
     * which interactions threads 0 to nthread shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*ntype.
     * This code is running single threaded (difficult to parallelize
     * over threads). So the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
     * is cheap and this is done only at DD repartitioning, the cost should
     * be negligible. At high thread count many other parts of the code
     * scale the same way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= nthread; t++)
    {
        int nat_thread;

        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/nthread;

        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < ntype; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < ntype);

            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above during next iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }

        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < ntype; f++)
        {
            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
        }
    }

    for (f = 0; f < ntype; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}
//! Divides bonded interactions over threads
static void divide_bondeds_over_threads(t_idef *idef,
                                        int     nthread,
                                        int     max_nthread_uniform,
                                        bool   *haveBondeds)
{
    ilist_data_t ild[F_NRE];
    int          ntype;
    int          f;

    assert(nthread > 0);

    idef->nthreads = nthread;

    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
    {
        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
        srenew(idef->il_thread_division, idef->il_thread_division_nalloc);
    }
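
    /* Note on layout: il_thread_division stores, for each function type f,
     * nthread+1 boundaries into il[f].iatoms. Thread t handles the half-open
     * range [ il_thread_division[f*(nthread+1) + t],
     *         il_thread_division[f*(nthread+1) + t + 1] ).
     */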
    *haveBondeds = false;
    ntype        = 0;
    for (f = 0; f < F_NRE; f++)
    {
        if (!ftype_is_bonded_potential(f))
        {
            continue;
        }

        if (idef->il[f].nr > 0)
        {
            *haveBondeds = true;
        }

        if (idef->il[f].nr == 0)
        {
            /* No interactions, avoid all the integer math below */
            int t;
            for (t = 0; t <= nthread; t++)
            {
                idef->il_thread_division[f*(nthread + 1) + t] = 0;
            }
        }
        else if (nthread <= max_nthread_uniform || f == F_DISRES)
        {
            /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */
            int nat1, t;

            /* nat1 = 1 + #atoms(ftype) which is the stride used for iatoms */
            nat1 = 1 + NRAL(f);

            for (t = 0; t <= nthread; t++)
            {
                int nr_t;

                /* Divide equally over the threads */
                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;

                if (f == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
                     */
                    while (nr_t > 0 && nr_t < idef->il[f].nr &&
                           idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
                           idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
                    {
                        nr_t += nat1;
                    }
                }

                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
            }
        }
        else
        {
            /* Add this ftype to the list to be distributed */
            int nat;

            nat              = NRAL(f);
            ild[ntype].ftype = f;
            ild[ntype].il    = &idef->il[f];
            ild[ntype].nat   = nat;

            /* The first index for the thread division is always 0 */
            idef->il_thread_division[f*(nthread + 1)] = 0;

            ntype++;
        }
    }

    if (ntype > 0)
    {
        divide_bondeds_by_locality(ntype, ild, nthread, idef);
    }

    if (debug)
    {
        int f;

        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < nthread; t++)
                {
                    fprintf(debug, " %4d",
                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
                             idef->il_thread_division[f*(nthread + 1) + t])/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int            natoms,
                           f_thread_t    *f_thread,
                           const t_idef  *idef,
                           int            thread,
                           int            nthread)
{
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");

    if (nthread > BITMASK_SIZE)
    {
#pragma omp master
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
                  nthread, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
    }
    GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");
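
    /* The force array is grouped into blocks of reduction_block_size atoms
     * (1 << reduction_block_bits); nblock is the number of such blocks needed
     * to cover natoms, rounding up.
     */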
    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f,     f_thread->block_nalloc*reduction_block_size, 128);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;
                int nb0  = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1  = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }

    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
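
/* Divide the listed interactions over the threads and build the per-thread
 * force-reduction masks; this is redone whenever the thread division changes,
 * e.g. at domain-decomposition repartitioning.
 */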
void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
{
    bonded_threading_t *bt   = fr->bonded_threading;
    int                 ctot = 0;

    assert(bt->nthreads >= 1);

    /* Divide the bonded interactions over the threads */
    divide_bondeds_over_threads(idef,
                                bt->nthreads,
                                bt->bonded_max_nthread_uniform,
                                &bt->haveBondeds);

    if (!bt->haveBondeds)
    {
        /* We don't have bondeds, so there is nothing to reduce */
        return;
    }

    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
                                       idef, t, bt->nthreads);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
    if (nblock_tot > bt->block_nalloc)
    {
        bt->block_nalloc = over_alloc_large(nblock_tot);
        srenew(bt->block_index, bt->block_nalloc);
        srenew(bt->mask,        bt->block_nalloc);
    }
    bt->nblock_used = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 0; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t].mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
#if BITMASK_SIZE <= 64 //move into bitmask when it is C++
                std::string flags = gmx::formatString("%x", *mask);
#else
                std::string flags = gmx::formatAndJoin(*mask,
                                                       *mask+BITMASK_ALEN,
                                                       "", gmx::StringFormatter("%x"));
#endif
                fprintf(debug, "block %d flags %s count %d\n",
                        b, flags.c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/(double)fr->natoms_force,
                ctot/(double)bt->nblock_used);
    }
}
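
/* Free the thread-local bonded force/energy buffers, the bonded_threading_t
 * struct itself and the thread-division index stored in idef.
 */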
void tear_down_bonded_threading(bonded_threading_t *bt,
                                t_idef             *idef)
{
    for (int th = 0; th < bt->nthreads; th++)
    {
        sfree(bt->f_t[th].fshift);
        for (int i = 0; i < egNR; i++)
        {
            sfree(bt->f_t[th].grpp.ener[i]);
        }
    }
    sfree(bt->f_t);
    sfree(bt);
    sfree(idef->il_thread_division);
}
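
/* Allocate and initialize the per-thread data structures used for the
 * bonded force and energy reduction.
 */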
void init_bonded_threading(FILE *fplog, int nenergrp,
                           struct bonded_threading_t **bt_ptr)
{
    bonded_threading_t *bt;

    snew(bt, 1);

    /* These thread local data structures are used for bondeds only.
     *
     * Note that we also use these structures when running single-threaded.
     * This is because the bonded force buffer uses type rvec4, whereas
     * the normal force buffer uses type rvec. This leads to a little
     * reduction overhead, but the speed gain in the bonded calculations
     * of doing transposeScatterIncr/DecrU with alignment 4 instead of 3
     * is much larger than the reduction overhead.
     */
    bt->nthreads = gmx_omp_nthreads_get(emntBonded);

    snew(bt->f_t, bt->nthreads);
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            bt->f_t[t].f        = nullptr;
            bt->f_t[t].f_nalloc = 0;
            snew(bt->f_t[t].fshift, SHIFTS);
            bt->f_t[t].grpp.nener = nenergrp*nenergrp;
            for (int i = 0; i < egNR; i++)
            {
                snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    bt->nblock_used  = 0;
    bt->block_index  = nullptr;
    bt->mask         = nullptr;
    bt->block_nalloc = 0;

    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
    const int max_nthread_uniform = 4;
    char     *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
    {
        sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
        if (fplog != nullptr)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->bonded_max_nthread_uniform);
        }
    }
    else
    {
        bt->bonded_max_nthread_uniform = max_nthread_uniform;
    }

    *bt_ptr = bt;
}