Fix nbnxm OpenCL with cluster size 4
[gromacs.git] / src / gromacs / nbnxm / grid.cpp
blob 132ae556e1454a1ecfa50210cb02e7cfae4eaa95
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /*! \internal \file
38 * \brief
39 * Implements the Grid class.
41 * \author Berk Hess <hess@kth.se>
42 * \ingroup module_nbnxm
45 #include "gmxpre.h"
47 #include "grid.h"
49 #include <cmath>
50 #include <cstring>
52 #include <algorithm>
54 #include "gromacs/math/utilities.h"
55 #include "gromacs/math/vec.h"
56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
57 #include "gromacs/mdlib/updategroupscog.h"
58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
59 #include "gromacs/nbnxm/atomdata.h"
60 #include "gromacs/nbnxm/nbnxm_geometry.h"
61 #include "gromacs/simd/simd.h"
62 #include "gromacs/simd/vector_operations.h"
64 namespace Nbnxm
67 Grid::Geometry::Geometry(const PairlistType pairlistType) :
68 isSimple(pairlistType != PairlistType::HierarchicalNxN),
69 numAtomsICluster(IClusterSizePerListType[pairlistType]),
70 numAtomsJCluster(JClusterSizePerListType[pairlistType]),
71 numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell)*numAtomsICluster),
72 numAtomsICluster2Log(get_2log(numAtomsICluster))
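/* For example (assuming the default GPU layout): with numAtomsICluster = 8 and
 * c_gpuNumClusterPerCell = 8 (a 2x2x2 cluster grid), numAtomsPerCell = 64;
 * for a simple CPU grid numAtomsPerCell equals the i-cluster size.
 */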
76 Grid::Grid(const PairlistType pairlistType) :
77 geometry_(pairlistType)
81 /*! \brief Returns the atom density (> 0) of a rectangular grid */
82 static real gridAtomDensity(int numAtoms,
83 const rvec lowerCorner,
84 const rvec upperCorner)
86 rvec size;
88 if (numAtoms == 0)
90 /* To avoid zero density we use a minimum of 1 atom */
91 numAtoms = 1;
94 rvec_sub(upperCorner, lowerCorner, size);
96 return numAtoms/(size[XX]*size[YY]*size[ZZ]);
99 void Grid::setDimensions(const int ddZone,
100 const int numAtoms,
101 const rvec lowerCorner,
102 const rvec upperCorner,
103 real atomDensity,
104 const real maxAtomGroupRadius,
105 const bool haveFep)
107 /* For the home zone we compute the density when not set (=-1) or when =0 */
108 if (ddZone == 0 && atomDensity <= 0)
110 atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
113 dimensions_.atomDensity = atomDensity;
114 dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
116 rvec size;
117 rvec_sub(upperCorner, lowerCorner, size);
119 if (numAtoms > geometry_.numAtomsPerCell)
121 GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
123 /* target cell length */
124 real tlen_x;
125 real tlen_y;
126 if (geometry_.isSimple)
128 /* To minimize the zero interactions, we should make
129 * the largest of the i/j cells cubic.
131 int numAtomsInCell = std::max(geometry_.numAtomsICluster,
132 geometry_.numAtomsJCluster);
134 /* Approximately cubic cells */
135 real tlen = std::cbrt(numAtomsInCell/atomDensity);
136 tlen_x = tlen;
137 tlen_y = tlen;
139 else
141 /* Approximately cubic sub cells */
142 real tlen = std::cbrt(geometry_.numAtomsICluster/atomDensity);
143 tlen_x = tlen*c_gpuNumClusterPerCellX;
144 tlen_y = tlen*c_gpuNumClusterPerCellY;
146 /* We round ncx and ncy down, because we get fewer cell pairs
147 * in the pair list when the fixed cell dimensions (x,y) are
148 * larger than the variable one (z) than the other way around.
150 dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
151 dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
153 else
155 dimensions_.numCells[XX] = 1;
156 dimensions_.numCells[YY] = 1;
159 for (int d = 0; d < DIM - 1; d++)
161 dimensions_.cellSize[d] = size[d]/dimensions_.numCells[d];
162 dimensions_.invCellSize[d] = 1/dimensions_.cellSize[d];
165 if (ddZone > 0)
167 /* This is a non-home zone, add an extra row of cells
168 * for particles communicated for bonded interactions.
169 * These can be beyond the cut-off. It doesn't matter where
170 * they end up on the grid, but for performance it's better
171 * if they don't end up in cells that can be within cut-off range.
173 dimensions_.numCells[XX]++;
174 dimensions_.numCells[YY]++;
177 /* We need one additional cell entry for particles moved by DD */
178 cxy_na_.resize(numColumns() + 1);
179 cxy_ind_.resize(numColumns() + 2);
181 /* Worst case scenario of 1 atom in each last cell */
182 int maxNumCells;
183 if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
185 maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns();
187 else
189 maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns()*geometry_.numAtomsJCluster/geometry_.numAtomsICluster;
192 if (!geometry_.isSimple)
194 numClusters_.resize(maxNumCells);
196 bbcz_.resize(maxNumCells);
198 /* This resize also zeros the contents, which avoids possible
199 * floating-point exceptions in SIMD with the unused bb elements.
201 if (geometry_.isSimple)
203 bb_.resize(maxNumCells);
205 else
207 #if NBNXN_BBXXXX
208 pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell));
209 #else
210 bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
211 #endif
214 if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
216 bbj_ = bb_;
218 else
220 GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
222 bbjStorage_.resize(maxNumCells*geometry_.numAtomsICluster/geometry_.numAtomsJCluster);
223 bbj_ = bbjStorage_;
226 flags_.resize(maxNumCells);
227 if (haveFep)
229 fep_.resize(maxNumCells*geometry_.numAtomsPerCell/geometry_.numAtomsICluster);
232 copy_rvec(lowerCorner, dimensions_.lowerCorner);
233 copy_rvec(upperCorner, dimensions_.upperCorner);
234 copy_rvec(size, dimensions_.gridSize);
237 /* We need to sort particles in grid columns on z-coordinate.
238 * As particles are very often distributed homogeneously, we use a sorting
239 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
240 * by a factor, cast to an int and try to store in that hole. If the hole
241 * is full, we move this or another particle. A second pass is needed to make
242 * contiguous elements. c_sortGridRatio is the ratio of holes to particles.
243 * 4 is the optimal value for a homogeneous particle distribution and allows
244 * for an O(#particles) sort up to distributions where all particles are
245 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
246 * as it can be expensive to detect inhomogeneous particle distributions.
248 /*! \brief Ratio of grid cells to atoms */
249 static constexpr int c_sortGridRatio = 4;
250 /*! \brief Maximum ratio of holes used; in the worst case all particles
251 * end up in the last hole and we need #atoms extra holes at the end.
253 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
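/* A minimal sketch of the pigeonhole step, assuming n_per_h = 16 atoms in a range:
 * with c_sortGridRatio = 4 there are 16*4 = 64 holes, an atom at fraction f of the
 * range is first tried in hole zi = (int)(f*64), and c_sortGridMaxSizeFactor ensures
 * the work array has 64 + 16 entries to absorb the worst-case overflow at the end.
 */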
255 /*! \brief Sorts particle index a on coordinates x along dim.
257 * Backwards tells whether we want decreasing instead of increasing coordinates.
258 * h0 is the minimum of the coordinate range.
259 * invh is the inverse length of the sorting range.
260 * n_per_h (>=n) is the expected average number of particles per 1/invh
261 * sort is the sorting work array.
262 * sort should have a size of at least n_per_h*c_sortGridRatio + n,
263 * or, more simply, allocate at least n*c_sortGridMaxSizeFactor elements.
265 static void sort_atoms(int dim, gmx_bool Backwards,
266 int gmx_unused dd_zone,
267 bool gmx_unused relevantAtomsAreWithinGridBounds,
268 int *a, int n,
269 gmx::ArrayRef<const gmx::RVec> x,
270 real h0, real invh, int n_per_h,
271 gmx::ArrayRef<int> sort)
273 if (n <= 1)
275 /* Nothing to do */
276 return;
279 GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
281 /* Transform the inverse range height into the inverse hole height */
282 invh *= n_per_h*c_sortGridRatio;
284 /* Set nsort to the maximum possible number of holes used.
285 * In the worst case all n elements end up in the last hole.
287 int nsort = n_per_h*c_sortGridRatio + n;
289 /* Determine the index range used, so we can limit it for the second pass */
290 int zi_min = INT_MAX;
291 int zi_max = -1;
293 /* Sort the particles using a simple index sort */
294 for (int i = 0; i < n; i++)
296 /* The cast takes care of floating-point rounding effects below zero.
297 * This code assumes particles are less than 1/c_sortGridRatio
298 * times the box height out of the box.
300 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
302 #ifndef NDEBUG
303 /* As we can have rounding effects, we use > instead of >= here */
304 if (relevantAtomsAreWithinGridBounds &&
305 (zi < 0 || (dd_zone == 0 && zi > n_per_h*c_sortGridRatio)))
307 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
308 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
309 n_per_h, c_sortGridRatio);
311 #endif
312 if (zi < 0)
314 zi = 0;
317 /* In a non-local domain, particles communicated for bonded interactions
318 * can be far beyond the grid size, which is set by the non-bonded
319 * cut-off distance. We sort such particles into the last cell.
321 if (zi > n_per_h*c_sortGridRatio)
323 zi = n_per_h*c_sortGridRatio;
326 /* Ideally this particle should go in sort cell zi,
327 * but that might already be in use,
328 * in that case find the first empty cell higher up
330 if (sort[zi] < 0)
332 sort[zi] = a[i];
333 zi_min = std::min(zi_min, zi);
334 zi_max = std::max(zi_max, zi);
336 else
338 /* We have multiple atoms in the same sorting slot.
339 * Sort on real z for minimal bounding box size.
340 * There is an extra check for identical z to give a
341 * well-defined output order, independent of input order,
342 * which ensures binary reproducibility after restarts.
344 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
345 (x[a[i]][dim] == x[sort[zi]][dim] &&
346 a[i] > sort[zi])))
348 zi++;
351 if (sort[zi] >= 0)
353 /* Shift all elements by one slot until we find an empty slot */
354 int cp = sort[zi];
355 int zim = zi + 1;
356 while (sort[zim] >= 0)
358 int tmp = sort[zim];
359 sort[zim] = cp;
360 cp = tmp;
361 zim++;
363 sort[zim] = cp;
364 zi_max = std::max(zi_max, zim);
366 sort[zi] = a[i];
367 zi_max = std::max(zi_max, zi);
371 int c = 0;
372 if (!Backwards)
374 for (int zi = 0; zi < nsort; zi++)
376 if (sort[zi] >= 0)
378 a[c++] = sort[zi];
379 sort[zi] = -1;
383 else
385 for (int zi = zi_max; zi >= zi_min; zi--)
387 if (sort[zi] >= 0)
389 a[c++] = sort[zi];
390 sort[zi] = -1;
394 if (c < n)
396 gmx_incons("Lost particles while sorting");
400 #if GMX_DOUBLE
401 //! Returns double up to one least significant float bit smaller than x
402 static double R2F_D(const float x)
404 return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
406 //! Returns double up to one least significant float bit larger than x
407 static double R2F_U(const float x)
409 return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS)*x : (1 - GMX_FLOAT_EPS)*x);
411 #else
412 //! Returns x
413 static float R2F_D(const float x)
415 return x;
417 //! Returns x
418 static float R2F_U(const float x)
420 return x;
422 #endif
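/* The bounding boxes are stored in single precision. In double-precision builds
 * R2F_D/R2F_U round the lower/upper corner down/up by one float ULP, so the stored
 * box is guaranteed to still enclose the double-precision coordinates.
 */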
424 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
425 static void calc_bounding_box(int na, int stride, const real *x,
426 BoundingBox *bb)
428 int i;
429 real xl, xh, yl, yh, zl, zh;
431 i = 0;
432 xl = x[i+XX];
433 xh = x[i+XX];
434 yl = x[i+YY];
435 yh = x[i+YY];
436 zl = x[i+ZZ];
437 zh = x[i+ZZ];
438 i += stride;
439 for (int j = 1; j < na; j++)
441 xl = std::min(xl, x[i+XX]);
442 xh = std::max(xh, x[i+XX]);
443 yl = std::min(yl, x[i+YY]);
444 yh = std::max(yh, x[i+YY]);
445 zl = std::min(zl, x[i+ZZ]);
446 zh = std::max(zh, x[i+ZZ]);
447 i += stride;
449 /* Note: possible double to float conversion here */
450 bb->lower.x = R2F_D(xl);
451 bb->lower.y = R2F_D(yl);
452 bb->lower.z = R2F_D(zl);
453 bb->upper.x = R2F_U(xh);
454 bb->upper.y = R2F_U(yh);
455 bb->upper.z = R2F_U(zh);
458 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
459 static void calc_bounding_box_x_x4(int na, const real *x,
460 BoundingBox *bb)
462 real xl, xh, yl, yh, zl, zh;
464 xl = x[XX*c_packX4];
465 xh = x[XX*c_packX4];
466 yl = x[YY*c_packX4];
467 yh = x[YY*c_packX4];
468 zl = x[ZZ*c_packX4];
469 zh = x[ZZ*c_packX4];
470 for (int j = 1; j < na; j++)
472 xl = std::min(xl, x[j+XX*c_packX4]);
473 xh = std::max(xh, x[j+XX*c_packX4]);
474 yl = std::min(yl, x[j+YY*c_packX4]);
475 yh = std::max(yh, x[j+YY*c_packX4]);
476 zl = std::min(zl, x[j+ZZ*c_packX4]);
477 zh = std::max(zh, x[j+ZZ*c_packX4]);
479 /* Note: possible double to float conversion here */
480 bb->lower.x = R2F_D(xl);
481 bb->lower.y = R2F_D(yl);
482 bb->lower.z = R2F_D(zl);
483 bb->upper.x = R2F_U(xh);
484 bb->upper.y = R2F_U(yh);
485 bb->upper.z = R2F_U(zh);
488 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
489 static void calc_bounding_box_x_x8(int na, const real *x,
490 BoundingBox *bb)
492 real xl, xh, yl, yh, zl, zh;
494 xl = x[XX*c_packX8];
495 xh = x[XX*c_packX8];
496 yl = x[YY*c_packX8];
497 yh = x[YY*c_packX8];
498 zl = x[ZZ*c_packX8];
499 zh = x[ZZ*c_packX8];
500 for (int j = 1; j < na; j++)
502 xl = std::min(xl, x[j+XX*c_packX8]);
503 xh = std::max(xh, x[j+XX*c_packX8]);
504 yl = std::min(yl, x[j+YY*c_packX8]);
505 yh = std::max(yh, x[j+YY*c_packX8]);
506 zl = std::min(zl, x[j+ZZ*c_packX8]);
507 zh = std::max(zh, x[j+ZZ*c_packX8]);
509 /* Note: possible double to float conversion here */
510 bb->lower.x = R2F_D(xl);
511 bb->lower.y = R2F_D(yl);
512 bb->lower.z = R2F_D(zl);
513 bb->upper.x = R2F_U(xh);
514 bb->upper.y = R2F_U(yh);
515 bb->upper.z = R2F_U(zh);
518 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
519 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
520 BoundingBox *bb,
521 BoundingBox *bbj)
523 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
524 using namespace gmx;
526 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
528 if (na > 2)
530 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
532 else
534 /* Set the "empty" bounding box to the same as the first one,
535 * so we don't need to treat special cases in the rest of the code.
537 #if NBNXN_SEARCH_BB_SIMD4
538 store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
539 store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
540 #else
541 bbj[1] = bbj[0];
542 #endif
545 #if NBNXN_SEARCH_BB_SIMD4
546 store4(bb->lower.ptr(),
547 min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
548 store4(bb->upper.ptr(),
549 max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
550 #else
552 bb->lower = BoundingBox::Corner::min(bbj[0].lower,
553 bbj[1].lower);
554 bb->upper = BoundingBox::Corner::max(bbj[0].upper,
555 bbj[1].upper);
557 #endif
560 #if NBNXN_BBXXXX
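/* Sketch of the packed layout (assuming a pack size of 4): four consecutive clusters
 * share a pack storing their lower x values contiguously, then lower y, lower z,
 * followed by the upper corners, so each component can be read with one SIMD4 load.
 */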
562 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
563 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
565 int i;
566 real xl, xh, yl, yh, zl, zh;
568 i = 0;
569 xl = x[i+XX];
570 xh = x[i+XX];
571 yl = x[i+YY];
572 yh = x[i+YY];
573 zl = x[i+ZZ];
574 zh = x[i+ZZ];
575 i += stride;
576 for (int j = 1; j < na; j++)
578 xl = std::min(xl, x[i+XX]);
579 xh = std::max(xh, x[i+XX]);
580 yl = std::min(yl, x[i+YY]);
581 yh = std::max(yh, x[i+YY]);
582 zl = std::min(zl, x[i+ZZ]);
583 zh = std::max(zh, x[i+ZZ]);
584 i += stride;
586 /* Note: possible double to float conversion here */
587 bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl);
588 bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl);
589 bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl);
590 bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh);
591 bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh);
592 bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh);
595 #endif /* NBNXN_BBXXXX */
597 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
599 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
600 static void calc_bounding_box_simd4(int na, const float *x,
601 BoundingBox *bb)
603 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
604 using namespace gmx;
606 static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH*sizeof(float),
607 "The Corner struct should hold exactly 4 floats");
609 Simd4Float bb_0_S = load4(x);
610 Simd4Float bb_1_S = bb_0_S;
612 for (int i = 1; i < na; i++)
614 Simd4Float x_S = load4(x + i*GMX_SIMD4_WIDTH);
615 bb_0_S = min(bb_0_S, x_S);
616 bb_1_S = max(bb_1_S, x_S);
619 store4(bb->lower.ptr(), bb_0_S);
620 store4(bb->upper.ptr(), bb_1_S);
623 #if NBNXN_BBXXXX
625 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
626 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
627 BoundingBox *bb_work_aligned,
628 real *bb)
630 calc_bounding_box_simd4(na, x, bb_work_aligned);
632 bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
633 bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
634 bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
635 bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
636 bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
637 bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
640 #endif /* NBNXN_BBXXXX */
642 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
645 /*! \brief Combines pairs of consecutive bounding boxes */
646 static void combine_bounding_box_pairs(const Grid &grid,
647 gmx::ArrayRef<const BoundingBox> bb,
648 gmx::ArrayRef<BoundingBox> bbj)
650 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
651 using namespace gmx;
653 for (int i = 0; i < grid.numColumns(); i++)
655 /* Starting bb in a column is expected to be 2-aligned */
656 const int sc2 = grid.firstCellInColumn(i) >> 1;
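/* (numAtomsInColumn + 3) >> 2 counts the i-cluster bounding boxes (assuming the
 * simple-grid i-cluster size of 4); the extra shift by 1 pairs them, rounding down.
 */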
657 /* For odd numbers skip the last bb here */
658 const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
659 int c2;
660 for (c2 = sc2; c2 < sc2 + nc2; c2++)
662 #if NBNXN_SEARCH_BB_SIMD4
663 Simd4Float min_S, max_S;
665 min_S = min(load4(bb[c2*2+0].lower.ptr()),
666 load4(bb[c2*2+1].lower.ptr()));
667 max_S = max(load4(bb[c2*2+0].upper.ptr()),
668 load4(bb[c2*2+1].upper.ptr()));
669 store4(bbj[c2].lower.ptr(), min_S);
670 store4(bbj[c2].upper.ptr(), max_S);
671 #else
672 bbj[c2].lower = BoundingBox::Corner::min(bb[c2*2 + 0].lower,
673 bb[c2*2 + 1].lower);
674 bbj[c2].upper = BoundingBox::Corner::max(bb[c2*2 + 0].upper,
675 bb[c2*2 + 1].upper);
676 #endif
678 if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
680 /* The bb count in this column is odd: duplicate the last bb */
681 bbj[c2].lower = bb[c2*2].lower;
682 bbj[c2].upper = bb[c2*2].upper;
688 /*! \brief Prints the average bb size, used for debug output */
689 static void print_bbsizes_simple(FILE *fp,
690 const Grid &grid)
692 dvec ba = { 0 };
693 for (int c = 0; c < grid.numCells(); c++)
695 const BoundingBox &bb = grid.iBoundingBoxes()[c];
696 ba[XX] += bb.upper.x - bb.lower.x;
697 ba[YY] += bb.upper.y - bb.lower.y;
698 ba[ZZ] += bb.upper.z - bb.lower.z;
700 dsvmul(1.0/grid.numCells(), ba, ba);
702 const Grid::Dimensions &dims = grid.dimensions();
703 real avgCellSizeZ = (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]) : 0.0);
705 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
706 dims.cellSize[XX],
707 dims.cellSize[YY],
708 avgCellSizeZ,
709 ba[XX], ba[YY], ba[ZZ],
710 ba[XX]*dims.invCellSize[XX],
711 ba[YY]*dims.invCellSize[YY],
712 dims.atomDensity > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
715 /*! \brief Prints the average bb size, used for debug output */
716 static void print_bbsizes_supersub(FILE *fp,
717 const Grid &grid)
719 int ns;
720 dvec ba;
722 clear_dvec(ba);
723 ns = 0;
724 for (int c = 0; c < grid.numCells(); c++)
726 #if NBNXN_BBXXXX
727 for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
729 int cs_w = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize;
730 auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
731 for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
733 for (int d = 0; d < DIM; d++)
735 ba[d] +=
736 boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] -
737 boundingBoxes[(0 + d)*c_packedBoundingBoxesDimSize + i];
741 #else
742 for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
744 const BoundingBox &bb = grid.iBoundingBoxes()[c*c_gpuNumClusterPerCell + s];
745 ba[XX] += bb.upper.x - bb.lower.x;
746 ba[YY] += bb.upper.y - bb.lower.y;
747 ba[ZZ] += bb.upper.z - bb.lower.z;
749 #endif
750 ns += grid.numClustersPerCell()[c];
752 dsvmul(1.0/ns, ba, ba);
754 const Grid::Dimensions &dims = grid.dimensions();
755 const real avgClusterSizeZ =
756 (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
758 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
759 dims.cellSize[XX]/c_gpuNumClusterPerCellX,
760 dims.cellSize[YY]/c_gpuNumClusterPerCellY,
761 avgClusterSizeZ,
762 ba[XX], ba[YY], ba[ZZ],
763 ba[XX]*c_gpuNumClusterPerCellX*dims.invCellSize[XX],
764 ba[YY]*c_gpuNumClusterPerCellY*dims.invCellSize[YY],
765 dims.atomDensity > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
768 /*! \brief Set non-bonded interaction flags for the current cluster.
770 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
772 static void sort_cluster_on_flag(int numAtomsInCluster,
773 int atomStart,
774 int atomEnd,
775 const int *atinfo,
776 gmx::ArrayRef<int> order,
777 int *flags)
779 constexpr int c_maxNumAtomsInCluster = 8;
780 int sort1[c_maxNumAtomsInCluster];
781 int sort2[c_maxNumAtomsInCluster];
783 GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
785 *flags = 0;
787 int subc = 0;
788 for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
790 /* Make lists for this (sub-)cell of atoms with and without LJ */
791 int n1 = 0;
792 int n2 = 0;
793 gmx_bool haveQ = FALSE;
794 int a_lj_max = -1;
795 for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
797 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
799 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
801 sort1[n1++] = order[a];
802 a_lj_max = a;
804 else
806 sort2[n2++] = order[a];
810 /* If we don't have atoms with LJ, there's nothing to sort */
811 if (n1 > 0)
813 *flags |= NBNXN_CI_DO_LJ(subc);
815 if (2*n1 <= numAtomsInCluster)
817 /* Only sort when strictly necessary. Ordering particles
818 * can lead to less accurate summation
819 * due to rounding, both for LJ and Coulomb interactions.
821 if (2*(a_lj_max - s) >= numAtomsInCluster)
823 for (int i = 0; i < n1; i++)
825 order[atomStart + i] = sort1[i];
827 for (int j = 0; j < n2; j++)
829 order[atomStart + n1 + j] = sort2[j];
833 *flags |= NBNXN_CI_HALF_LJ(subc);
836 if (haveQ)
838 *flags |= NBNXN_CI_DO_COUL(subc);
840 subc++;
844 /*! \brief Fill a pair search cell with atoms.
846 * Potentially sorts atoms and sets the interaction flags.
848 void Grid::fillCell(const GridSetData &gridSetData,
849 nbnxn_atomdata_t *nbat,
850 int atomStart,
851 int atomEnd,
852 const int *atinfo,
853 gmx::ArrayRef<const gmx::RVec> x,
854 BoundingBox gmx_unused *bb_work_aligned)
856 const int numAtoms = atomEnd - atomStart;
858 const gmx::ArrayRef<int> &cells = gridSetData.cells;
859 const gmx::ArrayRef<int> &atomIndices = gridSetData.atomIndices;
861 if (geometry_.isSimple)
863 /* Note that non-local grids are already sorted.
864 * Then sort_cluster_on_flag will only set the flags and the sorting
865 * will not affect the atom order.
867 sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
868 flags_.data() + atomToCluster(atomStart) - cellOffset_);
871 if (gridSetData.haveFep)
873 /* Set the fep flag for perturbed atoms in this (sub-)cell */
875 /* The grid-local cluster/(sub-)cell index */
876 int cell = atomToCluster(atomStart) - cellOffset_*(geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
877 fep_[cell] = 0;
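/* Each bit of fep_[cell] marks whether the corresponding atom in this cluster is perturbed */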
878 for (int at = atomStart; at < atomEnd; at++)
880 if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
882 fep_[cell] |= (1 << (at - atomStart));
887 /* Now we have sorted the atoms, set the cell indices */
888 for (int at = atomStart; at < atomEnd; at++)
890 cells[atomIndices[at]] = at;
893 copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
894 as_rvec_array(x.data()),
895 nbat->XFormat, nbat->x().data(), atomStart);
897 if (nbat->XFormat == nbatX4)
899 /* Store the bounding boxes as xyz.xyz. */
900 size_t offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
901 BoundingBox *bb_ptr = bb_.data() + offset;
903 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
904 if (2*geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
906 calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
907 bbj_.data() + offset*2);
909 else
910 #endif
912 calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
915 else if (nbat->XFormat == nbatX8)
917 /* Store the bounding boxes as xyz.xyz. */
918 size_t offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
919 BoundingBox *bb_ptr = bb_.data() + offset;
921 calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
923 #if NBNXN_BBXXXX
924 else if (!geometry_.isSimple)
926 /* Store the bounding boxes in a format convenient
927 * for SIMD4 calculations: xxxxyyyyzzzz...
929 const int clusterIndex = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log);
930 float *pbb_ptr =
931 pbb_.data() +
932 packedBoundingBoxesIndex(clusterIndex) +
933 (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
935 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
936 if (nbat->XFormat == nbatXYZQ)
938 calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
939 bb_work_aligned, pbb_ptr);
941 else
942 #endif
944 calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
945 pbb_ptr);
947 if (gmx_debug_at)
949 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
950 atomToCluster(atomStart),
951 pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize],
952 pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize],
953 pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]);
956 #endif
957 else
959 /* Store the bounding boxes as xyz.xyz. */
960 BoundingBox *bb_ptr = bb_.data() + atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
962 calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
963 bb_ptr);
965 if (gmx_debug_at)
967 int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
968 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
969 atomToCluster(atomStart),
970 bb_[bbo].lower.x,
971 bb_[bbo].lower.y,
972 bb_[bbo].lower.z,
973 bb_[bbo].upper.x,
974 bb_[bbo].upper.y,
975 bb_[bbo].upper.z);
980 void Grid::sortColumnsCpuGeometry(const GridSetData &gridSetData,
981 int dd_zone,
982 int atomStart, int atomEnd,
983 const int *atinfo,
984 gmx::ArrayRef<const gmx::RVec> x,
985 nbnxn_atomdata_t *nbat,
986 int cxy_start, int cxy_end,
987 gmx::ArrayRef<int> sort_work)
989 if (debug)
991 fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
992 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
995 const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
997 const int numAtomsPerCell = geometry_.numAtomsPerCell;
999 /* Sort the atoms within each x,y column in 3 dimensions */
1000 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1002 const int numAtoms = numAtomsInColumn(cxy);
1003 const int numCellsZ = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1004 const int atomOffset = firstAtomInColumn(cxy);
1006 /* Sort the atoms within each x,y column on z coordinate */
1007 sort_atoms(ZZ, FALSE, dd_zone,
1008 relevantAtomsAreWithinGridBounds,
1009 gridSetData.atomIndices.data() + atomOffset, numAtoms, x,
1010 dimensions_.lowerCorner[ZZ],
1011 1.0/dimensions_.gridSize[ZZ], numCellsZ*numAtomsPerCell,
1012 sort_work);
1014 /* Fill the ncz cells in this column */
1015 const int firstCell = firstCellInColumn(cxy);
1016 int cellFilled = firstCell;
1017 for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1019 const int cell = firstCell + cellZ;
1021 const int atomOffsetCell = atomOffset + cellZ*numAtomsPerCell;
1022 const int numAtomsCell = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1024 fillCell(gridSetData, nbat,
1025 atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x,
1026 nullptr);
1028 /* This copy to bbcz is not really necessary.
1029 * But it allows us to use the same grid search code
1030 * for the simple and supersub cell setups.
1032 if (numAtomsCell > 0)
1034 cellFilled = cell;
1036 bbcz_[cell].lower = bb_[cellFilled].lower.z;
1037 bbcz_[cell].upper = bb_[cellFilled].upper.z;
1040 /* Set the unused atom indices to -1 */
1041 for (int ind = numAtoms; ind < numCellsZ*numAtomsPerCell; ind++)
1043 gridSetData.atomIndices[atomOffset + ind] = -1;
1048 /* Spatially sort the atoms within one grid column */
1049 void Grid::sortColumnsGpuGeometry(const GridSetData &gridSetData,
1050 int dd_zone,
1051 int atomStart, int atomEnd,
1052 const int *atinfo,
1053 gmx::ArrayRef<const gmx::RVec> x,
1054 nbnxn_atomdata_t *nbat,
1055 int cxy_start, int cxy_end,
1056 gmx::ArrayRef<int> sort_work)
1058 BoundingBox bb_work_array[2];
1059 BoundingBox *bb_work_aligned = reinterpret_cast<BoundingBox *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1061 if (debug)
1063 fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1064 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1067 const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1069 const int numAtomsPerCell = geometry_.numAtomsPerCell;
1071 const int subdiv_x = geometry_.numAtomsICluster;
1072 const int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1073 const int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
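/* E.g. for a 2x2x2 cluster grid with an i-cluster size of 8 (an assumption; the
 * actual sizes depend on the pairlist setup): subdiv_x = 8, subdiv_y = 16, subdiv_z = 32.
 */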
1075 /* Extract the atom index array that will be filled here */
1076 const gmx::ArrayRef<int> &atomIndices = gridSetData.atomIndices;
1078 /* Sort the atoms within each x,y column in 3 dimensions.
1079 * Loop over all columns on the x/y grid.
1081 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1083 const int gridX = cxy/dimensions_.numCells[YY];
1084 const int gridY = cxy - gridX*dimensions_.numCells[YY];
1086 const int numAtomsInColumn = cxy_na_[cxy];
1087 const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1088 const int atomOffset = firstAtomInColumn(cxy);
1090 /* Sort the atoms within each x,y column on z coordinate */
1091 sort_atoms(ZZ, FALSE, dd_zone,
1092 relevantAtomsAreWithinGridBounds,
1093 atomIndices.data() + atomOffset, numAtomsInColumn, x,
1094 dimensions_.lowerCorner[ZZ],
1095 1.0/dimensions_.gridSize[ZZ], numCellsInColumn*numAtomsPerCell,
1096 sort_work);
1098 /* This loop goes over the cells and clusters along z at once */
1099 for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1101 const int atomOffsetZ = atomOffset + sub_z*subdiv_z;
1102 const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1103 int cz = -1;
1104 /* We have already sorted on z */
1106 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1108 cz = sub_z/c_gpuNumClusterPerCellZ;
1109 const int cell = cxy_ind_[cxy] + cz;
1111 /* The number of atoms in this cell/super-cluster */
1112 const int numAtoms = std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1114 numClusters_[cell] = std::min(c_gpuNumClusterPerCell,
1115 (numAtoms + geometry_.numAtomsICluster - 1)/ geometry_.numAtomsICluster);
1117 /* Store the z-boundaries of the bounding box of the cell */
1118 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1119 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1122 if (c_gpuNumClusterPerCellY > 1)
1124 /* Sort the atoms along y */
1125 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1126 relevantAtomsAreWithinGridBounds,
1127 atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1128 dimensions_.lowerCorner[YY] + gridY*dimensions_.cellSize[YY],
1129 dimensions_.invCellSize[YY], subdiv_z,
1130 sort_work);
1133 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1135 const int atomOffsetY = atomOffsetZ + sub_y*subdiv_y;
1136 const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1138 if (c_gpuNumClusterPerCellX > 1)
1140 /* Sort the atoms along x */
1141 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1142 relevantAtomsAreWithinGridBounds,
1143 atomIndices.data() + atomOffsetY, numAtomsY, x,
1144 dimensions_.lowerCorner[XX] + gridX*dimensions_.cellSize[XX],
1145 dimensions_.invCellSize[XX], subdiv_y,
1146 sort_work);
1149 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1151 const int atomOffsetX = atomOffsetY + sub_x*subdiv_x;
1152 const int numAtomsX = std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1154 fillCell(gridSetData, nbat,
1155 atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1156 bb_work_aligned);
1161 /* Set the unused atom indices to -1 */
1162 for (int ind = numAtomsInColumn; ind < numCellsInColumn*numAtomsPerCell; ind++)
1164 atomIndices[atomOffset + ind] = -1;
1169 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1170 static void setCellAndAtomCount(gmx::ArrayRef<int> cell,
1171 int cellIndex,
1172 gmx::ArrayRef<int> cxy_na,
1173 int atomIndex)
1175 cell[atomIndex] = cellIndex;
1176 cxy_na[cellIndex] += 1;
1179 void Grid::calcColumnIndices(const Grid::Dimensions &gridDims,
1180 const gmx::UpdateGroupsCog *updateGroupsCog,
1181 const int atomStart,
1182 const int atomEnd,
1183 gmx::ArrayRef<const gmx::RVec> x,
1184 const int dd_zone,
1185 const int *move,
1186 const int thread,
1187 const int nthread,
1188 gmx::ArrayRef<int> cell,
1189 gmx::ArrayRef<int> cxy_na)
1191 const int numColumns = gridDims.numCells[XX]*gridDims.numCells[YY];
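/* Columns are indexed as cx*numCells[YY] + cy; e.g. with a 3x2 cell grid,
 * (cx = 2, cy = 1) maps to column index 5.
 */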
1193 /* We add one extra cell for particles which moved during DD */
1194 for (int i = 0; i < numColumns; i++)
1196 cxy_na[i] = 0;
1199 int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1200 int taskAtomEnd = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1202 if (dd_zone == 0)
1204 /* Home zone */
1205 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1207 if (move == nullptr || move[i] >= 0)
1210 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1212 /* We need to be careful with rounding,
1213 * particles might be a few bits outside the local zone.
1214 * The int cast takes care of the lower bound,
1215 * we will explicitly take care of the upper bound.
1217 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1218 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1220 #ifndef NDEBUG
1221 if (cx < 0 || cx > gridDims.numCells[XX] ||
1222 cy < 0 || cy > gridDims.numCells[YY])
1224 gmx_fatal(FARGS,
1225 "grid cell cx %d cy %d out of range (max %d %d)\n"
1226 "atom %f %f %f, grid->c0 %f %f",
1227 cx, cy,
1228 gridDims.numCells[XX],
1229 gridDims.numCells[YY],
1230 x[i][XX], x[i][YY], x[i][ZZ],
1231 gridDims.lowerCorner[XX],
1232 gridDims.lowerCorner[YY]);
1234 #endif
1235 /* Take care of potential rounding issues */
1236 cx = std::min(cx, gridDims.numCells[XX] - 1);
1237 cy = std::min(cy, gridDims.numCells[YY] - 1);
1239 /* For the moment, cell will contain only the grid-local
1240 * x and y indices, not z.
1242 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1244 else
1246 /* Put this moved particle after the end of the grid,
1247 * so we can process it later without using conditionals.
1249 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1253 else
1255 /* Non-home zone */
1256 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1258 int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1259 int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1261 /* For non-home zones there could be particles outside
1262 * the non-bonded cut-off range, which have been communicated
1263 * for bonded interactions only. For the result it doesn't
1264 * matter where these end up on the grid. For performance
1265 * we put them in an extra row at the border.
1267 cx = std::max(cx, 0);
1268 cx = std::min(cx, gridDims.numCells[XX] - 1);
1269 cy = std::max(cy, 0);
1270 cy = std::min(cy, gridDims.numCells[YY] - 1);
1272 /* For the moment, cell will contain only the grid-local
1273 * x and y indices, not z.
1275 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1280 /*! \brief Resizes grid and atom data which depend on the number of cells */
1281 static void resizeForNumberOfCells(const int numNbnxnAtoms,
1282 const int numAtomsMoved,
1283 GridSetData *gridSetData,
1284 nbnxn_atomdata_t *nbat)
1286 /* Note: gridSetData->cells was already resized before */
1288 /* To avoid conditionals we store the moved particles at the end of the
1289 * atom index array, so make sure we have enough space.
1291 gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1293 /* Make space in nbat for storing the atom coordinates */
1294 nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1297 void Grid::setCellIndices(int ddZone,
1298 int cellOffset,
1299 GridSetData *gridSetData,
1300 gmx::ArrayRef<GridWork> gridWork,
1301 int atomStart,
1302 int atomEnd,
1303 const int *atinfo,
1304 gmx::ArrayRef<const gmx::RVec> x,
1305 const int numAtomsMoved,
1306 nbnxn_atomdata_t *nbat)
1308 cellOffset_ = cellOffset;
1310 const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1312 const int numAtomsPerCell = geometry_.numAtomsPerCell;
1314 /* Make the cell index as a function of x and y */
1315 int ncz_max = 0;
1316 int ncz = 0;
1317 cxy_ind_[0] = 0;
1318 for (int i = 0; i < numColumns() + 1; i++)
1320 /* We set ncz_max at the beginning of the loop instead of at the end,
1321 * so we skip i = numColumns(), which holds moved particles
1322 * that do not need to be ordered on the grid.
1324 if (ncz > ncz_max)
1326 ncz_max = ncz;
1328 int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1329 for (int thread = 1; thread < nthread; thread++)
1331 cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1333 ncz = (cxy_na_i + numAtomsPerCell - 1)/numAtomsPerCell;
1334 if (nbat->XFormat == nbatX8)
1336 /* Make the number of cells a multiple of 2 */
1337 ncz = (ncz + 1) & ~1;
1339 cxy_ind_[i+1] = cxy_ind_[i] + ncz;
1340 /* Clear cxy_na_, so we can reuse the array below */
1341 cxy_na_[i] = 0;
1343 numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
1345 /* Resize grid and atom data which depend on the number of cells */
1346 resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1348 if (debug)
1350 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1351 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_,
1352 dimensions_.numCells[XX], dimensions_.numCells[YY],
1353 numCellsTotal_/(static_cast<double>(numColumns())),
1354 ncz_max);
1355 if (gmx_debug_at)
1357 int i = 0;
1358 for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1360 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1362 fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1363 i++;
1365 fprintf(debug, "\n");
1370 /* Make sure the work array for sorting is large enough */
1371 const int worstCaseSortBufferSize = ncz_max*numAtomsPerCell*c_sortGridMaxSizeFactor;
1372 if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1374 for (GridWork &work : gridWork)
1376 /* Elements not in use should be -1 */
1377 work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1381 /* Now that we know the dimensions we can fill the grid.
1382 * This is the first, unsorted fill. We sort the columns after this.
1384 gmx::ArrayRef<int> cells = gridSetData->cells;
1385 gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1386 for (int i = atomStart; i < atomEnd; i++)
1388 /* At this point cells contains the local grid x,y indices */
1389 const int cxy = cells[i];
1390 atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1393 if (ddZone == 0)
1395 /* Set the cell indices for the moved particles */
1396 int n0 = numCellsTotal_*numAtomsPerCell;
1397 int n1 = numCellsTotal_*numAtomsPerCell + cxy_na_[numColumns()];
1398 if (ddZone == 0)
1400 for (int i = n0; i < n1; i++)
1402 cells[atomIndices[i]] = i;
1407 /* Sort the super-cell columns along z into the sub-cells. */
1408 #pragma omp parallel for num_threads(nthread) schedule(static)
1409 for (int thread = 0; thread < nthread; thread++)
1413 int columnStart = ((thread + 0)*numColumns())/nthread;
1414 int columnEnd = ((thread + 1)*numColumns())/nthread;
1415 if (geometry_.isSimple)
1417 sortColumnsCpuGeometry(*gridSetData, ddZone,
1418 atomStart, atomEnd, atinfo, x, nbat,
1419 columnStart, columnEnd,
1420 gridWork[thread].sortBuffer);
1422 else
1424 sortColumnsGpuGeometry(*gridSetData, ddZone,
1425 atomStart, atomEnd, atinfo, x, nbat,
1426 columnStart, columnEnd,
1427 gridWork[thread].sortBuffer);
1430 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1433 if (geometry_.isSimple && nbat->XFormat == nbatX8)
1435 combine_bounding_box_pairs(*this, bb_, bbj_);
1438 if (!geometry_.isSimple)
1440 numClustersTotal_ = 0;
1441 for (int i = 0; i < numCellsTotal_; i++)
1443 numClustersTotal_ += numClusters_[i];
1447 if (debug)
1449 if (geometry_.isSimple)
1451 print_bbsizes_simple(debug, *this);
1453 else
1455 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1456 numClustersTotal_,
1457 (atomEnd - atomStart)/static_cast<double>(numClustersTotal_));
1459 print_bbsizes_supersub(debug, *this);
1464 } // namespace Nbnxm