/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_grid.h"

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"
struct gmx_domdec_zones_t;
static real grid_atom_density(int        numAtoms,
                              const rvec lowerCorner,
                              const rvec upperCorner)
{
    rvec size;

    if (numAtoms == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        numAtoms = 1;
    }

    rvec_sub(upperCorner, lowerCorner, size);

    return numAtoms/(size[XX]*size[YY]*size[ZZ]);
}
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t        *grid,
                            int                  dd_zone,
                            int                  numAtoms,
                            const rvec           lowerCorner,
                            const rvec           upperCorner,
                            real                 atomDensity)
{
    rvec size;
    real tlen, tlen_x, tlen_y;

    rvec_sub(upperCorner, lowerCorner, size);

    if (numAtoms > grid->na_sc)
    {
        GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");

        /* target cell length */
        if (grid->bSimple)
        {
            /* To minimize the zero interactions, we should make
             * the largest of the i/j cell cubic.
             */
            int numAtomsInCell = std::max(grid->na_c, grid->na_cj);

            /* Approximately cubic cells */
            tlen   = std::cbrt(numAtomsInCell/atomDensity);
            tlen_x = tlen;
            tlen_y = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            tlen   = std::cbrt(grid->na_c/atomDensity);
            tlen_x = tlen*c_gpuNumClusterPerCellX;
            tlen_y = tlen*c_gpuNumClusterPerCellY;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pair list when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
        grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
    }
    else
    {
        grid->ncx = 1;
        grid->ncy = 1;
    }

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    if (dd_zone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        grid->ncx++;
        grid->ncy++;
    }

    /* We need one additional cell entry for particles moved by DD */
    grid->cxy_na.resize(grid->ncx*grid->ncy + 1);
    grid->cxy_ind.resize(grid->ncx*grid->ncy + 2);

    for (int t = 0; t < nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy + 1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy + 1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
        }
    }

    /* Worst case scenario of 1 atom in each last cell */
    int maxNumCells;
    if (grid->na_cj <= grid->na_c)
    {
        maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }

    grid->nsubc.resize(maxNumCells);
    grid->bbcz.resize(maxNumCells*NNBSBB_D);

    /* This resize also zeros the contents, which avoids possible
     * floating point exceptions in SIMD with the unused bb elements.
     */
    if (grid->bSimple)
    {
        grid->bb.resize(maxNumCells);
    }
    else
    {
#if NBNXN_BBXXXX
        grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
#else
        grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
#endif
    }

    if (grid->bSimple)
    {
        if (grid->na_cj == grid->na_c)
        {
            grid->bbj = grid->bb;
        }
        else
        {
            grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
            grid->bbj = grid->bbjStorage;
        }
    }

    grid->flags.resize(maxNumCells);
    if (nbs->bFEP)
    {
        grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
    }

    copy_rvec(lowerCorner, grid->c0);
    copy_rvec(upperCorner, grid->c1);
    copy_rvec(size,        grid->size);

    return maxNumCells;
}
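/* Worked example for set_grid_size_xy() (illustrative numbers only):
 * assuming an atom density of roughly 100 atoms/nm^3 and a cluster size
 * na_c = 8, the target sub-cell length is tlen = cbrt(8/100) ~= 0.43 nm.
 * With c_gpuNumClusterPerCellX = c_gpuNumClusterPerCellY = 2, the target
 * x,y cell size is then about 0.86 nm; the actual sx and sy follow from
 * rounding the cell counts ncx and ncy down to whole cells.
 */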
/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
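/* Illustrative sketch (not used by the search code itself): the slot
 * computation of the pigeonhole-style sort described above, for a single
 * coordinate. The name pigeonholeSlot and its parameters are hypothetical;
 * the real implementation is sort_atoms() below, which additionally
 * resolves collisions and performs the compaction pass.
 */
gmx_unused static inline int pigeonholeSlot(real z, real h0, real invh, int numSlots)
{
    /* invh is assumed to be pre-scaled by n_per_h*SORT_GRID_OVERSIZE,
     * mirroring what sort_atoms() does before binning.
     */
    int slot = static_cast<int>((z - h0)*invh);
    if (slot < 0)
    {
        slot = 0;
    }
    if (slot > numSlots - 1)
    {
        slot = numSlots - 1;
    }
    return slot;
}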
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, const rvec *x,
                       real h0, real invh, int n_per_h,
                       int *sort)
{
    if (n > n_per_h)
    {
        gmx_incons("n > n_per_h");
    }

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;
    int zi_max = -1;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);
        }

        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h*SORT_GRID_OVERSIZE)
        {
            zi = n_per_h*SORT_GRID_OVERSIZE;
        }

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = std::min(zi_min, zi);
            zi_max   = std::max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
                                       a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                int cp  = sort[zi];
                int zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp   = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = std::max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = std::max(zi_max, zi);
        }
    }

    int c = 0;
    if (!Backwards)
    {
        for (int zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (int zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
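/* Worked example (illustrative): R2F_D and R2F_U round a bound outwards
 * when a (possibly double precision) real is converted to float, so the
 * float bounding box always contains the original coordinates. For a lower
 * bound x = 2.0, R2F_D(x) = (1 - GMX_FLOAT_EPS)*2.0, slightly below 2.0;
 * for an upper bound, R2F_U(x) = (1 + GMX_FLOAT_EPS)*2.0, slightly above.
 * For negative values the two factors are swapped.
 */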
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
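/* Layout note (summarizing the comments on the functions below): the plain
 * and packed-coordinate routines store one nbnxn_bb_t per cluster in xyz0
 * order (lower then upper corner, each padded to 4 floats), whereas the
 * xxxx routines pack STRIDE_PBB clusters together as xxxx yyyy zzzz for the
 * lower corners followed by xxxx yyyy zzzz for the upper corners, which is
 * convenient for SIMD4 loads of one component of several clusters at once.
 */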
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX4];
    xh = x[XX*c_packX4];
    yl = x[YY*c_packX4];
    yh = x[YY*c_packX4];
    zl = x[ZZ*c_packX4];
    zh = x[ZZ*c_packX4];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX4]);
        xh = std::max(xh, x[j+XX*c_packX4]);
        yl = std::min(yl, x[j+YY*c_packX4]);
        yh = std::max(yh, x[j+YY*c_packX4]);
        zl = std::min(zl, x[j+ZZ*c_packX4]);
        zh = std::max(zh, x[j+ZZ*c_packX4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX8];
    xh = x[XX*c_packX8];
    yl = x[YY*c_packX8];
    yh = x[YY*c_packX8];
    zl = x[ZZ*c_packX8];
    zh = x[ZZ*c_packX8];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX8]);
        xh = std::max(xh, x[j+XX*c_packX8]);
        yl = std::min(yl, x[j+YY*c_packX8]);
        yh = std::max(yh, x[j+YY*c_packX8]);
        zl = std::min(zl, x[j+ZZ*c_packX8]);
        zh = std::max(zh, x[j+ZZ*c_packX8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                                     nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#if NBNXN_SEARCH_BB_SIMD4
        store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
        store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
#else
        bbj[1] = bbj[0];
#endif
    }

#if NBNXN_SEARCH_BB_SIMD4
    store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
    store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
#else
    for (int i = 0; i < NNBSBB_C; i++)
    {
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
    }
#endif
}
#if NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float bb_0_S, bb_1_S;
    Simd4Float x_S;

    bb_0_S = load4(x);
    bb_1_S = bb_0_S;

    for (int i = 1; i < na; i++)
    {
        x_S    = load4(x + i*NNBSBB_C);
        bb_0_S = min(bb_0_S, x_S);
        bb_1_S = max(bb_1_S, x_S);
    }

    store4(&bb->lower[0], bb_0_S);
    store4(&bb->upper[0], bb_1_S);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
                                         float *bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
                                       gmx::ArrayRef<const nbnxn_bb_t>  bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    for (int i = 0; i < grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);
        int c2;
        for (c2 = sc2; c2 < sc2+nc2; c2++)
        {
#if NBNXN_SEARCH_BB_SIMD4
            Simd4Float min_S, max_S;

            min_S = min(load4(&bb[c2*2+0].lower[0]),
                        load4(&bb[c2*2+1].lower[0]));
            max_S = max(load4(&bb[c2*2+0].upper[0]),
                        load4(&bb[c2*2+1].upper[0]));
            store4(&grid->bbj[c2].lower[0], min_S);
            store4(&grid->bbj[c2].upper[0], max_S);
#else
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);
            }
#endif
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
            }
        }
    }
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE               *fp,
                                 const nbnxn_grid_t *grid)
{
    dvec ba;

    clear_dvec(ba);
    for (int c = 0; c < grid->nc; c++)
    {
        for (int d = 0; d < DIM; d++)
        {
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
        }
    }
    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx,
            grid->sy,
            grid->atom_density > 0 ?
            grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]/grid->sx,
            ba[YY]/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE               *fp,
                                   const nbnxn_grid_t *grid)
{
    int  ns;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (int c = 0; c < grid->nc; c++)
    {
#if NBNXN_BBXXXX
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
        {
            int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
                    ba[d] +=
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d*STRIDE_PBB+i];
                }
            }
        }
#else
        for (int s = 0; s < grid->nsubc[c]; s++)
        {
            int cs = c*c_gpuNumClusterPerCell + s;
            for (int d = 0; d < DIM; d++)
            {
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx/c_gpuNumClusterPerCellX,
            grid->sy/c_gpuNumClusterPerCellY,
            grid->atom_density > 0 ?
            grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
            ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
}
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int        numAtomsInCluster,
                                 int        atomStart,
                                 int        atomEnd,
                                 const int *atinfo,
                                 int       *order,
                                 int       *flags)
{
    constexpr int c_maxNumAtomsInCluster = 8;
    int           sort1[c_maxNumAtomsInCluster];
    int           sort2[c_maxNumAtomsInCluster];

    GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");

    *flags = 0;

    int subc = 0;
    for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        int      n1       = 0;
        int      n2       = 0;
        gmx_bool haveQ    = FALSE;
        int      a_lj_max = -1;
        for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= numAtomsInCluster)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= numAtomsInCluster)
                {
                    for (int i = 0; i < n1; i++)
                    {
                        order[atomStart + i] = sort1[i];
                    }
                    for (int j = 0; j < n2; j++)
                    {
                        order[atomStart + n1 + j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
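/* Illustrative example for sort_cluster_on_flag(): with a cluster of
 * numAtomsInCluster = 8 atoms of which n1 = 3 have LJ parameters,
 * 2*n1 <= 8 holds, so the LJ atoms can be moved to the front of the
 * cluster and NBNXN_CI_HALF_LJ is set in addition to NBNXN_CI_DO_LJ,
 * allowing kernels to skip LJ work for the second half of the cluster.
 * With n1 = 5 the cluster order is left untouched and only NBNXN_CI_DO_LJ
 * (plus NBNXN_CI_DO_COUL if any atom has charge) is set.
 */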
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(const nbnxn_search_t   nbs,
                      nbnxn_grid_t          *grid,
                      nbnxn_atomdata_t      *nbat,
                      int                    atomStart,
                      int                    atomEnd,
                      const int             *atinfo,
                      const rvec            *x,
                      nbnxn_bb_t gmx_unused *bb_work_aligned)
{
    const int numAtoms = atomEnd - atomStart;

    if (grid->bSimple)
    {
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
                             grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
    }

    if (nbs->bFEP)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
        grid->fep[cell] = 0;
        for (int at = atomStart; at < atomEnd; at++)
        {
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
            {
                grid->fep[cell] |= (1 << (at - atomStart));
            }
        }
    }

    /* Now we have sorted the atoms, set the cell indices */
    for (int at = atomStart; at < atomEnd; at++)
    {
        nbs->cell[nbs->a[at]] = at;
    }

    copy_rvec_to_nbat_real(nbs->a + atomStart, numAtoms, grid->na_c, x,
                           nbat->XFormat, nbat->x, atomStart);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

#if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
                                          grid->bbj.data() + offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

        calc_bounding_box_x_x8(numAtoms, nbat->x + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
    }
#if NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        float *pbb_ptr =
            grid->pbb.data() +
            ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));

#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
                                   pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);

        calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t  nbs,
                                int                   dd_zone,
                                nbnxn_grid_t         *grid,
                                int atomStart, int atomEnd,
                                const int            *atinfo,
                                const rvec           *x,
                                nbnxn_atomdata_t     *nbat,
                                int cxy_start, int cxy_end,
                                int                  *sort_work)
{
    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, na, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], ncz*grid->na_sc,
                   sort_work);

        /* Fill the ncz cells in this column */
        int cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
        {
            int c     = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na - (ash_c - ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c + na_c, atinfo, x,
                      nullptr);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
        }

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash + ind] = -1;
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t  nbs,
                                  int                   dd_zone,
                                  nbnxn_grid_t         *grid,
                                  int atomStart, int atomEnd,
                                  const int            *atinfo,
                                  const rvec           *x,
                                  nbnxn_atomdata_t     *nbat,
                                  int cxy_start, int cxy_end,
                                  int                  *sort_work)
{
    nbnxn_bb_t  bb_work_array[2];
    nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    int subdiv_x = grid->na_c;
    int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
    int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions.
     * Loop over all columns on the x/y grid.
     */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int gridX            = cxy/grid->ncy;
        int gridY            = cxy - gridX*grid->ncy;

        int numAtomsInColumn = grid->cxy_na[cxy];
        int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
        int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, numAtomsInColumn, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
                   sort_work);

        /* This loop goes over the cells and clusters along z at once */
        for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
        {
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
            int cz    = -1;
            /* We have already sorted on z */

            if (sub_z % c_gpuNumClusterPerCellZ == 0)
            {
                cz = sub_z/c_gpuNumClusterPerCellZ;
                int cell = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this cell/super-cluster */
                int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));

                grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
                                             (numAtoms + grid->na_c - 1)/grid->na_c);

                /* Store the z-boundaries of the bounding box of the cell */
                grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
            }

            if (c_gpuNumClusterPerCellY > 1)
            {
                /* Sort the atoms along y */
                sort_atoms(YY, (sub_z & 1), dd_zone,
                           nbs->a + ash_z, na_z, x,
                           grid->c0[YY] + gridY*grid->sy,
                           grid->inv_sy, subdiv_z,
                           sort_work);
            }

            for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
            {
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));

                if (c_gpuNumClusterPerCellX > 1)
                {
                    /* Sort the atoms along x */
                    sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
                               nbs->a + ash_y, na_y, x,
                               grid->c0[XX] + gridX*grid->sx,
                               grid->inv_sx, subdiv_y,
                               sort_work);
                }

                for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
                {
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x + na_x, atinfo, x,
                              bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
        {
            nbs->a[ash + ind] = -1;
        }
    }
}
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int atomStart, int atomEnd,
                                const rvec *x,
                                int dd_zone, const int *move,
                                int thread, int nthread,
                                int *cell,
                                int *cxy_na)
{
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
    int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
    if (dd_zone == 0)
    {
        /* Home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            if (move == nullptr || move[i] >= 0)
            {
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
                }
                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->ncx - 1);
                cy = std::min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the grid-local
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;
            }

            cxy_na[cell[i]]++;
        }
    }
    else
    {
        /* Non-home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->ncx - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

            cxy_na[cell[i]]++;
        }
    }
}
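/* Worked example for calc_column_indices(): on a grid with ncy = 5 columns
 * along y, an atom that falls in column cx = 2, cy = 3 gets the flattened
 * column index cell[i] = cx*ncy + cy = 13. The ordering along z within the
 * column is only established later by sort_columns_simple/supersub().
 */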
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t  nbs,
                              int                   ddZone,
                              nbnxn_grid_t         *grid,
                              int                   atomStart,
                              int                   atomEnd,
                              const int            *atinfo,
                              const rvec           *x,
                              const int            *move,
                              nbnxn_atomdata_t     *nbat)
{
    const int nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Make the cell index as a function of x and y */
    int ncz_max      = 0;
    int ncz          = 0;
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->ncx*grid->ncy + 1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy which are moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        int cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            int i = 0;
            for (int cy = 0; cy < grid->ncy; cy++)
            {
                for (int cx = 0; cx < grid->ncx; cx++)
                {
                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for (int thread = 0; thread < nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
            {
                nbs->work[thread].sort_work[i] = -1;
            }
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = atomStart; i < atomEnd; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        int cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    if (ddZone == 0)
    {
        /* Set the cell indices for the moved particles */
        int n0 = grid->nc*grid->na_sc;
        int n1 = grid->nc*grid->na_sc + grid->cxy_na[grid->ncx*grid->ncy];
        for (int i = n0; i < n1; i++)
        {
            nbs->cell[nbs->a[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            if (grid->bSimple)
            {
                sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                    ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                    ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                    nbs->work[thread].sort_work);
            }
            else
            {
                sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                      ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                      ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                      nbs->work[thread].sort_work);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb);
    }

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug, grid);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (atomEnd - atomStart)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, grid);
        }
    }
}
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t    nbs,
                       int ePBC, const matrix box,
                       int               ddZone,
                       const rvec        lowerCorner,
                       const rvec        upperCorner,
                       int atomStart, int atomEnd,
                       real              atomDensity,
                       const int        *atinfo,
                       const rvec       *x,
                       int               numAtomsMoved,
                       const int        *move,
                       int               nb_kernel_type,
                       nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid = &nbs->grid[ddZone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

    if (ddZone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
            nbs->grid[ddZone - 1].na_sc/grid->na_sc;
    }

    const int n = atomEnd - atomStart;

    if (ddZone == 0)
    {
        nbs->ePBC = ePBC;
        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atomDensity > 0)
        {
            grid->atom_density = atomDensity;
        }
        else
        {
            grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
        }

        nbs->natoms_local    = atomEnd - numAtomsMoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (ddZone=0).
         */
        nbs->natoms_nonlocal = atomEnd - numAtomsMoved;

        if (debug)
        {
            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);
        }
    }
    else
    {
        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
    }

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    int maxNumCellsOnThisGrid =
        set_grid_size_xy(nbs, grid,
                         ddZone, n - numAtomsMoved,
                         lowerCorner, upperCorner,
                         nbs->grid[0].atom_density);

    int maxNumCells = grid->cell0 + maxNumCellsOnThisGrid;

    if (atomEnd > nbs->cell_nalloc)
    {
        nbs->cell_nalloc = over_alloc_large(atomEnd);
        srenew(nbs->cell, nbs->cell_nalloc);
    }

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (maxNumCells*grid->na_sc + numAtomsMoved > nbs->a_nalloc)
    {
        nbs->a_nalloc = over_alloc_large(maxNumCells*grid->na_sc + numAtomsMoved);
        srenew(nbs->a, nbs->a_nalloc);
    }

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
    {
        nbnxn_atomdata_realloc(nbat,
                               maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE);
    }

    calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, move, nbat);

    if (ddZone == 0)
    {
        nbat->natoms_local = nbat->natoms;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
                                const struct gmx_domdec_zones_t *zones,
                                const int                       *atinfo,
                                const rvec                      *x,
                                int                              nb_kernel_type,
                                nbnxn_atomdata_t                *nbat)
{
    for (int zone = 1; zone < zones->n; zone++)
    {
        rvec c0, c1;

        for (int d = 0; d < DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
                          zone, c0, c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0, nullptr,
                          nb_kernel_type,
                          nbat);
    }
}
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
{
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;
}
void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
{
    const nbnxn_grid_t *grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *a = nbs->a;

    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
}
void nbnxn_set_atomorder(nbnxn_search_t nbs)
{
    /* Set the atom order for the home cell (index 0) */
    nbnxn_grid_t *grid = &nbs->grid[0];

    for (int cx = 0; cx < grid->ncx; cx++)
    {
        for (int cy = 0; cy < grid->ncy; cy++)
        {
            int cxy = cx*grid->ncy + cy;
            int j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)