src/gromacs/mdlib/nbnxn_grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "nbnxn_grid.h"
  39
  40 #include <string.h>
  41
  42 #include <cmath>
  43
  44 #include <algorithm>
  45
  46 #include "gromacs/domdec/domdec_struct.h"
  47 #include "gromacs/math/utilities.h"
  48 #include "gromacs/math/vec.h"
  49 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  50 #include "gromacs/mdlib/nb_verlet.h"
  51 #include "gromacs/mdlib/nbnxn_atomdata.h"
  52 #include "gromacs/mdlib/nbnxn_consts.h"
  53 #include "gromacs/mdlib/nbnxn_internal.h"
  54 #include "gromacs/mdlib/nbnxn_search.h"
  55 #include "gromacs/mdlib/nbnxn_util.h"
  56 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  57 #include "gromacs/simd/simd.h"
  58 #include "gromacs/simd/vector_operations.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/smalloc.h"
  61
  62 struct gmx_domdec_zones_t;
  63
  64 static real grid_atom_density(int        numAtoms,
  65                               const rvec lowerCorner,
  66                               const rvec upperCorner)
  67 {
  68     rvec size;
  69
  70     if (numAtoms == 0)
  71     {
  72         /* To avoid zero density we use a minimum of 1 atom */
  73         numAtoms = 1;
  74     }
  75
  76     rvec_sub(upperCorner, lowerCorner, size);
  77
  78     return numAtoms/(size[XX]*size[YY]*size[ZZ]);
  79 }
  80
  81 static void set_grid_size_xy(const nbnxn_search_t  nbs,
  82                              nbnxn_grid_t         *grid,
  83                              int                   ddZone,
  84                              int                   numAtoms,
  85                              const rvec            lowerCorner,
  86                              const rvec            upperCorner,
  87                              real                  atomDensity)
  88 {
  89     rvec size;
  90     real tlen, tlen_x, tlen_y;
  91
  92     rvec_sub(upperCorner, lowerCorner, size);
  93
  94     if (numAtoms > grid->na_sc)
  95     {
  96         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
  97
  98         /* target cell length */
  99         if (grid->bSimple)
 100         {
 101             /* To minimize the zero interactions, we should make
 102              * the largest of the i/j cell cubic.
 103              */
 104             int numAtomsInCell = std::max(grid->na_c, grid->na_cj);
 105
 106             /* Approximately cubic cells */
 107             tlen   = std::cbrt(numAtomsInCell/atomDensity);
 108             tlen_x = tlen;
 109             tlen_y = tlen;
 110         }
 111         else
 112         {
 113             /* Approximately cubic sub cells */
 114             tlen   = std::cbrt(grid->na_c/atomDensity);
 115             tlen_x = tlen*c_gpuNumClusterPerCellX;
 116             tlen_y = tlen*c_gpuNumClusterPerCellY;
 117         }
 118         /* We round ncx and ncy down, because we get less cell pairs
 119          * in the nbsist when the fixed cell dimensions (x,y) are
 120          * larger than the variable one (z) than the other way around.
 121          */
 122         grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
 123         grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
 124     }
 125     else
 126     {
 127         grid->ncx = 1;
 128         grid->ncy = 1;
 129     }
 130
 131     grid->sx     = size[XX]/grid->ncx;
 132     grid->sy     = size[YY]/grid->ncy;
 133     grid->inv_sx = 1/grid->sx;
 134     grid->inv_sy = 1/grid->sy;
 135
 136     if (ddZone > 0)
 137     {
 138         /* This is a non-home zone, add an extra row of cells
 139          * for particles communicated for bonded interactions.
 140          * These can be beyond the cut-off. It doesn't matter where
 141          * they end up on the grid, but for performance it's better
 142          * if they don't end up in cells that can be within cut-off range.
 143          */
 144         grid->ncx++;
 145         grid->ncy++;
 146     }
 147
 148     /* We need one additional cell entry for particles moved by DD */
 149     grid->cxy_na.resize(grid->ncx*grid->ncy + 1);
 150     grid->cxy_ind.resize(grid->ncx*grid->ncy + 2);
 151
 152     for (int t = 0; t < nbs->nthread_max; t++)
 153     {
 154         if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 155         {
 156             nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 157             srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
 158         }
 159     }
 160
 161     /* Worst case scenario of 1 atom in each last cell */
 162     int maxNumCells;
 163     if (grid->na_cj <= grid->na_c)
 164     {
 165         maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy;
 166     }
 167     else
 168     {
 169         maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 170     }
 171
 172     grid->nsubc.resize(maxNumCells);
 173     grid->bbcz.resize(maxNumCells*NNBSBB_D);
 174
 175     /* This resize also zeros the contents, this avoid possible
 176      * floating exceptions in SIMD with the unused bb elements.
 177      */
 178     if (grid->bSimple)
 179     {
 180         grid->bb.resize(maxNumCells);
 181     }
 182     else
 183     {
 184 #if NBNXN_BBXXXX
 185         grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
 186 #else
 187         grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
 188 #endif
 189     }
 190
 191     if (grid->bSimple)
 192     {
 193         if (grid->na_cj == grid->na_c)
 194         {
 195             grid->bbj = grid->bb;
 196         }
 197         else
 198         {
 199             grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
 200             grid->bbj = grid->bbjStorage;
 201         }
 202     }
 203
 204     grid->flags.resize(maxNumCells);
 205     if (nbs->bFEP)
 206     {
 207         grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
 208     }
 209
 210     copy_rvec(lowerCorner, grid->c0);
 211     copy_rvec(upperCorner, grid->c1);
 212     copy_rvec(size,        grid->size);
 213 }
 214
 215 /* We need to sort paricles in grid columns on z-coordinate.
 216  * As particle are very often distributed homogeneously, we a sorting
 217  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 218  * by a factor, cast to an int and try to store in that hole. If the hole
 219  * is full, we move this or another particle. A second pass is needed to make
 220  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 221  * 4 is the optimal value for homogeneous particle distribution and allows
 222  * for an O(#particles) sort up till distributions were all particles are
 223  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 224  * as it can be expensive to detect imhomogeneous particle distributions.
 225  * SGSF is the maximum ratio of holes used, in the worst case all particles
 226  * end up in the last hole and we need #particles extra holes at the end.
 227  */
 228 #define SORT_GRID_OVERSIZE 4
 229 #define SGSF (SORT_GRID_OVERSIZE + 1)
 230
 231 /* Sort particle index a on coordinates x along dim.
 232  * Backwards tells if we want decreasing iso increasing coordinates.
 233  * h0 is the minimum of the coordinate range.
 234  * invh is the 1/length of the sorting range.
 235  * n_per_h (>=n) is the expected average number of particles per 1/invh
 236  * sort is the sorting work array.
 237  * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 238  * or easier, allocate at least n*SGSF elements.
 239  */
 240 static void sort_atoms(int dim, gmx_bool Backwards,
 241                        int gmx_unused dd_zone,
 242                        int *a, int n, const rvec *x,
 243                        real h0, real invh, int n_per_h,
 244                        int *sort)
 245 {
 246     if (n <= 1)
 247     {
 248         /* Nothing to do */
 249         return;
 250     }
 251
 252 #ifndef NDEBUG
 253     if (n > n_per_h)
 254     {
 255         gmx_incons("n > n_per_h");
 256     }
 257 #endif
 258
 259     /* Transform the inverse range height into the inverse hole height */
 260     invh *= n_per_h*SORT_GRID_OVERSIZE;
 261
 262     /* Set nsort to the maximum possible number of holes used.
 263      * In worst case all n elements end up in the last bin.
 264      */
 265     int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
 266
 267     /* Determine the index range used, so we can limit it for the second pass */
 268     int zi_min = INT_MAX;
 269     int zi_max = -1;
 270
 271     /* Sort the particles using a simple index sort */
 272     for (int i = 0; i < n; i++)
 273     {
 274         /* The cast takes care of float-point rounding effects below zero.
 275          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 276          * times the box height out of the box.
 277          */
 278         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 279
 280 #ifndef NDEBUG
 281         /* As we can have rounding effect, we use > iso >= here */
 282         if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
 283         {
 284             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 285                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 286                       n_per_h, SORT_GRID_OVERSIZE);
 287         }
 288 #endif
 289
 290         /* In a non-local domain, particles communcated for bonded interactions
 291          * can be far beyond the grid size, which is set by the non-bonded
 292          * cut-off distance. We sort such particles into the last cell.
 293          */
 294         if (zi > n_per_h*SORT_GRID_OVERSIZE)
 295         {
 296             zi = n_per_h*SORT_GRID_OVERSIZE;
 297         }
 298
 299         /* Ideally this particle should go in sort cell zi,
 300          * but that might already be in use,
 301          * in that case find the first empty cell higher up
 302          */
 303         if (sort[zi] < 0)
 304         {
 305             sort[zi] = a[i];
 306             zi_min   = std::min(zi_min, zi);
 307             zi_max   = std::max(zi_max, zi);
 308         }
 309         else
 310         {
 311             /* We have multiple atoms in the same sorting slot.
 312              * Sort on real z for minimal bounding box size.
 313              * There is an extra check for identical z to ensure
 314              * well-defined output order, independent of input order
 315              * to ensure binary reproducibility after restarts.
 316              */
 317             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 318                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 319                                        a[i] > sort[zi])))
 320             {
 321                 zi++;
 322             }
 323
 324             if (sort[zi] >= 0)
 325             {
 326                 /* Shift all elements by one slot until we find an empty slot */
 327                 int cp  = sort[zi];
 328                 int zim = zi + 1;
 329                 while (sort[zim] >= 0)
 330                 {
 331                     int tmp   = sort[zim];
 332                     sort[zim] = cp;
 333                     cp        = tmp;
 334                     zim++;
 335                 }
 336                 sort[zim] = cp;
 337                 zi_max    = std::max(zi_max, zim);
 338             }
 339             sort[zi] = a[i];
 340             zi_max   = std::max(zi_max, zi);
 341         }
 342     }
 343
 344     int c = 0;
 345     if (!Backwards)
 346     {
 347         for (int zi = 0; zi < nsort; zi++)
 348         {
 349             if (sort[zi] >= 0)
 350             {
 351                 a[c++]   = sort[zi];
 352                 sort[zi] = -1;
 353             }
 354         }
 355     }
 356     else
 357     {
 358         for (int zi = zi_max; zi >= zi_min; zi--)
 359         {
 360             if (sort[zi] >= 0)
 361             {
 362                 a[c++]   = sort[zi];
 363                 sort[zi] = -1;
 364             }
 365         }
 366     }
 367     if (c < n)
 368     {
 369         gmx_incons("Lost particles while sorting");
 370     }
 371 }
 372
 373 #if GMX_DOUBLE
 374 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 375 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 376 #else
 377 #define R2F_D(x) (x)
 378 #define R2F_U(x) (x)
 379 #endif
 380
 381 /* Coordinate order x,y,z, bb order xyz0 */
 382 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
 383 {
 384     int  i;
 385     real xl, xh, yl, yh, zl, zh;
 386
 387     i  = 0;
 388     xl = x[i+XX];
 389     xh = x[i+XX];
 390     yl = x[i+YY];
 391     yh = x[i+YY];
 392     zl = x[i+ZZ];
 393     zh = x[i+ZZ];
 394     i += stride;
 395     for (int j = 1; j < na; j++)
 396     {
 397         xl = std::min(xl, x[i+XX]);
 398         xh = std::max(xh, x[i+XX]);
 399         yl = std::min(yl, x[i+YY]);
 400         yh = std::max(yh, x[i+YY]);
 401         zl = std::min(zl, x[i+ZZ]);
 402         zh = std::max(zh, x[i+ZZ]);
 403         i += stride;
 404     }
 405     /* Note: possible double to float conversion here */
 406     bb->lower[BB_X] = R2F_D(xl);
 407     bb->lower[BB_Y] = R2F_D(yl);
 408     bb->lower[BB_Z] = R2F_D(zl);
 409     bb->upper[BB_X] = R2F_U(xh);
 410     bb->upper[BB_Y] = R2F_U(yh);
 411     bb->upper[BB_Z] = R2F_U(zh);
 412 }
 413
 414 /* Packed coordinates, bb order xyz0 */
 415 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
 416 {
 417     real xl, xh, yl, yh, zl, zh;
 418
 419     xl = x[XX*c_packX4];
 420     xh = x[XX*c_packX4];
 421     yl = x[YY*c_packX4];
 422     yh = x[YY*c_packX4];
 423     zl = x[ZZ*c_packX4];
 424     zh = x[ZZ*c_packX4];
 425     for (int j = 1; j < na; j++)
 426     {
 427         xl = std::min(xl, x[j+XX*c_packX4]);
 428         xh = std::max(xh, x[j+XX*c_packX4]);
 429         yl = std::min(yl, x[j+YY*c_packX4]);
 430         yh = std::max(yh, x[j+YY*c_packX4]);
 431         zl = std::min(zl, x[j+ZZ*c_packX4]);
 432         zh = std::max(zh, x[j+ZZ*c_packX4]);
 433     }
 434     /* Note: possible double to float conversion here */
 435     bb->lower[BB_X] = R2F_D(xl);
 436     bb->lower[BB_Y] = R2F_D(yl);
 437     bb->lower[BB_Z] = R2F_D(zl);
 438     bb->upper[BB_X] = R2F_U(xh);
 439     bb->upper[BB_Y] = R2F_U(yh);
 440     bb->upper[BB_Z] = R2F_U(zh);
 441 }
 442
 443 /* Packed coordinates, bb order xyz0 */
 444 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
 445 {
 446     real xl, xh, yl, yh, zl, zh;
 447
 448     xl = x[XX*c_packX8];
 449     xh = x[XX*c_packX8];
 450     yl = x[YY*c_packX8];
 451     yh = x[YY*c_packX8];
 452     zl = x[ZZ*c_packX8];
 453     zh = x[ZZ*c_packX8];
 454     for (int j = 1; j < na; j++)
 455     {
 456         xl = std::min(xl, x[j+XX*c_packX8]);
 457         xh = std::max(xh, x[j+XX*c_packX8]);
 458         yl = std::min(yl, x[j+YY*c_packX8]);
 459         yh = std::max(yh, x[j+YY*c_packX8]);
 460         zl = std::min(zl, x[j+ZZ*c_packX8]);
 461         zh = std::max(zh, x[j+ZZ*c_packX8]);
 462     }
 463     /* Note: possible double to float conversion here */
 464     bb->lower[BB_X] = R2F_D(xl);
 465     bb->lower[BB_Y] = R2F_D(yl);
 466     bb->lower[BB_Z] = R2F_D(zl);
 467     bb->upper[BB_X] = R2F_U(xh);
 468     bb->upper[BB_Y] = R2F_U(yh);
 469     bb->upper[BB_Z] = R2F_U(zh);
 470 }
 471
 472 /* Packed coordinates, bb order xyz0 */
 473 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 474                                                      nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
 475 {
 476     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 477     using namespace gmx;
 478
 479     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 480
 481     if (na > 2)
 482     {
 483         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 484     }
 485     else
 486     {
 487         /* Set the "empty" bounding box to the same as the first one,
 488          * so we don't need to treat special cases in the rest of the code.
 489          */
 490 #if NBNXN_SEARCH_BB_SIMD4
 491         store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
 492         store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
 493 #else
 494         bbj[1] = bbj[0];
 495 #endif
 496     }
 497
 498 #if NBNXN_SEARCH_BB_SIMD4
 499     store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
 500     store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
 501 #else
 502     {
 503         int i;
 504
 505         for (i = 0; i < NNBSBB_C; i++)
 506         {
 507             bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
 508             bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
 509         }
 510     }
 511 #endif
 512 }
 513
 514 #if NBNXN_SEARCH_BB_SIMD4
 515
 516 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
 517 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 518 {
 519     int  i;
 520     real xl, xh, yl, yh, zl, zh;
 521
 522     i  = 0;
 523     xl = x[i+XX];
 524     xh = x[i+XX];
 525     yl = x[i+YY];
 526     yh = x[i+YY];
 527     zl = x[i+ZZ];
 528     zh = x[i+ZZ];
 529     i += stride;
 530     for (int j = 1; j < na; j++)
 531     {
 532         xl = std::min(xl, x[i+XX]);
 533         xh = std::max(xh, x[i+XX]);
 534         yl = std::min(yl, x[i+YY]);
 535         yh = std::max(yh, x[i+YY]);
 536         zl = std::min(zl, x[i+ZZ]);
 537         zh = std::max(zh, x[i+ZZ]);
 538         i += stride;
 539     }
 540     /* Note: possible double to float conversion here */
 541     bb[0*STRIDE_PBB] = R2F_D(xl);
 542     bb[1*STRIDE_PBB] = R2F_D(yl);
 543     bb[2*STRIDE_PBB] = R2F_D(zl);
 544     bb[3*STRIDE_PBB] = R2F_U(xh);
 545     bb[4*STRIDE_PBB] = R2F_U(yh);
 546     bb[5*STRIDE_PBB] = R2F_U(zh);
 547 }
 548
 549 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 550
 551 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 552
 553 /* Coordinate order xyz?, bb order xyz0 */
 554 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
 555 {
 556     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 557     using namespace gmx;
 558
 559     Simd4Float bb_0_S, bb_1_S;
 560     Simd4Float x_S;
 561
 562     bb_0_S = load4(x);
 563     bb_1_S = bb_0_S;
 564
 565     for (int i = 1; i < na; i++)
 566     {
 567         x_S    = load4(x+i*NNBSBB_C);
 568         bb_0_S = min(bb_0_S, x_S);
 569         bb_1_S = max(bb_1_S, x_S);
 570     }
 571
 572     store4(&bb->lower[0], bb_0_S);
 573     store4(&bb->upper[0], bb_1_S);
 574 }
 575
 576 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 577 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 578                                          nbnxn_bb_t *bb_work_aligned,
 579                                          real *bb)
 580 {
 581     calc_bounding_box_simd4(na, x, bb_work_aligned);
 582
 583     bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
 584     bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
 585     bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
 586     bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
 587     bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
 588     bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
 589 }
 590
 591 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 592
 593
 594 /* Combines pairs of consecutive bounding boxes */
 595 static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
 596                                        gmx::ArrayRef<const nbnxn_bb_t>  bb)
 597 {
 598     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 599     using namespace gmx;
 600
 601     for (int i = 0; i < grid->ncx*grid->ncy; i++)
 602     {
 603         /* Starting bb in a column is expected to be 2-aligned */
 604         int sc2 = grid->cxy_ind[i]>>1;
 605         /* For odd numbers skip the last bb here */
 606         int nc2 = (grid->cxy_na[i]+3)>>(2+1);
 607         int c2;
 608         for (c2 = sc2; c2 < sc2+nc2; c2++)
 609         {
 610 #if NBNXN_SEARCH_BB_SIMD4
 611             Simd4Float min_S, max_S;
 612
 613             min_S = min(load4(&bb[c2*2+0].lower[0]),
 614                         load4(&bb[c2*2+1].lower[0]));
 615             max_S = max(load4(&bb[c2*2+0].upper[0]),
 616                         load4(&bb[c2*2+1].upper[0]));
 617             store4(&grid->bbj[c2].lower[0], min_S);
 618             store4(&grid->bbj[c2].upper[0], max_S);
 619 #else
 620             for (int j = 0; j < NNBSBB_C; j++)
 621             {
 622                 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
 623                                                   bb[c2*2+1].lower[j]);
 624                 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
 625                                                   bb[c2*2+1].upper[j]);
 626             }
 627 #endif
 628         }
 629         if (((grid->cxy_na[i]+3)>>2) & 1)
 630         {
 631             /* The bb count in this column is odd: duplicate the last bb */
 632             for (int j = 0; j < NNBSBB_C; j++)
 633             {
 634                 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
 635                 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
 636             }
 637         }
 638     }
 639 }
 640
 641
 642 /* Prints the average bb size, used for debug output */
 643 static void print_bbsizes_simple(FILE                *fp,
 644                                  const nbnxn_grid_t  *grid)
 645 {
 646     dvec ba;
 647
 648     clear_dvec(ba);
 649     for (int c = 0; c < grid->nc; c++)
 650     {
 651         for (int d = 0; d < DIM; d++)
 652         {
 653             ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
 654         }
 655     }
 656     dsvmul(1.0/grid->nc, ba, ba);
 657
 658     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 659             grid->sx,
 660             grid->sy,
 661             grid->atom_density > 0 ?
 662             grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
 663             ba[XX], ba[YY], ba[ZZ],
 664             ba[XX]/grid->sx,
 665             ba[YY]/grid->sy,
 666             grid->atom_density > 0 ?
 667             ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
 668 }
 669
 670 /* Prints the average bb size, used for debug output */
 671 static void print_bbsizes_supersub(FILE                *fp,
 672                                    const nbnxn_grid_t  *grid)
 673 {
 674     int  ns;
 675     dvec ba;
 676
 677     clear_dvec(ba);
 678     ns = 0;
 679     for (int c = 0; c < grid->nc; c++)
 680     {
 681 #if NBNXN_BBXXXX
 682         for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 683         {
 684             int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
 685             for (int i = 0; i < STRIDE_PBB; i++)
 686             {
 687                 for (int d = 0; d < DIM; d++)
 688                 {
 689                     ba[d] +=
 690                         grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 691                         grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 692                 }
 693             }
 694         }
 695 #else
 696         for (int s = 0; s < grid->nsubc[c]; s++)
 697         {
 698             int cs = c*c_gpuNumClusterPerCell + s;
 699             for (int d = 0; d < DIM; d++)
 700             {
 701                 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
 702             }
 703         }
 704 #endif
 705         ns += grid->nsubc[c];
 706     }
 707     dsvmul(1.0/ns, ba, ba);
 708
 709     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 710             grid->sx/c_gpuNumClusterPerCellX,
 711             grid->sy/c_gpuNumClusterPerCellY,
 712             grid->atom_density > 0 ?
 713             grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
 714             ba[XX], ba[YY], ba[ZZ],
 715             ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
 716             ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
 717             grid->atom_density > 0 ?
 718             ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
 719 }
 720
 721 /* Set non-bonded interaction flags for the current cluster.
 722  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 723  */
 724 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 725                                  int                 atomStart,
 726                                  int                 atomEnd,
 727                                  const int          *atinfo,
 728                                  gmx::ArrayRef<int>  order,
 729                                  int                *flags)
 730 {
 731     constexpr int c_maxNumAtomsInCluster = 8;
 732     int           sort1[c_maxNumAtomsInCluster];
 733     int           sort2[c_maxNumAtomsInCluster];
 734
 735     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 736
 737     *flags = 0;
 738
 739     int subc = 0;
 740     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 741     {
 742         /* Make lists for this (sub-)cell on atoms with and without LJ */
 743         int      n1         = 0;
 744         int      n2         = 0;
 745         gmx_bool haveQ      = FALSE;
 746         int      a_lj_max   = -1;
 747         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 748         {
 749             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 750
 751             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 752             {
 753                 sort1[n1++] = order[a];
 754                 a_lj_max    = a;
 755             }
 756             else
 757             {
 758                 sort2[n2++] = order[a];
 759             }
 760         }
 761
 762         /* If we don't have atoms with LJ, there's nothing to sort */
 763         if (n1 > 0)
 764         {
 765             *flags |= NBNXN_CI_DO_LJ(subc);
 766
 767             if (2*n1 <= numAtomsInCluster)
 768             {
 769                 /* Only sort when strictly necessary. Ordering particles
 770                  * Ordering particles can lead to less accurate summation
 771                  * due to rounding, both for LJ and Coulomb interactions.
 772                  */
 773                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 774                 {
 775                     for (int i = 0; i < n1; i++)
 776                     {
 777                         order[atomStart + i]      = sort1[i];
 778                     }
 779                     for (int j = 0; j < n2; j++)
 780                     {
 781                         order[atomStart + n1 + j] = sort2[j];
 782                     }
 783                 }
 784
 785                 *flags |= NBNXN_CI_HALF_LJ(subc);
 786             }
 787         }
 788         if (haveQ)
 789         {
 790             *flags |= NBNXN_CI_DO_COUL(subc);
 791         }
 792         subc++;
 793     }
 794 }
 795
 796 /* Fill a pair search cell with atoms.
 797  * Potentially sorts atoms and sets the interaction flags.
 798  */
 799 static void fill_cell(const nbnxn_search_t   nbs,
 800                       nbnxn_grid_t          *grid,
 801                       nbnxn_atomdata_t      *nbat,
 802                       int                    atomStart,
 803                       int                    atomEnd,
 804                       const int             *atinfo,
 805                       const rvec            *x,
 806                       nbnxn_bb_t gmx_unused *bb_work_aligned)
 807 {
 808     const int numAtoms = atomEnd - atomStart;
 809
 810     if (grid->bSimple)
 811     {
 812         /* Note that non-local grids are already sorted.
 813          * Then sort_cluster_on_flag will only set the flags and the sorting
 814          * will not affect the atom order.
 815          */
 816         sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
 817                              grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
 818     }
 819
 820     if (nbs->bFEP)
 821     {
 822         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 823
 824         /* The grid-local cluster/(sub-)cell index */
 825         int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
 826         grid->fep[cell] = 0;
 827         for (int at = atomStart; at < atomEnd; at++)
 828         {
 829             if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
 830             {
 831                 grid->fep[cell] |= (1 << (at - atomStart));
 832             }
 833         }
 834     }
 835
 836     /* Now we have sorted the atoms, set the cell indices */
 837     for (int at = atomStart; at < atomEnd; at++)
 838     {
 839         nbs->cell[nbs->a[at]] = at;
 840     }
 841
 842     copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c, x,
 843                            nbat->XFormat, nbat->x, atomStart);
 844
 845     if (nbat->XFormat == nbatX4)
 846     {
 847         /* Store the bounding boxes as xyz.xyz. */
 848         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 849         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 850
 851 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 852         if (2*grid->na_cj == grid->na_c)
 853         {
 854             calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 855                                           grid->bbj.data() + offset*2);
 856         }
 857         else
 858 #endif
 859         {
 860             calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 861         }
 862     }
 863     else if (nbat->XFormat == nbatX8)
 864     {
 865         /* Store the bounding boxes as xyz.xyz. */
 866         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 867         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 868
 869         calc_bounding_box_x_x8(numAtoms, nbat->x +  atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 870     }
 871 #if NBNXN_BBXXXX
 872     else if (!grid->bSimple)
 873     {
 874         /* Store the bounding boxes in a format convenient
 875          * for SIMD4 calculations: xxxxyyyyzzzz...
 876          */
 877         float *pbb_ptr =
 878             grid->pbb.data() +
 879             ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 880             (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));
 881
 882 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 883         if (nbat->XFormat == nbatXYZQ)
 884         {
 885             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
 886                                          bb_work_aligned, pbb_ptr);
 887         }
 888         else
 889 #endif
 890         {
 891             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
 892                                    pbb_ptr);
 893         }
 894         if (gmx_debug_at)
 895         {
 896             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 897                     atomStart >> grid->na_c_2log,
 898                     pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
 899                     pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
 900                     pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
 901         }
 902     }
 903 #endif
 904     else
 905     {
 906         /* Store the bounding boxes as xyz.xyz. */
 907         nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);
 908
 909         calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
 910                           bb_ptr);
 911
 912         if (gmx_debug_at)
 913         {
 914             int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
 915             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 916                     atomStart >> grid->na_c_2log,
 917                     grid->bb[bbo].lower[BB_X],
 918                     grid->bb[bbo].lower[BB_Y],
 919                     grid->bb[bbo].lower[BB_Z],
 920                     grid->bb[bbo].upper[BB_X],
 921                     grid->bb[bbo].upper[BB_Y],
 922                     grid->bb[bbo].upper[BB_Z]);
 923         }
 924     }
 925 }
 926
 927 /* Spatially sort the atoms within one grid column */
 928 static void sort_columns_simple(const nbnxn_search_t nbs,
 929                                 int dd_zone,
 930                                 nbnxn_grid_t *grid,
 931                                 int atomStart, int atomEnd,
 932                                 const int *atinfo,
 933                                 const rvec *x,
 934                                 nbnxn_atomdata_t *nbat,
 935                                 int cxy_start, int cxy_end,
 936                                 int *sort_work)
 937 {
 938     if (debug)
 939     {
 940         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 941                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
 942     }
 943
 944     /* Sort the atoms within each x,y column in 3 dimensions */
 945     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
 946     {
 947         int na  = grid->cxy_na[cxy];
 948         int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 949         int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 950
 951         /* Sort the atoms within each x,y column on z coordinate */
 952         sort_atoms(ZZ, FALSE, dd_zone,
 953                    nbs->a.data() + ash, na, x,
 954                    grid->c0[ZZ],
 955                    1.0/grid->size[ZZ], ncz*grid->na_sc,
 956                    sort_work);
 957
 958         /* Fill the ncz cells in this column */
 959         int cfilled = grid->cxy_ind[cxy];
 960         for (int cz = 0; cz < ncz; cz++)
 961         {
 962             int c     = grid->cxy_ind[cxy] + cz;
 963
 964             int ash_c = ash + cz*grid->na_sc;
 965             int na_c  = std::min(grid->na_sc, na-(ash_c-ash));
 966
 967             fill_cell(nbs, grid, nbat,
 968                       ash_c, ash_c+na_c, atinfo, x,
 969                       nullptr);
 970
 971             /* This copy to bbcz is not really necessary.
 972              * But it allows to use the same grid search code
 973              * for the simple and supersub cell setups.
 974              */
 975             if (na_c > 0)
 976             {
 977                 cfilled = c;
 978             }
 979             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
 980             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
 981         }
 982
 983         /* Set the unused atom indices to -1 */
 984         for (int ind = na; ind < ncz*grid->na_sc; ind++)
 985         {
 986             nbs->a[ash+ind] = -1;
 987         }
 988     }
 989 }
 990
 991 /* Spatially sort the atoms within one grid column */
 992 static void sort_columns_supersub(const nbnxn_search_t nbs,
 993                                   int dd_zone,
 994                                   nbnxn_grid_t *grid,
 995                                   int atomStart, int atomEnd,
 996                                   const int *atinfo,
 997                                   const rvec *x,
 998                                   nbnxn_atomdata_t *nbat,
 999                                   int cxy_start, int cxy_end,
1000                                   int *sort_work)
1001 {
1002     nbnxn_bb_t  bb_work_array[2];
1003     nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1004
1005     if (debug)
1006     {
1007         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1008                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
1009     }
1010
1011     int subdiv_x = grid->na_c;
1012     int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1013     int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1014
1015     /* Sort the atoms within each x,y column in 3 dimensions.
1016      * Loop over all columns on the x/y grid.
1017      */
1018     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1019     {
1020         int gridX            = cxy/grid->ncy;
1021         int gridY            = cxy - gridX*grid->ncy;
1022
1023         int numAtomsInColumn = grid->cxy_na[cxy];
1024         int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
1025         int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1026
1027         /* Sort the atoms within each x,y column on z coordinate */
1028         sort_atoms(ZZ, FALSE, dd_zone,
1029                    nbs->a.data() + ash, numAtomsInColumn, x,
1030                    grid->c0[ZZ],
1031                    1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
1032                    sort_work);
1033
1034         /* This loop goes over the cells and clusters along z at once */
1035         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1036         {
1037             int ash_z = ash + sub_z*subdiv_z;
1038             int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
1039             int cz    = -1;
1040             /* We have already sorted on z */
1041
1042             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1043             {
1044                 cz = sub_z/c_gpuNumClusterPerCellZ;
1045                 int cell = grid->cxy_ind[cxy] + cz;
1046
1047                 /* The number of atoms in this cell/super-cluster */
1048                 int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));
1049
1050                 grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
1051                                              (numAtoms + grid->na_c - 1)/grid->na_c);
1052
1053                 /* Store the z-boundaries of the bounding box of the cell */
1054                 grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1055                 grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
1056             }
1057
1058             if (c_gpuNumClusterPerCellY > 1)
1059             {
1060                 /* Sort the atoms along y */
1061                 sort_atoms(YY, (sub_z & 1), dd_zone,
1062                            nbs->a.data() + ash_z, na_z, x,
1063                            grid->c0[YY] + gridY*grid->sy,
1064                            grid->inv_sy, subdiv_z,
1065                            sort_work);
1066             }
1067
1068             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1069             {
1070                 int ash_y = ash_z + sub_y*subdiv_y;
1071                 int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));
1072
1073                 if (c_gpuNumClusterPerCellX > 1)
1074                 {
1075                     /* Sort the atoms along x */
1076                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1077                                nbs->a.data() + ash_y, na_y, x,
1078                                grid->c0[XX] + gridX*grid->sx,
1079                                grid->inv_sx, subdiv_y,
1080                                sort_work);
1081                 }
1082
1083                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1084                 {
1085                     int ash_x = ash_y + sub_x*subdiv_x;
1086                     int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));
1087
1088                     fill_cell(nbs, grid, nbat,
1089                               ash_x, ash_x + na_x, atinfo, x,
1090                               bb_work_aligned);
1091                 }
1092             }
1093         }
1094
1095         /* Set the unused atom indices to -1 */
1096         for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
1097         {
1098             nbs->a[ash + ind] = -1;
1099         }
1100     }
1101 }
1102
1103 /* Determine in which grid column atoms should go */
1104 static void calc_column_indices(nbnxn_grid_t *grid,
1105                                 int atomStart, int atomEnd,
1106                                 const rvec *x,
1107                                 int dd_zone, const int *move,
1108                                 int thread, int nthread,
1109                                 gmx::ArrayRef<int> cell,
1110                                 int *cxy_na)
1111 {
1112     /* We add one extra cell for particles which moved during DD */
1113     for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1114     {
1115         cxy_na[i] = 0;
1116     }
1117
1118     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1119     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1120     if (dd_zone == 0)
1121     {
1122         /* Home zone */
1123         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1124         {
1125             if (move == nullptr || move[i] >= 0)
1126             {
1127                 /* We need to be careful with rounding,
1128                  * particles might be a few bits outside the local zone.
1129                  * The int cast takes care of the lower bound,
1130                  * we will explicitly take care of the upper bound.
1131                  */
1132                 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1133                 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1134
1135 #ifndef NDEBUG
1136                 if (cx < 0 || cx > grid->ncx ||
1137                     cy < 0 || cy > grid->ncy)
1138                 {
1139                     gmx_fatal(FARGS,
1140                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1141                               "atom %f %f %f, grid->c0 %f %f",
1142                               cx, cy, grid->ncx, grid->ncy,
1143                               x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1144                 }
1145 #endif
1146                 /* Take care of potential rouding issues */
1147                 cx = std::min(cx, grid->ncx - 1);
1148                 cy = std::min(cy, grid->ncy - 1);
1149
1150                 /* For the moment cell will contain only the, grid local,
1151                  * x and y indices, not z.
1152                  */
1153                 cell[i] = cx*grid->ncy + cy;
1154             }
1155             else
1156             {
1157                 /* Put this moved particle after the end of the grid,
1158                  * so we can process it later without using conditionals.
1159                  */
1160                 cell[i] = grid->ncx*grid->ncy;
1161             }
1162
1163             cxy_na[cell[i]]++;
1164         }
1165     }
1166     else
1167     {
1168         /* Non-home zone */
1169         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1170         {
1171             int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1172             int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1173
1174             /* For non-home zones there could be particles outside
1175              * the non-bonded cut-off range, which have been communicated
1176              * for bonded interactions only. For the result it doesn't
1177              * matter where these end up on the grid. For performance
1178              * we put them in an extra row at the border.
1179              */
1180             cx = std::max(cx, 0);
1181             cx = std::min(cx, grid->ncx - 1);
1182             cy = std::max(cy, 0);
1183             cy = std::min(cy, grid->ncy - 1);
1184
1185             /* For the moment cell will contain only the, grid local,
1186              * x and y indices, not z.
1187              */
1188             cell[i] = cx*grid->ncy + cy;
1189
1190             cxy_na[cell[i]]++;
1191         }
1192     }
1193 }
1194
1195 /* Resizes grid and atom data which depend on the number of cells */
1196 static void resizeForNumberOfCells(const nbnxn_grid_t &grid,
1197                                    int                 numAtomsMoved,
1198                                    nbnxn_search       *nbs,
1199                                    nbnxn_atomdata_t   *nbat)
1200 {
1201     int numNbnxnAtoms = (grid.cell0 + grid.nc)*grid.na_sc;
1202
1203     /* Note: nbs->cell was already resized before */
1204
1205     /* To avoid conditionals we store the moved particles at the end of a,
1206      * make sure we have enough space.
1207      */
1208     nbs->a.resize(numNbnxnAtoms + numAtomsMoved);
1209
1210     /* We need padding up to a multiple of the buffer flag size: simply add */
1211     if (numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1212     {
1213         nbnxn_atomdata_realloc(nbat, numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE);
1214     }
1215
1216     nbat->natoms = numNbnxnAtoms;
1217 }
1218
1219 /* Determine in which grid cells the atoms should go */
1220 static void calc_cell_indices(const nbnxn_search_t  nbs,
1221                               int                   ddZone,
1222                               nbnxn_grid_t         *grid,
1223                               int                   atomStart,
1224                               int                   atomEnd,
1225                               const int            *atinfo,
1226                               const rvec           *x,
1227                               int                   numAtomsMoved,
1228                               const int            *move,
1229                               nbnxn_atomdata_t     *nbat)
1230 {
1231     /* First compute all grid/column indices and store them in nbs->cell */
1232     nbs->cell.resize(atomEnd);
1233
1234     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1235
1236 #pragma omp parallel for num_threads(nthread) schedule(static)
1237     for (int thread = 0; thread < nthread; thread++)
1238     {
1239         try
1240         {
1241             calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
1242                                 nbs->cell, nbs->work[thread].cxy_na);
1243         }
1244         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1245     }
1246
1247     /* Make the cell index as a function of x and y */
1248     int ncz_max      = 0;
1249     int ncz          = 0;
1250     grid->cxy_ind[0] = 0;
1251     for (int i = 0; i < grid->ncx*grid->ncy + 1; i++)
1252     {
1253         /* We set ncz_max at the beginning of the loop iso at the end
1254          * to skip i=grid->ncx*grid->ncy which are moved particles
1255          * that do not need to be ordered on the grid.
1256          */
1257         if (ncz > ncz_max)
1258         {
1259             ncz_max = ncz;
1260         }
1261         int cxy_na_i = nbs->work[0].cxy_na[i];
1262         for (int thread = 1; thread < nthread; thread++)
1263         {
1264             cxy_na_i += nbs->work[thread].cxy_na[i];
1265         }
1266         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1267         if (nbat->XFormat == nbatX8)
1268         {
1269             /* Make the number of cell a multiple of 2 */
1270             ncz = (ncz + 1) & ~1;
1271         }
1272         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1273         /* Clear cxy_na, so we can reuse the array below */
1274         grid->cxy_na[i] = 0;
1275     }
1276     grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1277
1278     resizeForNumberOfCells(*grid, numAtomsMoved, nbs, nbat);
1279
1280     if (debug)
1281     {
1282         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1283                 grid->na_sc, grid->na_c, grid->nc,
1284                 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1285                 ncz_max);
1286         if (gmx_debug_at)
1287         {
1288             int i = 0;
1289             for (int cy = 0; cy < grid->ncy; cy++)
1290             {
1291                 for (int cx = 0; cx < grid->ncx; cx++)
1292                 {
1293                     fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1294                     i++;
1295                 }
1296                 fprintf(debug, "\n");
1297             }
1298         }
1299     }
1300
1301     /* Make sure the work array for sorting is large enough */
1302     if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1303     {
1304         for (int thread = 0; thread < nbs->nthread_max; thread++)
1305         {
1306             nbs->work[thread].sort_work_nalloc =
1307                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1308             srenew(nbs->work[thread].sort_work,
1309                    nbs->work[thread].sort_work_nalloc);
1310             /* When not in use, all elements should be -1 */
1311             for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1312             {
1313                 nbs->work[thread].sort_work[i] = -1;
1314             }
1315         }
1316     }
1317
1318     /* Now we know the dimensions we can fill the grid.
1319      * This is the first, unsorted fill. We sort the columns after this.
1320      */
1321     for (int i = atomStart; i < atomEnd; i++)
1322     {
1323         /* At this point nbs->cell contains the local grid x,y indices */
1324         int cxy = nbs->cell[i];
1325         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1326     }
1327
1328     if (ddZone == 0)
1329     {
1330         /* Set the cell indices for the moved particles */
1331         int n0 = grid->nc*grid->na_sc;
1332         int n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1333         if (ddZone == 0)
1334         {
1335             for (int i = n0; i < n1; i++)
1336             {
1337                 nbs->cell[nbs->a[i]] = i;
1338             }
1339         }
1340     }
1341
1342     /* Sort the super-cell columns along z into the sub-cells. */
1343 #pragma omp parallel for num_threads(nthread) schedule(static)
1344     for (int thread = 0; thread < nthread; thread++)
1345     {
1346         try
1347         {
1348             if (grid->bSimple)
1349             {
1350                 sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1351                                     ((thread+0)*grid->ncx*grid->ncy)/nthread,
1352                                     ((thread+1)*grid->ncx*grid->ncy)/nthread,
1353                                     nbs->work[thread].sort_work);
1354             }
1355             else
1356             {
1357                 sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1358                                       ((thread+0)*grid->ncx*grid->ncy)/nthread,
1359                                       ((thread+1)*grid->ncx*grid->ncy)/nthread,
1360                                       nbs->work[thread].sort_work);
1361             }
1362         }
1363         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1364     }
1365
1366     if (grid->bSimple && nbat->XFormat == nbatX8)
1367     {
1368         combine_bounding_box_pairs(grid, grid->bb);
1369     }
1370
1371     if (!grid->bSimple)
1372     {
1373         grid->nsubc_tot = 0;
1374         for (int i = 0; i < grid->nc; i++)
1375         {
1376             grid->nsubc_tot += grid->nsubc[i];
1377         }
1378     }
1379
1380     if (debug)
1381     {
1382         if (grid->bSimple)
1383         {
1384             print_bbsizes_simple(debug, grid);
1385         }
1386         else
1387         {
1388             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1389                     grid->nsubc_tot, (atomEnd - atomStart)/(double)grid->nsubc_tot);
1390
1391             print_bbsizes_supersub(debug, grid);
1392         }
1393     }
1394 }
1395
1396 /* Sets up a grid and puts the atoms on the grid.
1397  * This function only operates on one domain of the domain decompostion.
1398  * Note that without domain decomposition there is only one domain.
1399  */
1400 void nbnxn_put_on_grid(nbnxn_search_t    nbs,
1401                        int               ePBC,
1402                        const matrix      box,
1403                        int               ddZone,
1404                        const rvec        lowerCorner,
1405                        const rvec        upperCorner,
1406                        int               atomStart,
1407                        int               atomEnd,
1408                        real              atomDensity,
1409                        const int        *atinfo,
1410                        const rvec       *x,
1411                        int               numAtomsMoved,
1412                        const int        *move,
1413                        int               nb_kernel_type,
1414                        nbnxn_atomdata_t *nbat)
1415 {
1416     nbnxn_grid_t *grid = &nbs->grid[ddZone];
1417
1418     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1419
1420     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1421
1422     grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1423     grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1424     grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1425     grid->na_c_2log = get_2log(grid->na_c);
1426
1427     nbat->na_c = grid->na_c;
1428
1429     if (ddZone == 0)
1430     {
1431         grid->cell0 = 0;
1432     }
1433     else
1434     {
1435         grid->cell0 =
1436             (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
1437             nbs->grid[ddZone- 1].na_sc/grid->na_sc;
1438     }
1439
1440     const int n = atomEnd - atomStart;
1441
1442     if (ddZone == 0)
1443     {
1444         nbs->ePBC = ePBC;
1445         copy_mat(box, nbs->box);
1446
1447         /* Avoid zero density */
1448         if (atomDensity > 0)
1449         {
1450             grid->atom_density = atomDensity;
1451         }
1452         else
1453         {
1454             grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
1455         }
1456
1457         grid->cell0 = 0;
1458
1459         nbs->natoms_local    = atomEnd - numAtomsMoved;
1460         /* We assume that nbnxn_put_on_grid is called first
1461          * for the local atoms (ddZone=0).
1462          */
1463         nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1464
1465         if (debug)
1466         {
1467             fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1468                     nbs->natoms_local, grid->atom_density);
1469         }
1470     }
1471     else
1472     {
1473         nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1474     }
1475
1476     /* We always use the home zone (grid[0]) for setting the cell size,
1477      * since determining densities for non-local zones is difficult.
1478      */
1479     set_grid_size_xy(nbs, grid,
1480                      ddZone, n - numAtomsMoved,
1481                      lowerCorner, upperCorner,
1482                      nbs->grid[0].atom_density);
1483
1484     calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);
1485
1486     if (ddZone == 0)
1487     {
1488         nbat->natoms_local = nbat->natoms;
1489     }
1490
1491     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1492 }
1493
1494 /* Calls nbnxn_put_on_grid for all non-local domains */
1495 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
1496                                 const struct gmx_domdec_zones_t *zones,
1497                                 const int                       *atinfo,
1498                                 const rvec                      *x,
1499                                 int                              nb_kernel_type,
1500                                 nbnxn_atomdata_t                *nbat)
1501 {
1502     for (int zone = 1; zone < zones->n; zone++)
1503     {
1504         rvec c0, c1;
1505         for (int d = 0; d < DIM; d++)
1506         {
1507             c0[d] = zones->size[zone].bb_x0[d];
1508             c1[d] = zones->size[zone].bb_x1[d];
1509         }
1510
1511         nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1512                           zone, c0, c1,
1513                           zones->cg_range[zone],
1514                           zones->cg_range[zone+1],
1515                           -1,
1516                           atinfo,
1517                           x,
1518                           0, nullptr,
1519                           nb_kernel_type,
1520                           nbat);
1521     }
1522 }
1523
1524 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1525 {
1526     *ncx = nbs->grid[0].ncx;
1527     *ncy = nbs->grid[0].ncy;
1528 }
1529
1530 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1531 {
1532     const nbnxn_grid_t *grid;
1533
1534     grid = &nbs->grid[0];
1535
1536     /* Return the atom order for the home cell (index 0) */
1537     *a  = nbs->a.data();
1538
1539     *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1540 }
1541
1542 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1543 {
1544     /* Set the atom order for the home cell (index 0) */
1545     nbnxn_grid_t *grid = &nbs->grid[0];
1546
1547     int           ao = 0;
1548     for (int cx = 0; cx < grid->ncx; cx++)
1549     {
1550         for (int cy = 0; cy < grid->ncy; cy++)
1551         {
1552             int cxy = cx*grid->ncy + cy;
1553             int j   = grid->cxy_ind[cxy]*grid->na_sc;
1554             for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1555             {
1556                 nbs->a[j]     = ao;
1557                 nbs->cell[ao] = j;
1558                 ao++;
1559                 j++;
1560             }
1561         }
1562     }
1563 }