src/gromacs/mdlib/nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <string.h>
45 #include <cmath>
47 #include <algorithm>
49 #include "gromacs/domdec/domdec_struct.h"
50 #include "gromacs/math/utilities.h"
51 #include "gromacs/math/vec.h"
52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
53 #include "gromacs/mdlib/nb_verlet.h"
54 #include "gromacs/mdlib/nbnxn_atomdata.h"
55 #include "gromacs/mdlib/nbnxn_consts.h"
56 #include "gromacs/mdlib/nbnxn_internal.h"
57 #include "gromacs/mdlib/nbnxn_search.h"
58 #include "gromacs/mdlib/nbnxn_util.h"
59 #include "gromacs/simd/simd.h"
60 #include "gromacs/simd/vector_operations.h"
61 #include "gromacs/utility/exceptions.h"
62 #include "gromacs/utility/smalloc.h"
64 struct gmx_domdec_zones_t;
66 static void nbnxn_grid_init(nbnxn_grid_t * grid)
68 grid->cxy_na = nullptr;
69 grid->cxy_ind = nullptr;
70 grid->cxy_nalloc = 0;
71 grid->bb = nullptr;
72 grid->bbj = nullptr;
73 grid->nc_nalloc = 0;
76 void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
78 int g;
80 nbs->ngrid = ngrid;
82 snew(nbs->grid, nbs->ngrid);
83 for (g = 0; g < nbs->ngrid; g++)
85 nbnxn_grid_init(&nbs->grid[g]);
89 static real grid_atom_density(int n, rvec corner0, rvec corner1)
91 rvec size;
93 if (n == 0)
95 /* To avoid zero density we use a minimum of 1 atom */
96 n = 1;
99 rvec_sub(corner1, corner0, size);
101 return n/(size[XX]*size[YY]*size[ZZ]);
104 static int set_grid_size_xy(const nbnxn_search_t nbs,
105 nbnxn_grid_t *grid,
106 int dd_zone,
107 int n, rvec corner0, rvec corner1,
108 real atom_density)
110 rvec size;
111 int na_c;
112 int nc_max;
113 real tlen, tlen_x, tlen_y;
115 rvec_sub(corner1, corner0, size);
117 if (n > grid->na_sc)
119 assert(atom_density > 0);
121 /* target cell length */
122 if (grid->bSimple)
124 /* To minimize the number of zero interactions, we should make
125 * the largest of the i/j cells cubic.
127 na_c = std::max(grid->na_c, grid->na_cj);
129 /* Approximately cubic cells */
130 tlen = std::cbrt(na_c/atom_density);
131 tlen_x = tlen;
132 tlen_y = tlen;
134 else
136 /* Approximately cubic sub cells */
137 tlen = std::cbrt(grid->na_c/atom_density);
138 tlen_x = tlen*c_gpuNumClusterPerCellX;
139 tlen_y = tlen*c_gpuNumClusterPerCellY;
141 /* We round ncx and ncy down, because we get fewer cell pairs
142 * in the pair list when the fixed cell dimensions (x,y) are
143 * larger than the variable one (z), rather than the other way around.
145 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
146 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
148 else
150 grid->ncx = 1;
151 grid->ncy = 1;
154 grid->sx = size[XX]/grid->ncx;
155 grid->sy = size[YY]/grid->ncy;
156 grid->inv_sx = 1/grid->sx;
157 grid->inv_sy = 1/grid->sy;
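/* Illustrative numbers for the cell sizing above (assumed values, not
 * from the original source): with an atom density of ~100 atoms/nm^3
 * (roughly atomistic water) and na_c = 8, the target cell length is
 * tlen = cbrt(8/100) ~= 0.43 nm, so a 5 nm box edge gives
 * ncx = ncy = (int)(5/0.43) = 11 columns and sx = sy = 5/11 ~= 0.45 nm.
 */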
159 if (dd_zone > 0)
161 /* This is a non-home zone, add an extra row of cells
162 * for particles communicated for bonded interactions.
163 * These can be beyond the cut-off. It doesn't matter where
164 * they end up on the grid, but for performance it's better
165 * if they don't end up in cells that can be within cut-off range.
167 grid->ncx++;
168 grid->ncy++;
171 /* We need one additional cell entry for particles moved by DD */
172 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
174 grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
175 srenew(grid->cxy_na, grid->cxy_nalloc);
176 srenew(grid->cxy_ind, grid->cxy_nalloc+1);
178 for (int t = 0; t < nbs->nthread_max; t++)
180 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
182 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
183 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
187 /* Worst case scenario of 1 atom in each last cell */
188 if (grid->na_cj <= grid->na_c)
190 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
192 else
194 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
197 if (nc_max > grid->nc_nalloc)
199 grid->nc_nalloc = over_alloc_large(nc_max);
200 srenew(grid->nsubc, grid->nc_nalloc);
201 srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
203 sfree_aligned(grid->bb);
204 /* This snew also zeros the contents, which avoids possible
205 * floating-point exceptions in SIMD with the unused bb elements.
207 if (grid->bSimple)
209 snew_aligned(grid->bb, grid->nc_nalloc, 16);
211 else
213 #if NBNXN_BBXXXX
214 int pbb_nalloc;
216 pbb_nalloc = grid->nc_nalloc*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX;
217 snew_aligned(grid->pbb, pbb_nalloc, 16);
218 #else
219 snew_aligned(grid->bb, grid->nc_nalloc*c_gpuNumClusterPerCell, 16);
220 #endif
223 if (grid->bSimple)
225 if (grid->na_cj == grid->na_c)
227 grid->bbj = grid->bb;
229 else
231 sfree_aligned(grid->bbj);
232 snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
236 srenew(grid->flags, grid->nc_nalloc);
237 if (nbs->bFEP)
239 srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
243 copy_rvec(corner0, grid->c0);
244 copy_rvec(corner1, grid->c1);
245 copy_rvec(size, grid->size);
247 return nc_max;
250 /* We need to sort particles in grid columns on the z-coordinate.
251 * As particles are very often distributed homogeneously, we use a sorting
252 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
253 * by a factor, cast to an int and try to store in that hole. If the hole
254 * is full, we move this or another particle. A second pass is needed to make
255 * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
256 * 4 is the optimal value for a homogeneous particle distribution and allows
257 * for an O(#particles) sort up to distributions where all particles are
258 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
259 * as it can be expensive to detect inhomogeneous particle distributions.
260 * SGSF is the maximum ratio of holes used; in the worst case all particles
261 * end up in the last hole and we need #particles extra holes at the end.
263 #define SORT_GRID_OVERSIZE 4
264 #define SGSF (SORT_GRID_OVERSIZE + 1)
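/* Minimal sketch of the scheme above (illustrative numbers only):
 * sorting n = 4 atoms over a 2.0 nm column with n_per_h = 4 gives
 *   invh  = (1/2.0)*n_per_h*SORT_GRID_OVERSIZE = 8 holes per nm
 *   nsort = n_per_h*SORT_GRID_OVERSIZE + n     = 20 holes in total
 * so an atom at z = 0.53 nm above h0 targets hole (int)(0.53*8) = 4.
 * If that hole is taken, the insertion loop below shifts entries up
 * until a free hole is found; the second pass then compacts the
 * occupied holes back into the index array a.
 */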
266 /* Sort particle index a on coordinates x along dim.
267 * Backwards tells if we want decreasing instead of increasing coordinates.
268 * h0 is the minimum of the coordinate range.
269 * invh is the 1/length of the sorting range.
270 * n_per_h (>=n) is the expected average number of particles per 1/invh
271 * sort is the sorting work array.
272 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
273 * or easier, allocate at least n*SGSF elements.
275 static void sort_atoms(int dim, gmx_bool Backwards,
276 int gmx_unused dd_zone,
277 int *a, int n, rvec *x,
278 real h0, real invh, int n_per_h,
279 int *sort)
281 if (n <= 1)
283 /* Nothing to do */
284 return;
287 #ifndef NDEBUG
288 if (n > n_per_h)
290 gmx_incons("n > n_per_h");
292 #endif
294 /* Transform the inverse range height into the inverse hole height */
295 invh *= n_per_h*SORT_GRID_OVERSIZE;
297 /* Set nsort to the maximum possible number of holes used.
298 * In the worst case all n elements end up in the last bin.
300 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
302 /* Determine the index range used, so we can limit it for the second pass */
303 int zi_min = INT_MAX;
304 int zi_max = -1;
306 /* Sort the particles using a simple index sort */
307 for (int i = 0; i < n; i++)
309 /* The cast takes care of floating-point rounding effects below zero.
310 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
311 * times the box height out of the box.
313 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
315 #ifndef NDEBUG
316 /* As we can have rounding effects, we use > instead of >= here */
317 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
319 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
320 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
321 n_per_h, SORT_GRID_OVERSIZE);
323 #endif
325 /* In a non-local domain, particles communicated for bonded interactions
326 * can be far beyond the grid size, which is set by the non-bonded
327 * cut-off distance. We sort such particles into the last cell.
329 if (zi > n_per_h*SORT_GRID_OVERSIZE)
331 zi = n_per_h*SORT_GRID_OVERSIZE;
334 /* Ideally this particle should go in sort cell zi,
335 * but that might already be in use,
336 * in that case find the first empty cell higher up
338 if (sort[zi] < 0)
340 sort[zi] = a[i];
341 zi_min = std::min(zi_min, zi);
342 zi_max = std::max(zi_max, zi);
344 else
346 /* We have multiple atoms in the same sorting slot.
347 * Sort on real z for minimal bounding box size.
348 * There is an extra check for identical z to ensure a
349 * well-defined output order, independent of input order,
350 * which gives binary reproducibility after restarts.
352 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
353 (x[a[i]][dim] == x[sort[zi]][dim] &&
354 a[i] > sort[zi])))
356 zi++;
359 if (sort[zi] >= 0)
361 /* Shift all elements by one slot until we find an empty slot */
362 int cp = sort[zi];
363 int zim = zi + 1;
364 while (sort[zim] >= 0)
366 int tmp = sort[zim];
367 sort[zim] = cp;
368 cp = tmp;
369 zim++;
371 sort[zim] = cp;
372 zi_max = std::max(zi_max, zim);
374 sort[zi] = a[i];
375 zi_max = std::max(zi_max, zi);
379 int c = 0;
380 if (!Backwards)
382 for (int zi = 0; zi < nsort; zi++)
384 if (sort[zi] >= 0)
386 a[c++] = sort[zi];
387 sort[zi] = -1;
391 else
393 for (int zi = zi_max; zi >= zi_min; zi--)
395 if (sort[zi] >= 0)
397 a[c++] = sort[zi];
398 sort[zi] = -1;
402 if (c < n)
404 gmx_incons("Lost particles while sorting");
408 #if GMX_DOUBLE
409 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
410 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
411 #else
412 #define R2F_D(x) (x)
413 #define R2F_U(x) (x)
414 #endif
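/* Rationale for the R2F macros above (a sketch, not from the original
 * comments): in double precision the coordinates are rounded to float
 * when stored in the bounding boxes. Scaling lower bounds down by
 * (1 - GMX_FLOAT_EPS) and upper bounds up by (1 + GMX_FLOAT_EPS),
 * with the sign handled separately, guarantees that the single-precision
 * box still encloses the double-precision coordinates instead of
 * possibly rounding past them.
 */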
416 /* Coordinate order x,y,z, bb order xyz0 */
417 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
419 int i;
420 real xl, xh, yl, yh, zl, zh;
422 i = 0;
423 xl = x[i+XX];
424 xh = x[i+XX];
425 yl = x[i+YY];
426 yh = x[i+YY];
427 zl = x[i+ZZ];
428 zh = x[i+ZZ];
429 i += stride;
430 for (int j = 1; j < na; j++)
432 xl = std::min(xl, x[i+XX]);
433 xh = std::max(xh, x[i+XX]);
434 yl = std::min(yl, x[i+YY]);
435 yh = std::max(yh, x[i+YY]);
436 zl = std::min(zl, x[i+ZZ]);
437 zh = std::max(zh, x[i+ZZ]);
438 i += stride;
440 /* Note: possible double to float conversion here */
441 bb->lower[BB_X] = R2F_D(xl);
442 bb->lower[BB_Y] = R2F_D(yl);
443 bb->lower[BB_Z] = R2F_D(zl);
444 bb->upper[BB_X] = R2F_U(xh);
445 bb->upper[BB_Y] = R2F_U(yh);
446 bb->upper[BB_Z] = R2F_U(zh);
449 /* Packed coordinates, bb order xyz0 */
450 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
452 real xl, xh, yl, yh, zl, zh;
454 xl = x[XX*c_packX4];
455 xh = x[XX*c_packX4];
456 yl = x[YY*c_packX4];
457 yh = x[YY*c_packX4];
458 zl = x[ZZ*c_packX4];
459 zh = x[ZZ*c_packX4];
460 for (int j = 1; j < na; j++)
462 xl = std::min(xl, x[j+XX*c_packX4]);
463 xh = std::max(xh, x[j+XX*c_packX4]);
464 yl = std::min(yl, x[j+YY*c_packX4]);
465 yh = std::max(yh, x[j+YY*c_packX4]);
466 zl = std::min(zl, x[j+ZZ*c_packX4]);
467 zh = std::max(zh, x[j+ZZ*c_packX4]);
469 /* Note: possible double to float conversion here */
470 bb->lower[BB_X] = R2F_D(xl);
471 bb->lower[BB_Y] = R2F_D(yl);
472 bb->lower[BB_Z] = R2F_D(zl);
473 bb->upper[BB_X] = R2F_U(xh);
474 bb->upper[BB_Y] = R2F_U(yh);
475 bb->upper[BB_Z] = R2F_U(zh);
478 /* Packed coordinates, bb order xyz0 */
479 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
481 real xl, xh, yl, yh, zl, zh;
483 xl = x[XX*c_packX8];
484 xh = x[XX*c_packX8];
485 yl = x[YY*c_packX8];
486 yh = x[YY*c_packX8];
487 zl = x[ZZ*c_packX8];
488 zh = x[ZZ*c_packX8];
489 for (int j = 1; j < na; j++)
491 xl = std::min(xl, x[j+XX*c_packX8]);
492 xh = std::max(xh, x[j+XX*c_packX8]);
493 yl = std::min(yl, x[j+YY*c_packX8]);
494 yh = std::max(yh, x[j+YY*c_packX8]);
495 zl = std::min(zl, x[j+ZZ*c_packX8]);
496 zh = std::max(zh, x[j+ZZ*c_packX8]);
498 /* Note: possible double to float conversion here */
499 bb->lower[BB_X] = R2F_D(xl);
500 bb->lower[BB_Y] = R2F_D(yl);
501 bb->lower[BB_Z] = R2F_D(zl);
502 bb->upper[BB_X] = R2F_U(xh);
503 bb->upper[BB_Y] = R2F_U(yh);
504 bb->upper[BB_Z] = R2F_U(zh);
507 /* Packed coordinates, bb order xyz0 */
508 static void calc_bounding_box_x_x4_halves(int na, const real *x,
509 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
511 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
512 using namespace gmx;
514 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
516 if (na > 2)
518 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
520 else
522 /* Set the "empty" bounding box to the same as the first one,
523 * so we don't need to treat special cases in the rest of the code.
525 #if NBNXN_SEARCH_BB_SIMD4
526 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
527 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
528 #else
529 bbj[1] = bbj[0];
530 #endif
533 #if NBNXN_SEARCH_BB_SIMD4
534 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
535 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
536 #else
538 int i;
540 for (i = 0; i < NNBSBB_C; i++)
542 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
543 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
546 #endif
549 #if NBNXN_SEARCH_BB_SIMD4
551 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
552 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
554 int i;
555 real xl, xh, yl, yh, zl, zh;
557 i = 0;
558 xl = x[i+XX];
559 xh = x[i+XX];
560 yl = x[i+YY];
561 yh = x[i+YY];
562 zl = x[i+ZZ];
563 zh = x[i+ZZ];
564 i += stride;
565 for (int j = 1; j < na; j++)
567 xl = std::min(xl, x[i+XX]);
568 xh = std::max(xh, x[i+XX]);
569 yl = std::min(yl, x[i+YY]);
570 yh = std::max(yh, x[i+YY]);
571 zl = std::min(zl, x[i+ZZ]);
572 zh = std::max(zh, x[i+ZZ]);
573 i += stride;
575 /* Note: possible double to float conversion here */
576 bb[0*STRIDE_PBB] = R2F_D(xl);
577 bb[1*STRIDE_PBB] = R2F_D(yl);
578 bb[2*STRIDE_PBB] = R2F_D(zl);
579 bb[3*STRIDE_PBB] = R2F_U(xh);
580 bb[4*STRIDE_PBB] = R2F_U(yh);
581 bb[5*STRIDE_PBB] = R2F_U(zh);
584 #endif /* NBNXN_SEARCH_BB_SIMD4 */
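/* Layout sketch for the packed bounding boxes above (assuming
 * STRIDE_PBB == 4): each group of 4 clusters stores
 *   bb[ 0.. 3] = x-lower of clusters 0..3
 *   bb[ 4.. 7] = y-lower,  bb[ 8..11] = z-lower
 *   bb[12..15] = x-upper,  bb[16..19] = y-upper,  bb[20..23] = z-upper
 * so one SIMD4 load fetches the same component of 4 clusters at once.
 */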
586 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
588 /* Coordinate order xyz?, bb order xyz0 */
589 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
591 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
592 using namespace gmx;
594 Simd4Float bb_0_S, bb_1_S;
595 Simd4Float x_S;
597 bb_0_S = load4(x);
598 bb_1_S = bb_0_S;
600 for (int i = 1; i < na; i++)
602 x_S = load4(x+i*NNBSBB_C);
603 bb_0_S = min(bb_0_S, x_S);
604 bb_1_S = max(bb_1_S, x_S);
607 store4(&bb->lower[0], bb_0_S);
608 store4(&bb->upper[0], bb_1_S);
611 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
612 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
613 nbnxn_bb_t *bb_work_aligned,
614 real *bb)
616 calc_bounding_box_simd4(na, x, bb_work_aligned);
618 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
619 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
620 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
621 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
622 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
623 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
626 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
629 /* Combines pairs of consecutive bounding boxes */
630 static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
632 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
633 using namespace gmx;
635 for (int i = 0; i < grid->ncx*grid->ncy; i++)
637 /* Starting bb in a column is expected to be 2-aligned */
638 int sc2 = grid->cxy_ind[i]>>1;
639 /* For odd numbers skip the last bb here */
640 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
641 int c2;
642 for (c2 = sc2; c2 < sc2+nc2; c2++)
644 #if NBNXN_SEARCH_BB_SIMD4
645 Simd4Float min_S, max_S;
647 min_S = min(load4(&bb[c2*2+0].lower[0]),
648 load4(&bb[c2*2+1].lower[0]));
649 max_S = max(load4(&bb[c2*2+0].upper[0]),
650 load4(&bb[c2*2+1].upper[0]));
651 store4(&grid->bbj[c2].lower[0], min_S);
652 store4(&grid->bbj[c2].upper[0], max_S);
653 #else
654 for (int j = 0; j < NNBSBB_C; j++)
656 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
657 bb[c2*2+1].lower[j]);
658 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
659 bb[c2*2+1].upper[j]);
661 #endif
663 if (((grid->cxy_na[i]+3)>>2) & 1)
665 /* The bb count in this column is odd: duplicate the last bb */
666 for (int j = 0; j < NNBSBB_C; j++)
668 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
669 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
676 /* Prints the average bb size, used for debug output */
677 static void print_bbsizes_simple(FILE *fp,
678 const nbnxn_grid_t *grid)
680 dvec ba;
682 clear_dvec(ba);
683 for (int c = 0; c < grid->nc; c++)
685 for (int d = 0; d < DIM; d++)
687 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
690 dsvmul(1.0/grid->nc, ba, ba);
692 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
693 grid->sx,
694 grid->sy,
695 grid->atom_density > 0 ?
696 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
697 ba[XX], ba[YY], ba[ZZ],
698 ba[XX]/grid->sx,
699 ba[YY]/grid->sy,
700 grid->atom_density > 0 ?
701 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
704 /* Prints the average bb size, used for debug output */
705 static void print_bbsizes_supersub(FILE *fp,
706 const nbnxn_grid_t *grid)
708 int ns;
709 dvec ba;
711 clear_dvec(ba);
712 ns = 0;
713 for (int c = 0; c < grid->nc; c++)
715 #if NBNXN_BBXXXX
716 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
718 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
719 for (int i = 0; i < STRIDE_PBB; i++)
721 for (int d = 0; d < DIM; d++)
723 ba[d] +=
724 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
725 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
729 #else
730 for (int s = 0; s < grid->nsubc[c]; s++)
732 int cs = c*c_gpuNumClusterPerCell + s;
733 for (int d = 0; d < DIM; d++)
735 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
738 #endif
739 ns += grid->nsubc[c];
741 dsvmul(1.0/ns, ba, ba);
743 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
744 grid->sx/c_gpuNumClusterPerCellX,
745 grid->sy/c_gpuNumClusterPerCellY,
746 grid->atom_density > 0 ?
747 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
748 ba[XX], ba[YY], ba[ZZ],
749 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
750 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
751 grid->atom_density > 0 ?
752 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
755 /* Set non-bonded interaction flags for the current cluster.
756 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
758 static void sort_cluster_on_flag(int natoms_cluster,
759 int a0, int a1, const int *atinfo,
760 int *order,
761 int *flags)
763 const int natoms_cluster_max = 8;
764 int sort1[natoms_cluster_max];
765 int sort2[natoms_cluster_max];
767 assert(natoms_cluster <= natoms_cluster_max);
769 *flags = 0;
771 int subc = 0;
772 for (int s = a0; s < a1; s += natoms_cluster)
774 /* Make lists for this (sub-)cell on atoms with and without LJ */
775 int n1 = 0;
776 int n2 = 0;
777 gmx_bool haveQ = FALSE;
778 int a_lj_max = -1;
779 for (int a = s; a < std::min(s + natoms_cluster, a1); a++)
781 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
783 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
785 sort1[n1++] = order[a];
786 a_lj_max = a;
788 else
790 sort2[n2++] = order[a];
794 /* If we don't have atoms with LJ, there's nothing to sort */
795 if (n1 > 0)
797 *flags |= NBNXN_CI_DO_LJ(subc);
799 if (2*n1 <= natoms_cluster)
801 /* Only sort when strictly necessary. Ordering particles
802 * can lead to less accurate summation due to rounding,
803 * both for LJ and Coulomb interactions.
805 if (2*(a_lj_max - s) >= natoms_cluster)
807 for (int i = 0; i < n1; i++)
809 order[a0 + i] = sort1[i];
811 for (int j = 0; j < n2; j++)
813 order[a0 + n1 + j] = sort2[j];
817 *flags |= NBNXN_CI_HALF_LJ(subc);
820 if (haveQ)
822 *flags |= NBNXN_CI_DO_COUL(subc);
824 subc++;
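/* Example of the half-LJ optimization above (illustrative numbers):
 * for a cluster of 8 atoms of which 3 have LJ parameters, 2*n1 = 6 <= 8,
 * so the LJ atoms are moved to the front (unless they already sit in the
 * first half) and NBNXN_CI_HALF_LJ is set, allowing the kernels to skip
 * LJ interactions for the second half of the cluster.
 */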
828 /* Fill a pair search cell with atoms.
829 * Potentially sorts atoms and sets the interaction flags.
831 static void fill_cell(const nbnxn_search_t nbs,
832 nbnxn_grid_t *grid,
833 nbnxn_atomdata_t *nbat,
834 int a0, int a1,
835 const int *atinfo,
836 const rvec *x,
837 nbnxn_bb_t gmx_unused *bb_work_aligned)
839 int na, a;
840 size_t offset;
841 nbnxn_bb_t *bb_ptr;
842 #if NBNXN_BBXXXX
843 float *pbb_ptr;
844 #endif
846 na = a1 - a0;
848 if (grid->bSimple)
850 /* Note that non-local grids are already sorted.
851 * Then sort_cluster_on_flag will only set the flags and the sorting
852 * will not affect the atom order.
854 sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
855 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
858 if (nbs->bFEP)
860 /* Set the fep flag for perturbed atoms in this (sub-)cell */
861 int c;
863 /* The grid-local cluster/(sub-)cell index */
864 c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
865 grid->fep[c] = 0;
866 for (int at = a0; at < a1; at++)
868 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
870 grid->fep[c] |= (1 << (at - a0));
875 /* Now we have sorted the atoms, set the cell indices */
876 for (a = a0; a < a1; a++)
878 nbs->cell[nbs->a[a]] = a;
881 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
882 nbat->XFormat, nbat->x, a0);
884 if (nbat->XFormat == nbatX4)
886 /* Store the bounding boxes as xyz.xyz. */
887 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
888 bb_ptr = grid->bb + offset;
890 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
891 if (2*grid->na_cj == grid->na_c)
893 calc_bounding_box_x_x4_halves(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr,
894 grid->bbj+offset*2);
896 else
897 #endif
899 calc_bounding_box_x_x4(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr);
902 else if (nbat->XFormat == nbatX8)
904 /* Store the bounding boxes as xyz.xyz. */
905 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
906 bb_ptr = grid->bb + offset;
908 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(a0), bb_ptr);
910 #if NBNXN_BBXXXX
911 else if (!grid->bSimple)
913 /* Store the bounding boxes in a format convenient
914 * for SIMD4 calculations: xxxxyyyyzzzz...
916 pbb_ptr =
917 grid->pbb +
918 ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
919 (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
921 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
922 if (nbat->XFormat == nbatXYZQ)
924 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
925 bb_work_aligned, pbb_ptr);
927 else
928 #endif
930 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
931 pbb_ptr);
933 if (gmx_debug_at)
935 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
936 a0 >> grid->na_c_2log,
937 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
938 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
939 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
942 #endif
943 else
945 /* Store the bounding boxes as xyz.xyz. */
946 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
948 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
949 bb_ptr);
951 if (gmx_debug_at)
953 int bbo;
954 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
955 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
956 a0 >> grid->na_c_2log,
957 grid->bb[bbo].lower[BB_X],
958 grid->bb[bbo].lower[BB_Y],
959 grid->bb[bbo].lower[BB_Z],
960 grid->bb[bbo].upper[BB_X],
961 grid->bb[bbo].upper[BB_Y],
962 grid->bb[bbo].upper[BB_Z]);
967 /* Spatially sort the atoms within one grid column */
968 static void sort_columns_simple(const nbnxn_search_t nbs,
969 int dd_zone,
970 nbnxn_grid_t *grid,
971 int a0, int a1,
972 const int *atinfo,
973 rvec *x,
974 nbnxn_atomdata_t *nbat,
975 int cxy_start, int cxy_end,
976 int *sort_work)
978 int cfilled, c;
980 if (debug)
982 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
983 grid->cell0, cxy_start, cxy_end, a0, a1);
986 /* Sort the atoms within each x,y column in 3 dimensions */
987 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
989 int na = grid->cxy_na[cxy];
990 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
991 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
993 /* Sort the atoms within each x,y column on z coordinate */
994 sort_atoms(ZZ, FALSE, dd_zone,
995 nbs->a+ash, na, x,
996 grid->c0[ZZ],
997 1.0/grid->size[ZZ], ncz*grid->na_sc,
998 sort_work);
1000 /* Fill the ncz cells in this column */
1001 cfilled = grid->cxy_ind[cxy];
1002 for (int cz = 0; cz < ncz; cz++)
1004 c = grid->cxy_ind[cxy] + cz;
1006 int ash_c = ash + cz*grid->na_sc;
1007 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
1009 fill_cell(nbs, grid, nbat,
1010 ash_c, ash_c+na_c, atinfo, x,
1011 nullptr);
1013 /* This copy to bbcz is not really necessary.
1014 * But it allows us to use the same grid search code
1015 * for the simple and supersub cell setups.
1017 if (na_c > 0)
1019 cfilled = c;
1021 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
1022 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1025 /* Set the unused atom indices to -1 */
1026 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1028 nbs->a[ash+ind] = -1;
1033 /* Spatially sort the atoms within one grid column */
1034 static void sort_columns_supersub(const nbnxn_search_t nbs,
1035 int dd_zone,
1036 nbnxn_grid_t *grid,
1037 int a0, int a1,
1038 const int *atinfo,
1039 rvec *x,
1040 nbnxn_atomdata_t *nbat,
1041 int cxy_start, int cxy_end,
1042 int *sort_work)
1044 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1046 bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));
1048 if (debug)
1050 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1051 grid->cell0, cxy_start, cxy_end, a0, a1);
1054 int subdiv_x = grid->na_c;
1055 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1056 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1058 /* Sort the atoms within each x,y column in 3 dimensions */
1059 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1061 int cx = cxy/grid->ncy;
1062 int cy = cxy - cx*grid->ncy;
1064 int na = grid->cxy_na[cxy];
1065 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1066 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1068 /* Sort the atoms within each x,y column on z coordinate */
1069 sort_atoms(ZZ, FALSE, dd_zone,
1070 nbs->a + ash, na, x,
1071 grid->c0[ZZ],
1072 1.0/grid->size[ZZ], ncz*grid->na_sc,
1073 sort_work);
1075 /* This loop goes over the supercells and subcells along z at once */
1076 for (int sub_z = 0; sub_z < ncz*c_gpuNumClusterPerCellZ; sub_z++)
1078 int ash_z = ash + sub_z*subdiv_z;
1079 int na_z = std::min(subdiv_z, na - (ash_z - ash));
1080 int cz = -1;
1081 /* We have already sorted on z */
1083 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1085 cz = sub_z/c_gpuNumClusterPerCellZ;
1086 int c = grid->cxy_ind[cxy] + cz;
1088 /* The number of atoms in this supercell */
1089 int na_c = std::min(grid->na_sc, na - (ash_z - ash));
1091 grid->nsubc[c] = std::min(c_gpuNumClusterPerCell, (na_c + grid->na_c - 1)/grid->na_c);
1093 /* Store the z-boundaries of the super cell */
1094 grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1095 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z + na_c - 1]][ZZ];
1098 if (c_gpuNumClusterPerCellY > 1)
1100 /* Sort the atoms along y */
1101 sort_atoms(YY, (sub_z & 1), dd_zone,
1102 nbs->a+ash_z, na_z, x,
1103 grid->c0[YY] + cy*grid->sy,
1104 grid->inv_sy, subdiv_z,
1105 sort_work);
1108 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1110 int ash_y = ash_z + sub_y*subdiv_y;
1111 int na_y = std::min(subdiv_y, na - (ash_y - ash));
1113 if (c_gpuNumClusterPerCellX > 1)
1115 /* Sort the atoms along x */
1116 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1117 nbs->a + ash_y, na_y, x,
1118 grid->c0[XX] + cx*grid->sx,
1119 grid->inv_sx, subdiv_y,
1120 sort_work);
1123 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1125 int ash_x = ash_y + sub_x*subdiv_x;
1126 int na_x = std::min(subdiv_x, na - (ash_x - ash));
1128 fill_cell(nbs, grid, nbat,
1129 ash_x, ash_x + na_x, atinfo, x,
1130 bb_work_aligned);
1135 /* Set the unused atom indices to -1 */
1136 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1138 nbs->a[ash+ind] = -1;
1143 /* Determine in which grid column atoms should go */
1144 static void calc_column_indices(nbnxn_grid_t *grid,
1145 int a0, int a1,
1146 rvec *x,
1147 int dd_zone, const int *move,
1148 int thread, int nthread,
1149 int *cell,
1150 int *cxy_na)
1152 /* We add one extra cell for particles which moved during DD */
1153 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1155 cxy_na[i] = 0;
1158 int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
1159 int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;
1160 if (dd_zone == 0)
1162 /* Home zone */
1163 for (int i = n0; i < n1; i++)
1165 if (move == nullptr || move[i] >= 0)
1167 /* We need to be careful with rounding,
1168 * particles might be a few bits outside the local zone.
1169 * The int cast takes care of the lower bound,
1170 * we will explicitly take care of the upper bound.
1172 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1173 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1175 #ifndef NDEBUG
1176 if (cx < 0 || cx > grid->ncx ||
1177 cy < 0 || cy > grid->ncy)
1179 gmx_fatal(FARGS,
1180 "grid cell cx %d cy %d out of range (max %d %d)\n"
1181 "atom %f %f %f, grid->c0 %f %f",
1182 cx, cy, grid->ncx, grid->ncy,
1183 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1185 #endif
1186 /* Take care of potential rounding issues */
1187 cx = std::min(cx, grid->ncx - 1);
1188 cy = std::min(cy, grid->ncy - 1);
1190 /* For the moment cell will contain only the grid-local
1191 * x and y indices, not z.
1193 cell[i] = cx*grid->ncy + cy;
1195 else
1197 /* Put this moved particle after the end of the grid,
1198 * so we can process it later without using conditionals.
1200 cell[i] = grid->ncx*grid->ncy;
1203 cxy_na[cell[i]]++;
1206 else
1208 /* Non-home zone */
1209 for (int i = n0; i < n1; i++)
1211 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1212 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1214 /* For non-home zones there could be particles outside
1215 * the non-bonded cut-off range, which have been communicated
1216 * for bonded interactions only. For the result it doesn't
1217 * matter where these end up on the grid. For performance
1218 * we put them in an extra row at the border.
1220 cx = std::max(cx, 0);
1221 cx = std::min(cx, grid->ncx - 1);
1222 cy = std::max(cy, 0);
1223 cy = std::min(cy, grid->ncy - 1);
1225 /* For the moment cell will contain only the grid-local
1226 * x and y indices, not z.
1228 cell[i] = cx*grid->ncy + cy;
1230 cxy_na[cell[i]]++;
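/* Worked example of the column mapping above (illustrative numbers):
 * with grid->c0[XX] = grid->c0[YY] = 0, inv_sx = inv_sy = 2.0
 * (0.5 nm columns) and ncy = 10, an atom at x = 1.3 nm, y = 0.7 nm
 * gets cx = (int)(1.3*2) = 2, cy = (int)(0.7*2) = 1 and thus
 * cell[i] = cx*ncy + cy = 21.
 */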
1235 /* Determine in which grid cells the atoms should go */
1236 static void calc_cell_indices(const nbnxn_search_t nbs,
1237 int dd_zone,
1238 nbnxn_grid_t *grid,
1239 int a0, int a1,
1240 const int *atinfo,
1241 rvec *x,
1242 const int *move,
1243 nbnxn_atomdata_t *nbat)
1245 int n0, n1;
1246 int cx, cy, cxy, ncz_max, ncz;
1247 int nthread;
1248 int cxy_na_i;
1250 nthread = gmx_omp_nthreads_get(emntPairsearch);
1252 #pragma omp parallel for num_threads(nthread) schedule(static)
1253 for (int thread = 0; thread < nthread; thread++)
1257 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1258 nbs->cell, nbs->work[thread].cxy_na);
1260 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1263 /* Make the cell index as a function of x and y */
1264 ncz_max = 0;
1265 ncz = 0;
1266 grid->cxy_ind[0] = 0;
1267 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1269 /* We set ncz_max at the beginning of the loop instead of at the end
1270 * to skip i=grid->ncx*grid->ncy, which holds moved particles
1271 * that do not need to be ordered on the grid.
1273 if (ncz > ncz_max)
1275 ncz_max = ncz;
1277 cxy_na_i = nbs->work[0].cxy_na[i];
1278 for (int thread = 1; thread < nthread; thread++)
1280 cxy_na_i += nbs->work[thread].cxy_na[i];
1282 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1283 if (nbat->XFormat == nbatX8)
1285 /* Make the number of cells a multiple of 2 */
1286 ncz = (ncz + 1) & ~1;
1288 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1289 /* Clear cxy_na, so we can reuse the array below */
1290 grid->cxy_na[i] = 0;
1292 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1294 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1296 if (debug)
1298 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1299 grid->na_sc, grid->na_c, grid->nc,
1300 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1301 ncz_max);
1302 if (gmx_debug_at)
1304 int i = 0;
1305 for (cy = 0; cy < grid->ncy; cy++)
1307 for (cx = 0; cx < grid->ncx; cx++)
1309 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1310 i++;
1312 fprintf(debug, "\n");
1317 /* Make sure the work array for sorting is large enough */
1318 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1320 for (int thread = 0; thread < nbs->nthread_max; thread++)
1322 nbs->work[thread].sort_work_nalloc =
1323 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1324 srenew(nbs->work[thread].sort_work,
1325 nbs->work[thread].sort_work_nalloc);
1326 /* When not in use, all elements should be -1 */
1327 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1329 nbs->work[thread].sort_work[i] = -1;
1334 /* Now that we know the dimensions we can fill the grid.
1335 * This is the first, unsorted fill. We sort the columns after this.
1337 for (int i = a0; i < a1; i++)
1339 /* At this point nbs->cell contains the local grid x,y indices */
1340 cxy = nbs->cell[i];
1341 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1344 if (dd_zone == 0)
1346 /* Set the cell indices for the moved particles */
1347 n0 = grid->nc*grid->na_sc;
1348 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1349 if (dd_zone == 0)
1351 for (int i = n0; i < n1; i++)
1353 nbs->cell[nbs->a[i]] = i;
1358 /* Sort the super-cell columns along z into the sub-cells. */
1359 #pragma omp parallel for num_threads(nthread) schedule(static)
1360 for (int thread = 0; thread < nthread; thread++)
1364 if (grid->bSimple)
1366 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1367 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1368 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1369 nbs->work[thread].sort_work);
1371 else
1373 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1374 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1375 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1376 nbs->work[thread].sort_work);
1379 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1382 if (grid->bSimple && nbat->XFormat == nbatX8)
1384 combine_bounding_box_pairs(grid, grid->bb);
1387 if (!grid->bSimple)
1389 grid->nsubc_tot = 0;
1390 for (int i = 0; i < grid->nc; i++)
1392 grid->nsubc_tot += grid->nsubc[i];
1396 if (debug)
1398 if (grid->bSimple)
1400 print_bbsizes_simple(debug, grid);
1402 else
1404 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1405 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1407 print_bbsizes_supersub(debug, grid);
1412 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1413 int natoms)
1415 flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
1416 if (flags->nflag > flags->flag_nalloc)
1418 flags->flag_nalloc = over_alloc_large(flags->nflag);
1419 srenew(flags->flag, flags->flag_nalloc);
1421 for (int b = 0; b < flags->nflag; b++)
1423 bitmask_clear(&(flags->flag[b]));
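/* The nflag computation above is a ceiling division: assuming, say,
 * NBNXN_BUFFERFLAG_SIZE = 16, natoms = 1000 gives (1000 + 15)/16 = 63
 * flag blocks, each marking whether a block of 16 atoms in the force
 * output buffers was touched by a thread.
 */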
1427 /* Sets up a grid and puts the atoms on the grid.
1428 * This function only operates on one domain of the domain decomposition.
1429 * Note that without domain decomposition there is only one domain.
1431 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1432 int ePBC, matrix box,
1433 int dd_zone,
1434 rvec corner0, rvec corner1,
1435 int a0, int a1,
1436 real atom_density,
1437 const int *atinfo,
1438 rvec *x,
1439 int nmoved, int *move,
1440 int nb_kernel_type,
1441 nbnxn_atomdata_t *nbat)
1443 nbnxn_grid_t *grid;
1444 int n;
1445 int nc_max_grid, nc_max;
1447 grid = &nbs->grid[dd_zone];
1449 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1451 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1453 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1454 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1455 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1456 grid->na_c_2log = get_2log(grid->na_c);
1458 nbat->na_c = grid->na_c;
1460 if (dd_zone == 0)
1462 grid->cell0 = 0;
1464 else
1466 grid->cell0 =
1467 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1468 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1471 n = a1 - a0;
1473 if (dd_zone == 0)
1475 nbs->ePBC = ePBC;
1476 copy_mat(box, nbs->box);
1478 /* Avoid zero density */
1479 if (atom_density > 0)
1481 grid->atom_density = atom_density;
1483 else
1485 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1488 grid->cell0 = 0;
1490 nbs->natoms_local = a1 - nmoved;
1491 /* We assume that nbnxn_put_on_grid is called first
1492 * for the local atoms (dd_zone=0).
1494 nbs->natoms_nonlocal = a1 - nmoved;
1496 if (debug)
1498 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1499 nbs->natoms_local, grid->atom_density);
1502 else
1504 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
1507 /* We always use the home zone (grid[0]) for setting the cell size,
1508 * since determining densities for non-local zones is difficult.
1510 nc_max_grid = set_grid_size_xy(nbs, grid,
1511 dd_zone, n-nmoved, corner0, corner1,
1512 nbs->grid[0].atom_density);
1514 nc_max = grid->cell0 + nc_max_grid;
1516 if (a1 > nbs->cell_nalloc)
1518 nbs->cell_nalloc = over_alloc_large(a1);
1519 srenew(nbs->cell, nbs->cell_nalloc);
1522 /* To avoid conditionals we store the moved particles at the end of a,
1523 * make sure we have enough space.
1525 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1527 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1528 srenew(nbs->a, nbs->a_nalloc);
1531 /* We need padding up to a multiple of the buffer flag size: simply add */
1532 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1534 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1537 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1539 if (dd_zone == 0)
1541 nbat->natoms_local = nbat->natoms;
1544 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1547 /* Calls nbnxn_put_on_grid for all non-local domains */
1548 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1549 const struct gmx_domdec_zones_t *zones,
1550 const int *atinfo,
1551 rvec *x,
1552 int nb_kernel_type,
1553 nbnxn_atomdata_t *nbat)
1555 rvec c0, c1;
1557 for (int zone = 1; zone < zones->n; zone++)
1559 for (int d = 0; d < DIM; d++)
1561 c0[d] = zones->size[zone].bb_x0[d];
1562 c1[d] = zones->size[zone].bb_x1[d];
1565 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1566 zone, c0, c1,
1567 zones->cg_range[zone],
1568 zones->cg_range[zone+1],
1570 atinfo,
1572 0, nullptr,
1573 nb_kernel_type,
1574 nbat);
1578 /* Add simple grid type information to the local super/sub grid */
1579 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1580 nbnxn_atomdata_t *nbat)
1582 nbnxn_grid_t *grid;
1583 float *bbcz;
1584 nbnxn_bb_t *bb;
1585 int ncd;
1587 grid = &nbs->grid[0];
1589 if (grid->bSimple)
1591 gmx_incons("nbnxn_grid_add_simple called with a simple grid");
1594 ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1596 if (grid->nc*ncd > grid->nc_nalloc_simple)
1598 grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1599 srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
1600 srenew(grid->bb_simple, grid->nc_nalloc_simple);
1601 srenew(grid->flags_simple, grid->nc_nalloc_simple);
1602 if (nbat->XFormat)
1604 sfree_aligned(grid->bbj);
1605 snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
1609 bbcz = grid->bbcz_simple;
1610 bb = grid->bb_simple;
1612 #if GMX_OPENMP && !(defined __clang_analyzer__)
1613 // cppcheck-suppress unreadVariable
1614 int nthreads = gmx_omp_nthreads_get(emntPairsearch);
1615 #endif
1617 #pragma omp parallel for num_threads(nthreads) schedule(static)
1618 for (int sc = 0; sc < grid->nc; sc++)
1622 for (int c = 0; c < ncd; c++)
1624 int tx = sc*ncd + c;
1625 int na = NBNXN_CPU_CLUSTER_I_SIZE;
1626 while (na > 0 &&
1627 nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1629 na--;
1632 if (na > 0)
1634 switch (nbat->XFormat)
1636 case nbatX4:
1637 /* c_packX4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1638 calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
1639 bb+tx);
1640 break;
1641 case nbatX8:
1642 /* c_packX8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1643 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1644 bb+tx);
1645 break;
1646 default:
1647 calc_bounding_box(na, nbat->xstride,
1648 nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1649 bb+tx);
1650 break;
1652 bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
1653 bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];
1655 /* No interaction optimization yet here */
1656 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1658 else
1660 grid->flags_simple[tx] = 0;
1664 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1667 if (grid->bSimple && nbat->XFormat == nbatX8)
1669 combine_bounding_box_pairs(grid, grid->bb_simple);
1673 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1675 *ncx = nbs->grid[0].ncx;
1676 *ncy = nbs->grid[0].ncy;
1679 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1681 const nbnxn_grid_t *grid;
1683 grid = &nbs->grid[0];
1685 /* Return the atom order for the home cell (index 0) */
1686 *a = nbs->a;
1688 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
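/* Note: nbs->a maps a grid slot to an atom index while nbs->cell maps
 * an atom index back to its grid slot, i.e. they are inverse mappings,
 * with unused padding slots in a set to -1.
 */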
1691 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1693 /* Set the atom order for the home cell (index 0) */
1694 nbnxn_grid_t *grid = &nbs->grid[0];
1696 int ao = 0;
1697 for (int cx = 0; cx < grid->ncx; cx++)
1699 for (int cy = 0; cy < grid->ncy; cy++)
1701 int cxy = cx*grid->ncy + cy;
1702 int j = grid->cxy_ind[cxy]*grid->na_sc;
1703 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1705 nbs->a[j] = ao;
1706 nbs->cell[ao] = j;
1707 ao++;
1708 j++;