src/gromacs/mdlib/nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include <assert.h>
41 #include <string.h>
43 #include <cmath>
45 #include <algorithm>
47 #include "gromacs/domdec/domdec_struct.h"
48 #include "gromacs/math/utilities.h"
49 #include "gromacs/math/vec.h"
50 #include "gromacs/mdlib/gmx_omp_nthreads.h"
51 #include "gromacs/mdlib/nb_verlet.h"
52 #include "gromacs/mdlib/nbnxn_atomdata.h"
53 #include "gromacs/mdlib/nbnxn_consts.h"
54 #include "gromacs/mdlib/nbnxn_internal.h"
55 #include "gromacs/mdlib/nbnxn_search.h"
56 #include "gromacs/mdlib/nbnxn_util.h"
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/simd/vector_operations.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/smalloc.h"
62 struct gmx_domdec_zones_t;
64 static void nbnxn_grid_init(nbnxn_grid_t * grid)
66 grid->cxy_na = nullptr;
67 grid->cxy_ind = nullptr;
68 grid->cxy_nalloc = 0;
69 grid->bb = nullptr;
70 grid->bbj = nullptr;
71 grid->nc_nalloc = 0;
74 void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
76 int g;
78 nbs->ngrid = ngrid;
80 snew(nbs->grid, nbs->ngrid);
81 for (g = 0; g < nbs->ngrid; g++)
83 nbnxn_grid_init(&nbs->grid[g]);
87 static real grid_atom_density(int n, rvec corner0, rvec corner1)
89 rvec size;
91 if (n == 0)
93 /* To avoid zero density we use a minimum of 1 atom */
94 n = 1;
97 rvec_sub(corner1, corner0, size);
99 return n/(size[XX]*size[YY]*size[ZZ]);
102 static int set_grid_size_xy(const nbnxn_search_t nbs,
103 nbnxn_grid_t *grid,
104 int dd_zone,
105 int n, rvec corner0, rvec corner1,
106 real atom_density)
108 rvec size;
109 int na_c;
110 int nc_max;
111 real tlen, tlen_x, tlen_y;
113 rvec_sub(corner1, corner0, size);
115 if (n > grid->na_sc)
117 assert(atom_density > 0);
119 /* target cell length */
120 if (grid->bSimple)
122 /* To minimize the number of zero interactions, we should make
123 * the largest of the i/j cells cubic.
125 na_c = std::max(grid->na_c, grid->na_cj);
127 /* Approximately cubic cells */
128 tlen = std::cbrt(na_c/atom_density);
129 tlen_x = tlen;
130 tlen_y = tlen;
132 else
134 /* Approximately cubic sub cells */
135 tlen = std::cbrt(grid->na_c/atom_density);
136 tlen_x = tlen*c_gpuNumClusterPerCellX;
137 tlen_y = tlen*c_gpuNumClusterPerCellY;
139 /* We round ncx and ncy down, because we get fewer cell pairs
140 * in the pair list when the fixed cell dimensions (x,y) are
141 * larger than the variable one (z) than when it is the other way around.
143 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
144 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
146 else
148 grid->ncx = 1;
149 grid->ncy = 1;
152 grid->sx = size[XX]/grid->ncx;
153 grid->sy = size[YY]/grid->ncy;
154 grid->inv_sx = 1/grid->sx;
155 grid->inv_sy = 1/grid->sy;
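/* Worked example (illustrative only, values assumed): for a simple grid with
 * cluster sizes na_c = na_cj = 4 and an atom density of 100 nm^-3, the target
 * cell length is tlen = cbrt(4/100) nm ~= 0.34 nm; since ncx and ncy are
 * rounded down, the actual column sizes sx and sy come out slightly larger
 * than tlen.
 */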
157 if (dd_zone > 0)
159 /* This is a non-home zone, add an extra row of cells
160 * for particles communicated for bonded interactions.
161 * These can be beyond the cut-off. It doesn't matter where
162 * they end up on the grid, but for performance it's better
163 * if they don't end up in cells that can be within cut-off range.
165 grid->ncx++;
166 grid->ncy++;
169 /* We need one additional cell entry for particles moved by DD */
170 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
172 grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
173 srenew(grid->cxy_na, grid->cxy_nalloc);
174 srenew(grid->cxy_ind, grid->cxy_nalloc+1);
176 for (int t = 0; t < nbs->nthread_max; t++)
178 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
180 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
181 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
185 /* Worst case scenario of 1 atom in each last cell */
186 if (grid->na_cj <= grid->na_c)
188 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
190 else
192 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
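/* In other words: the atoms themselves need about n/na_sc cells, and in the
 * worst case each of the ncx*ncy columns adds one almost-empty cell at its
 * top, hence the extra ncx*ncy term (scaled by na_cj/na_c when j-clusters
 * are larger than i-clusters).
 */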
195 if (nc_max > grid->nc_nalloc)
197 grid->nc_nalloc = over_alloc_large(nc_max);
198 srenew(grid->nsubc, grid->nc_nalloc);
199 srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
201 sfree_aligned(grid->bb);
202 /* This snew also zeros the contents, which avoids possible
203 * floating-point exceptions in SIMD with the unused bb elements.
205 if (grid->bSimple)
207 snew_aligned(grid->bb, grid->nc_nalloc, 16);
209 else
211 #if NBNXN_BBXXXX
212 int pbb_nalloc;
214 pbb_nalloc = grid->nc_nalloc*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX;
215 snew_aligned(grid->pbb, pbb_nalloc, 16);
216 #else
217 snew_aligned(grid->bb, grid->nc_nalloc*c_gpuNumClusterPerCell, 16);
218 #endif
221 if (grid->bSimple)
223 if (grid->na_cj == grid->na_c)
225 grid->bbj = grid->bb;
227 else
229 sfree_aligned(grid->bbj);
230 snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
234 srenew(grid->flags, grid->nc_nalloc);
235 if (nbs->bFEP)
237 srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
241 copy_rvec(corner0, grid->c0);
242 copy_rvec(corner1, grid->c1);
243 copy_rvec(size, grid->size);
245 return nc_max;
248 /* We need to sort particles in grid columns on the z-coordinate.
249 * As particles are very often distributed homogeneously, we use a sorting
250 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
251 * by a factor, cast to an int and try to store in that hole. If the hole
252 * is full, we move this or another particle. A second pass is needed to make
253 * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
254 * 4 is the optimal value for a homogeneous particle distribution and allows
255 * for an O(#particles) sort up to distributions where all particles are
256 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
257 * as it can be expensive to detect inhomogeneous particle distributions.
258 * SGSF is the maximum ratio of holes used; in the worst case all particles
259 * end up in the last hole and we need #particles extra holes at the end.
261 #define SORT_GRID_OVERSIZE 4
262 #define SGSF (SORT_GRID_OVERSIZE + 1)
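/* A minimal sketch (illustrative only, not part of the build) of the hole
 * index computation used in sort_atoms() below; z is a particle coordinate,
 * h0 the lower edge of the sorting range and invh its inverse height:
 *
 *   invh  *= n_per_h*SORT_GRID_OVERSIZE;                // inverse hole height
 *   int zi = static_cast<int>((z - h0)*invh);           // target hole
 *   zi     = std::min(zi, n_per_h*SORT_GRID_OVERSIZE);  // clamp far-away atoms
 *
 * With SORT_GRID_OVERSIZE = 4 there are about 4 holes per particle, so for a
 * homogeneous distribution collisions are rare and the sort stays O(#particles).
 */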
264 /* Sort the particle indices a on the coordinates x along dim.
265 * Backwards tells if we want decreasing instead of increasing coordinates.
266 * h0 is the minimum of the coordinate range.
267 * invh is the 1/length of the sorting range.
268 * n_per_h (>=n) is the expected average number of particles per 1/invh
269 * sort is the sorting work array.
270 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
271 * or easier, allocate at least n*SGSF elements.
273 static void sort_atoms(int dim, gmx_bool Backwards,
274 int gmx_unused dd_zone,
275 int *a, int n, rvec *x,
276 real h0, real invh, int n_per_h,
277 int *sort)
279 if (n <= 1)
281 /* Nothing to do */
282 return;
285 #ifndef NDEBUG
286 if (n > n_per_h)
288 gmx_incons("n > n_per_h");
290 #endif
292 /* Transform the inverse range height into the inverse hole height */
293 invh *= n_per_h*SORT_GRID_OVERSIZE;
295 /* Set nsort to the maximum possible number of holes used.
296 * In the worst case all n elements end up in the last bin.
298 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
300 /* Determine the index range used, so we can limit it for the second pass */
301 int zi_min = INT_MAX;
302 int zi_max = -1;
304 /* Sort the particles using a simple index sort */
305 for (int i = 0; i < n; i++)
307 /* The cast takes care of floating-point rounding effects below zero.
308 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
309 * times the box height out of the box.
311 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
313 #ifndef NDEBUG
314 /* As we can have rounding effects, we use > instead of >= here */
315 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
317 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
318 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
319 n_per_h, SORT_GRID_OVERSIZE);
321 #endif
323 /* In a non-local domain, particles communicated for bonded interactions
324 * can be far beyond the grid size, which is set by the non-bonded
325 * cut-off distance. We sort such particles into the last cell.
327 if (zi > n_per_h*SORT_GRID_OVERSIZE)
329 zi = n_per_h*SORT_GRID_OVERSIZE;
332 /* Ideally this particle should go in sort cell zi,
333 * but that slot might already be in use;
334 * in that case find the first empty cell higher up.
336 if (sort[zi] < 0)
338 sort[zi] = a[i];
339 zi_min = std::min(zi_min, zi);
340 zi_max = std::max(zi_max, zi);
342 else
344 /* We have multiple atoms in the same sorting slot.
345 * Sort on real z for minimal bounding box size.
346 * There is an extra check for identical z to give a
347 * well-defined output order, independent of input order,
348 * which ensures binary reproducibility after restarts.
350 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
351 (x[a[i]][dim] == x[sort[zi]][dim] &&
352 a[i] > sort[zi])))
354 zi++;
357 if (sort[zi] >= 0)
359 /* Shift all elements by one slot until we find an empty slot */
360 int cp = sort[zi];
361 int zim = zi + 1;
362 while (sort[zim] >= 0)
364 int tmp = sort[zim];
365 sort[zim] = cp;
366 cp = tmp;
367 zim++;
369 sort[zim] = cp;
370 zi_max = std::max(zi_max, zim);
372 sort[zi] = a[i];
373 zi_max = std::max(zi_max, zi);
377 int c = 0;
378 if (!Backwards)
380 for (int zi = 0; zi < nsort; zi++)
382 if (sort[zi] >= 0)
384 a[c++] = sort[zi];
385 sort[zi] = -1;
389 else
391 for (int zi = zi_max; zi >= zi_min; zi--)
393 if (sort[zi] >= 0)
395 a[c++] = sort[zi];
396 sort[zi] = -1;
400 if (c < n)
402 gmx_incons("Lost particles while sorting");
406 #if GMX_DOUBLE
407 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
408 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
409 #else
410 #define R2F_D(x) (x)
411 #define R2F_U(x) (x)
412 #endif
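/* Descriptive note: the R2F macros above convert a double-precision bound to
 * float such that the float bounding box always contains the double one.
 * Lower bounds are nudged towards -infinity and upper bounds towards
 * +infinity by one float epsilon before the cast, e.g. a positive lower
 * bound xl becomes (1 - GMX_FLOAT_EPS)*xl, so rounding can never shrink the box.
 */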
414 /* Coordinate order x,y,z, bb order xyz0 */
415 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
417 int i;
418 real xl, xh, yl, yh, zl, zh;
420 i = 0;
421 xl = x[i+XX];
422 xh = x[i+XX];
423 yl = x[i+YY];
424 yh = x[i+YY];
425 zl = x[i+ZZ];
426 zh = x[i+ZZ];
427 i += stride;
428 for (int j = 1; j < na; j++)
430 xl = std::min(xl, x[i+XX]);
431 xh = std::max(xh, x[i+XX]);
432 yl = std::min(yl, x[i+YY]);
433 yh = std::max(yh, x[i+YY]);
434 zl = std::min(zl, x[i+ZZ]);
435 zh = std::max(zh, x[i+ZZ]);
436 i += stride;
438 /* Note: possible double to float conversion here */
439 bb->lower[BB_X] = R2F_D(xl);
440 bb->lower[BB_Y] = R2F_D(yl);
441 bb->lower[BB_Z] = R2F_D(zl);
442 bb->upper[BB_X] = R2F_U(xh);
443 bb->upper[BB_Y] = R2F_U(yh);
444 bb->upper[BB_Z] = R2F_U(zh);
447 /* Packed coordinates, bb order xyz0 */
448 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
450 real xl, xh, yl, yh, zl, zh;
452 xl = x[XX*c_packX4];
453 xh = x[XX*c_packX4];
454 yl = x[YY*c_packX4];
455 yh = x[YY*c_packX4];
456 zl = x[ZZ*c_packX4];
457 zh = x[ZZ*c_packX4];
458 for (int j = 1; j < na; j++)
460 xl = std::min(xl, x[j+XX*c_packX4]);
461 xh = std::max(xh, x[j+XX*c_packX4]);
462 yl = std::min(yl, x[j+YY*c_packX4]);
463 yh = std::max(yh, x[j+YY*c_packX4]);
464 zl = std::min(zl, x[j+ZZ*c_packX4]);
465 zh = std::max(zh, x[j+ZZ*c_packX4]);
467 /* Note: possible double to float conversion here */
468 bb->lower[BB_X] = R2F_D(xl);
469 bb->lower[BB_Y] = R2F_D(yl);
470 bb->lower[BB_Z] = R2F_D(zl);
471 bb->upper[BB_X] = R2F_U(xh);
472 bb->upper[BB_Y] = R2F_U(yh);
473 bb->upper[BB_Z] = R2F_U(zh);
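/* Descriptive note on the packed layouts used above and below: with nbatX4
 * the coordinates of each group of c_packX4 atoms are stored as
 * x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3, so the x coordinate of atom j within
 * a pack is x[j + XX*c_packX4]; calc_bounding_box_x_x8 uses the same scheme
 * with c_packX8 for the nbatX8 format.
 */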
476 /* Packed coordinates, bb order xyz0 */
477 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
479 real xl, xh, yl, yh, zl, zh;
481 xl = x[XX*c_packX8];
482 xh = x[XX*c_packX8];
483 yl = x[YY*c_packX8];
484 yh = x[YY*c_packX8];
485 zl = x[ZZ*c_packX8];
486 zh = x[ZZ*c_packX8];
487 for (int j = 1; j < na; j++)
489 xl = std::min(xl, x[j+XX*c_packX8]);
490 xh = std::max(xh, x[j+XX*c_packX8]);
491 yl = std::min(yl, x[j+YY*c_packX8]);
492 yh = std::max(yh, x[j+YY*c_packX8]);
493 zl = std::min(zl, x[j+ZZ*c_packX8]);
494 zh = std::max(zh, x[j+ZZ*c_packX8]);
496 /* Note: possible double to float conversion here */
497 bb->lower[BB_X] = R2F_D(xl);
498 bb->lower[BB_Y] = R2F_D(yl);
499 bb->lower[BB_Z] = R2F_D(zl);
500 bb->upper[BB_X] = R2F_U(xh);
501 bb->upper[BB_Y] = R2F_U(yh);
502 bb->upper[BB_Z] = R2F_U(zh);
505 /* Packed coordinates, bb order xyz0 */
506 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
507 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
509 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
510 using namespace gmx;
512 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
514 if (na > 2)
516 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
518 else
520 /* Set the "empty" bounding box to the same as the first one,
521 * so we don't need to treat special cases in the rest of the code.
523 #if NBNXN_SEARCH_BB_SIMD4
524 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
525 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
526 #else
527 bbj[1] = bbj[0];
528 #endif
531 #if NBNXN_SEARCH_BB_SIMD4
532 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
533 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
534 #else
536 int i;
538 for (i = 0; i < NNBSBB_C; i++)
540 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
541 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
544 #endif
547 #if NBNXN_SEARCH_BB_SIMD4
549 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
550 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
552 int i;
553 real xl, xh, yl, yh, zl, zh;
555 i = 0;
556 xl = x[i+XX];
557 xh = x[i+XX];
558 yl = x[i+YY];
559 yh = x[i+YY];
560 zl = x[i+ZZ];
561 zh = x[i+ZZ];
562 i += stride;
563 for (int j = 1; j < na; j++)
565 xl = std::min(xl, x[i+XX]);
566 xh = std::max(xh, x[i+XX]);
567 yl = std::min(yl, x[i+YY]);
568 yh = std::max(yh, x[i+YY]);
569 zl = std::min(zl, x[i+ZZ]);
570 zh = std::max(zh, x[i+ZZ]);
571 i += stride;
573 /* Note: possible double to float conversion here */
574 bb[0*STRIDE_PBB] = R2F_D(xl);
575 bb[1*STRIDE_PBB] = R2F_D(yl);
576 bb[2*STRIDE_PBB] = R2F_D(zl);
577 bb[3*STRIDE_PBB] = R2F_U(xh);
578 bb[4*STRIDE_PBB] = R2F_U(yh);
579 bb[5*STRIDE_PBB] = R2F_U(zh);
582 #endif /* NBNXN_SEARCH_BB_SIMD4 */
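/* Descriptive note: in the NBNXN_BBXXXX (xxxxyyyyzzzz) layout the bounds of
 * STRIDE_PBB clusters are interleaved so that a SIMD4 load fetches the same
 * component of consecutive clusters: component d (0-2 lower, 3-5 upper
 * corner) of cluster i within a pack lives at bb[d*STRIDE_PBB + i], as used
 * above and in print_bbsizes_supersub below.
 */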
584 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
586 /* Coordinate order xyz?, bb order xyz0 */
587 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
589 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
590 using namespace gmx;
592 Simd4Float bb_0_S, bb_1_S;
593 Simd4Float x_S;
595 bb_0_S = load4(x);
596 bb_1_S = bb_0_S;
598 for (int i = 1; i < na; i++)
600 x_S = load4(x+i*NNBSBB_C);
601 bb_0_S = min(bb_0_S, x_S);
602 bb_1_S = max(bb_1_S, x_S);
605 store4(&bb->lower[0], bb_0_S);
606 store4(&bb->upper[0], bb_1_S);
609 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
610 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
611 nbnxn_bb_t *bb_work_aligned,
612 real *bb)
614 calc_bounding_box_simd4(na, x, bb_work_aligned);
616 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
617 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
618 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
619 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
620 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
621 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
624 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
627 /* Combines pairs of consecutive bounding boxes */
628 static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
630 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
631 using namespace gmx;
633 for (int i = 0; i < grid->ncx*grid->ncy; i++)
635 /* Starting bb in a column is expected to be 2-aligned */
636 int sc2 = grid->cxy_ind[i]>>1;
637 /* For odd numbers skip the last bb here */
638 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
639 int c2;
640 for (c2 = sc2; c2 < sc2+nc2; c2++)
642 #if NBNXN_SEARCH_BB_SIMD4
643 Simd4Float min_S, max_S;
645 min_S = min(load4(&bb[c2*2+0].lower[0]),
646 load4(&bb[c2*2+1].lower[0]));
647 max_S = max(load4(&bb[c2*2+0].upper[0]),
648 load4(&bb[c2*2+1].upper[0]));
649 store4(&grid->bbj[c2].lower[0], min_S);
650 store4(&grid->bbj[c2].upper[0], max_S);
651 #else
652 for (int j = 0; j < NNBSBB_C; j++)
654 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
655 bb[c2*2+1].lower[j]);
656 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
657 bb[c2*2+1].upper[j]);
659 #endif
661 if (((grid->cxy_na[i]+3)>>2) & 1)
663 /* The bb count in this column is odd: duplicate the last bb */
664 for (int j = 0; j < NNBSBB_C; j++)
666 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
667 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
674 /* Prints the average bb size, used for debug output */
675 static void print_bbsizes_simple(FILE *fp,
676 const nbnxn_grid_t *grid)
678 dvec ba;
680 clear_dvec(ba);
681 for (int c = 0; c < grid->nc; c++)
683 for (int d = 0; d < DIM; d++)
685 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
688 dsvmul(1.0/grid->nc, ba, ba);
690 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
691 grid->sx,
692 grid->sy,
693 grid->atom_density > 0 ?
694 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
695 ba[XX], ba[YY], ba[ZZ],
696 ba[XX]/grid->sx,
697 ba[YY]/grid->sy,
698 grid->atom_density > 0 ?
699 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
702 /* Prints the average bb size, used for debug output */
703 static void print_bbsizes_supersub(FILE *fp,
704 const nbnxn_grid_t *grid)
706 int ns;
707 dvec ba;
709 clear_dvec(ba);
710 ns = 0;
711 for (int c = 0; c < grid->nc; c++)
713 #if NBNXN_BBXXXX
714 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
716 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
717 for (int i = 0; i < STRIDE_PBB; i++)
719 for (int d = 0; d < DIM; d++)
721 ba[d] +=
722 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
723 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
727 #else
728 for (int s = 0; s < grid->nsubc[c]; s++)
730 int cs = c*c_gpuNumClusterPerCell + s;
731 for (int d = 0; d < DIM; d++)
733 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
736 #endif
737 ns += grid->nsubc[c];
739 dsvmul(1.0/ns, ba, ba);
741 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
742 grid->sx/c_gpuNumClusterPerCellX,
743 grid->sy/c_gpuNumClusterPerCellY,
744 grid->atom_density > 0 ?
745 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
746 ba[XX], ba[YY], ba[ZZ],
747 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
748 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
749 grid->atom_density > 0 ?
750 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
753 /* Set non-bonded interaction flags for the current cluster.
754 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
756 static void sort_cluster_on_flag(int natoms_cluster,
757 int a0, int a1, const int *atinfo,
758 int *order,
759 int *flags)
761 const int natoms_cluster_max = 8;
762 int sort1[natoms_cluster_max];
763 int sort2[natoms_cluster_max];
765 assert(natoms_cluster <= natoms_cluster_max);
767 *flags = 0;
769 int subc = 0;
770 for (int s = a0; s < a1; s += natoms_cluster)
772 /* Make lists for this (sub-)cell on atoms with and without LJ */
773 int n1 = 0;
774 int n2 = 0;
775 gmx_bool haveQ = FALSE;
776 int a_lj_max = -1;
777 for (int a = s; a < std::min(s + natoms_cluster, a1); a++)
779 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
781 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
783 sort1[n1++] = order[a];
784 a_lj_max = a;
786 else
788 sort2[n2++] = order[a];
792 /* If we don't have atoms with LJ, there's nothing to sort */
793 if (n1 > 0)
795 *flags |= NBNXN_CI_DO_LJ(subc);
797 if (2*n1 <= natoms_cluster)
799 /* Only sort when strictly necessary. Ordering particles
800 * can lead to less accurate summation
801 * due to rounding, both for LJ and Coulomb interactions.
803 if (2*(a_lj_max - s) >= natoms_cluster)
805 for (int i = 0; i < n1; i++)
807 order[a0 + i] = sort1[i];
809 for (int j = 0; j < n2; j++)
811 order[a0 + n1 + j] = sort2[j];
815 *flags |= NBNXN_CI_HALF_LJ(subc);
818 if (haveQ)
820 *flags |= NBNXN_CI_DO_COUL(subc);
822 subc++;
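/* A hedged usage sketch (illustrative only): a kernel consuming these flags
 * can test the per-cluster bits set above, for example
 *
 *   if (flags & NBNXN_CI_HALF_LJ(subc))
 *   {
 *       // only the first half of the cluster has LJ parameters
 *   }
 *
 * which is why atoms with LJ are moved to the front of the cluster whenever
 * at most half of the atoms have LJ.
 */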
826 /* Fill a pair search cell with atoms.
827 * Potentially sorts atoms and sets the interaction flags.
829 static void fill_cell(const nbnxn_search_t nbs,
830 nbnxn_grid_t *grid,
831 nbnxn_atomdata_t *nbat,
832 int a0, int a1,
833 const int *atinfo,
834 const rvec *x,
835 nbnxn_bb_t gmx_unused *bb_work_aligned)
837 int na, a;
838 size_t offset;
839 nbnxn_bb_t *bb_ptr;
840 #if NBNXN_BBXXXX
841 float *pbb_ptr;
842 #endif
844 na = a1 - a0;
846 if (grid->bSimple)
848 /* Note that non-local grids are already sorted.
849 * Then sort_cluster_on_flag will only set the flags and the sorting
850 * will not affect the atom order.
852 sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
853 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
856 if (nbs->bFEP)
858 /* Set the fep flag for perturbed atoms in this (sub-)cell */
859 int c;
861 /* The grid-local cluster/(sub-)cell index */
862 c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
863 grid->fep[c] = 0;
864 for (int at = a0; at < a1; at++)
866 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
868 grid->fep[c] |= (1 << (at - a0));
873 /* Now that we have sorted the atoms, set the cell indices */
874 for (a = a0; a < a1; a++)
876 nbs->cell[nbs->a[a]] = a;
879 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
880 nbat->XFormat, nbat->x, a0);
882 if (nbat->XFormat == nbatX4)
884 /* Store the bounding boxes as xyz.xyz. */
885 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
886 bb_ptr = grid->bb + offset;
888 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
889 if (2*grid->na_cj == grid->na_c)
891 calc_bounding_box_x_x4_halves(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr,
892 grid->bbj+offset*2);
894 else
895 #endif
897 calc_bounding_box_x_x4(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr);
900 else if (nbat->XFormat == nbatX8)
902 /* Store the bounding boxes as xyz.xyz. */
903 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
904 bb_ptr = grid->bb + offset;
906 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(a0), bb_ptr);
908 #if NBNXN_BBXXXX
909 else if (!grid->bSimple)
911 /* Store the bounding boxes in a format convenient
912 * for SIMD4 calculations: xxxxyyyyzzzz...
914 pbb_ptr =
915 grid->pbb +
916 ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
917 (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
919 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
920 if (nbat->XFormat == nbatXYZQ)
922 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
923 bb_work_aligned, pbb_ptr);
925 else
926 #endif
928 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
929 pbb_ptr);
931 if (gmx_debug_at)
933 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
934 a0 >> grid->na_c_2log,
935 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
936 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
937 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
940 #endif
941 else
943 /* Store the bounding boxes as xyz.xyz. */
944 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
946 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
947 bb_ptr);
949 if (gmx_debug_at)
951 int bbo;
952 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
953 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
954 a0 >> grid->na_c_2log,
955 grid->bb[bbo].lower[BB_X],
956 grid->bb[bbo].lower[BB_Y],
957 grid->bb[bbo].lower[BB_Z],
958 grid->bb[bbo].upper[BB_X],
959 grid->bb[bbo].upper[BB_Y],
960 grid->bb[bbo].upper[BB_Z]);
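/* Descriptive note: the (... >> grid->na_c_2log) expressions above divide an
 * atom offset by the cluster size na_c, which is a power of two; e.g. with
 * na_c = 8 we have na_c_2log = 3, so an atom offset of 24 maps to grid-local
 * cluster index 3.
 */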
965 /* Spatially sort the atoms within one grid column */
966 static void sort_columns_simple(const nbnxn_search_t nbs,
967 int dd_zone,
968 nbnxn_grid_t *grid,
969 int a0, int a1,
970 const int *atinfo,
971 rvec *x,
972 nbnxn_atomdata_t *nbat,
973 int cxy_start, int cxy_end,
974 int *sort_work)
976 int cfilled, c;
978 if (debug)
980 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
981 grid->cell0, cxy_start, cxy_end, a0, a1);
984 /* Sort the atoms within each x,y column in 3 dimensions */
985 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
987 int na = grid->cxy_na[cxy];
988 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
989 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
991 /* Sort the atoms within each x,y column on z coordinate */
992 sort_atoms(ZZ, FALSE, dd_zone,
993 nbs->a+ash, na, x,
994 grid->c0[ZZ],
995 1.0/grid->size[ZZ], ncz*grid->na_sc,
996 sort_work);
998 /* Fill the ncz cells in this column */
999 cfilled = grid->cxy_ind[cxy];
1000 for (int cz = 0; cz < ncz; cz++)
1002 c = grid->cxy_ind[cxy] + cz;
1004 int ash_c = ash + cz*grid->na_sc;
1005 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
1007 fill_cell(nbs, grid, nbat,
1008 ash_c, ash_c+na_c, atinfo, x,
1009 nullptr);
1011 /* This copy to bbcz is not really necessary.
1012 * But it allows us to use the same grid search code
1013 * for the simple and supersub cell setups.
1015 if (na_c > 0)
1017 cfilled = c;
1019 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
1020 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1023 /* Set the unused atom indices to -1 */
1024 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1026 nbs->a[ash+ind] = -1;
1031 /* Spatially sort the atoms within one grid column */
1032 static void sort_columns_supersub(const nbnxn_search_t nbs,
1033 int dd_zone,
1034 nbnxn_grid_t *grid,
1035 int a0, int a1,
1036 const int *atinfo,
1037 rvec *x,
1038 nbnxn_atomdata_t *nbat,
1039 int cxy_start, int cxy_end,
1040 int *sort_work)
1042 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1044 bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));
1046 if (debug)
1048 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1049 grid->cell0, cxy_start, cxy_end, a0, a1);
1052 int subdiv_x = grid->na_c;
1053 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1054 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
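/* Descriptive note: subdiv_x is the number of atoms per cluster, subdiv_y the
 * number of atoms in one row of clusters along x, and subdiv_z the number of
 * atoms in one x,y layer of clusters; ash_z, ash_y and ash_x below step
 * through the z-sorted column one layer, row and cluster at a time.
 */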
1056 /* Sort the atoms within each x,y column in 3 dimensions */
1057 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1059 int cx = cxy/grid->ncy;
1060 int cy = cxy - cx*grid->ncy;
1062 int na = grid->cxy_na[cxy];
1063 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1064 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1066 /* Sort the atoms within each x,y column on z coordinate */
1067 sort_atoms(ZZ, FALSE, dd_zone,
1068 nbs->a + ash, na, x,
1069 grid->c0[ZZ],
1070 1.0/grid->size[ZZ], ncz*grid->na_sc,
1071 sort_work);
1073 /* This loop goes over the supercells and subcells along z at once */
1074 for (int sub_z = 0; sub_z < ncz*c_gpuNumClusterPerCellZ; sub_z++)
1076 int ash_z = ash + sub_z*subdiv_z;
1077 int na_z = std::min(subdiv_z, na - (ash_z - ash));
1078 int cz = -1;
1079 /* We have already sorted on z */
1081 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1083 cz = sub_z/c_gpuNumClusterPerCellZ;
1084 int c = grid->cxy_ind[cxy] + cz;
1086 /* The number of atoms in this supercell */
1087 int na_c = std::min(grid->na_sc, na - (ash_z - ash));
1089 grid->nsubc[c] = std::min(c_gpuNumClusterPerCell, (na_c + grid->na_c - 1)/grid->na_c);
1091 /* Store the z-boundaries of the super cell */
1092 grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1093 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z + na_c - 1]][ZZ];
1096 if (c_gpuNumClusterPerCellY > 1)
1098 /* Sort the atoms along y */
1099 sort_atoms(YY, (sub_z & 1), dd_zone,
1100 nbs->a+ash_z, na_z, x,
1101 grid->c0[YY] + cy*grid->sy,
1102 grid->inv_sy, subdiv_z,
1103 sort_work);
1106 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1108 int ash_y = ash_z + sub_y*subdiv_y;
1109 int na_y = std::min(subdiv_y, na - (ash_y - ash));
1111 if (c_gpuNumClusterPerCellX > 1)
1113 /* Sort the atoms along x */
1114 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1115 nbs->a + ash_y, na_y, x,
1116 grid->c0[XX] + cx*grid->sx,
1117 grid->inv_sx, subdiv_y,
1118 sort_work);
1121 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1123 int ash_x = ash_y + sub_x*subdiv_x;
1124 int na_x = std::min(subdiv_x, na - (ash_x - ash));
1126 fill_cell(nbs, grid, nbat,
1127 ash_x, ash_x + na_x, atinfo, x,
1128 bb_work_aligned);
1133 /* Set the unused atom indices to -1 */
1134 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1136 nbs->a[ash+ind] = -1;
1141 /* Determine in which grid column atoms should go */
1142 static void calc_column_indices(nbnxn_grid_t *grid,
1143 int a0, int a1,
1144 rvec *x,
1145 int dd_zone, const int *move,
1146 int thread, int nthread,
1147 int *cell,
1148 int *cxy_na)
1150 /* We add one extra cell for particles which moved during DD */
1151 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1153 cxy_na[i] = 0;
1156 int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
1157 int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;
1158 if (dd_zone == 0)
1160 /* Home zone */
1161 for (int i = n0; i < n1; i++)
1163 if (move == nullptr || move[i] >= 0)
1165 /* We need to be careful with rounding,
1166 * particles might be a few bits outside the local zone.
1167 * The int cast takes care of the lower bound,
1168 * we will explicitly take care of the upper bound.
1170 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1171 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1173 #ifndef NDEBUG
1174 if (cx < 0 || cx > grid->ncx ||
1175 cy < 0 || cy > grid->ncy)
1177 gmx_fatal(FARGS,
1178 "grid cell cx %d cy %d out of range (max %d %d)\n"
1179 "atom %f %f %f, grid->c0 %f %f",
1180 cx, cy, grid->ncx, grid->ncy,
1181 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1183 #endif
1184 /* Take care of potential rounding issues */
1185 cx = std::min(cx, grid->ncx - 1);
1186 cy = std::min(cy, grid->ncy - 1);
1188 /* For the moment cell will contain only the grid-local
1189 * x and y indices, not z.
1191 cell[i] = cx*grid->ncy + cy;
1193 else
1195 /* Put this moved particle after the end of the grid,
1196 * so we can process it later without using conditionals.
1198 cell[i] = grid->ncx*grid->ncy;
1201 cxy_na[cell[i]]++;
1204 else
1206 /* Non-home zone */
1207 for (int i = n0; i < n1; i++)
1209 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1210 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1212 /* For non-home zones there could be particles outside
1213 * the non-bonded cut-off range, which have been communicated
1214 * for bonded interactions only. For the result it doesn't
1215 * matter where these end up on the grid. For performance
1216 * we put them in an extra row at the border.
1218 cx = std::max(cx, 0);
1219 cx = std::min(cx, grid->ncx - 1);
1220 cy = std::max(cy, 0);
1221 cy = std::min(cy, grid->ncy - 1);
1223 /* For the moment cell will contain only the grid-local
1224 * x and y indices, not z.
1226 cell[i] = cx*grid->ncy + cy;
1228 cxy_na[cell[i]]++;
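/* Small worked example (illustrative only): with grid->ncy = 3, an atom
 * falling in column cx = 2, cy = 1 gets cell[i] = 2*3 + 1 = 7; the z index is
 * only assigned later, when the columns are sorted and filled.
 */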
1233 /* Determine in which grid cells the atoms should go */
1234 static void calc_cell_indices(const nbnxn_search_t nbs,
1235 int dd_zone,
1236 nbnxn_grid_t *grid,
1237 int a0, int a1,
1238 const int *atinfo,
1239 rvec *x,
1240 const int *move,
1241 nbnxn_atomdata_t *nbat)
1243 int n0, n1;
1244 int cx, cy, cxy, ncz_max, ncz;
1245 int nthread;
1246 int cxy_na_i;
1248 nthread = gmx_omp_nthreads_get(emntPairsearch);
1250 #pragma omp parallel for num_threads(nthread) schedule(static)
1251 for (int thread = 0; thread < nthread; thread++)
1255 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1256 nbs->cell, nbs->work[thread].cxy_na);
1258 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1261 /* Make the cell index as a function of x and y */
1262 ncz_max = 0;
1263 ncz = 0;
1264 grid->cxy_ind[0] = 0;
1265 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1267 /* We set ncz_max at the beginning of the loop instead of at the end
1268 * to skip i=grid->ncx*grid->ncy, which holds the moved particles
1269 * that do not need to be ordered on the grid.
1271 if (ncz > ncz_max)
1273 ncz_max = ncz;
1275 cxy_na_i = nbs->work[0].cxy_na[i];
1276 for (int thread = 1; thread < nthread; thread++)
1278 cxy_na_i += nbs->work[thread].cxy_na[i];
1280 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1281 if (nbat->XFormat == nbatX8)
1283 /* Make the number of cells a multiple of 2 */
1284 ncz = (ncz + 1) & ~1;
1286 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1287 /* Clear cxy_na, so we can reuse the array below */
1288 grid->cxy_na[i] = 0;
1290 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1292 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1294 if (debug)
1296 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1297 grid->na_sc, grid->na_c, grid->nc,
1298 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1299 ncz_max);
1300 if (gmx_debug_at)
1302 int i = 0;
1303 for (cy = 0; cy < grid->ncy; cy++)
1305 for (cx = 0; cx < grid->ncx; cx++)
1307 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1308 i++;
1310 fprintf(debug, "\n");
1315 /* Make sure the work array for sorting is large enough */
1316 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1318 for (int thread = 0; thread < nbs->nthread_max; thread++)
1320 nbs->work[thread].sort_work_nalloc =
1321 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1322 srenew(nbs->work[thread].sort_work,
1323 nbs->work[thread].sort_work_nalloc);
1324 /* When not in use, all elements should be -1 */
1325 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1327 nbs->work[thread].sort_work[i] = -1;
1332 /* Now that we know the dimensions, we can fill the grid.
1333 * This is the first, unsorted fill. We sort the columns after this.
1335 for (int i = a0; i < a1; i++)
1337 /* At this point nbs->cell contains the local grid x,y indices */
1338 cxy = nbs->cell[i];
1339 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1342 if (dd_zone == 0)
1344 /* Set the cell indices for the moved particles */
1345 n0 = grid->nc*grid->na_sc;
1346 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1347 if (dd_zone == 0)
1349 for (int i = n0; i < n1; i++)
1351 nbs->cell[nbs->a[i]] = i;
1356 /* Sort the super-cell columns along z into the sub-cells. */
1357 #pragma omp parallel for num_threads(nthread) schedule(static)
1358 for (int thread = 0; thread < nthread; thread++)
1362 if (grid->bSimple)
1364 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1365 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1366 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1367 nbs->work[thread].sort_work);
1369 else
1371 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1372 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1373 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1374 nbs->work[thread].sort_work);
1377 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1380 if (grid->bSimple && nbat->XFormat == nbatX8)
1382 combine_bounding_box_pairs(grid, grid->bb);
1385 if (!grid->bSimple)
1387 grid->nsubc_tot = 0;
1388 for (int i = 0; i < grid->nc; i++)
1390 grid->nsubc_tot += grid->nsubc[i];
1394 if (debug)
1396 if (grid->bSimple)
1398 print_bbsizes_simple(debug, grid);
1400 else
1402 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1403 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1405 print_bbsizes_supersub(debug, grid);
1410 /* Sets up a grid and puts the atoms on the grid.
1411 * This function only operates on one domain of the domain decomposition.
1412 * Note that without domain decomposition there is only one domain.
1414 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1415 int ePBC, matrix box,
1416 int dd_zone,
1417 rvec corner0, rvec corner1,
1418 int a0, int a1,
1419 real atom_density,
1420 const int *atinfo,
1421 rvec *x,
1422 int nmoved, int *move,
1423 int nb_kernel_type,
1424 nbnxn_atomdata_t *nbat)
1426 nbnxn_grid_t *grid;
1427 int n;
1428 int nc_max_grid, nc_max;
1430 grid = &nbs->grid[dd_zone];
1432 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1434 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1436 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1437 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1438 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1439 grid->na_c_2log = get_2log(grid->na_c);
1441 nbat->na_c = grid->na_c;
1443 if (dd_zone == 0)
1445 grid->cell0 = 0;
1447 else
1449 grid->cell0 =
1450 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1451 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1454 n = a1 - a0;
1456 if (dd_zone == 0)
1458 nbs->ePBC = ePBC;
1459 copy_mat(box, nbs->box);
1461 /* Avoid zero density */
1462 if (atom_density > 0)
1464 grid->atom_density = atom_density;
1466 else
1468 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1471 grid->cell0 = 0;
1473 nbs->natoms_local = a1 - nmoved;
1474 /* We assume that nbnxn_put_on_grid is called first
1475 * for the local atoms (dd_zone=0).
1477 nbs->natoms_nonlocal = a1 - nmoved;
1479 if (debug)
1481 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1482 nbs->natoms_local, grid->atom_density);
1485 else
1487 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
1490 /* We always use the home zone (grid[0]) for setting the cell size,
1491 * since determining densities for non-local zones is difficult.
1493 nc_max_grid = set_grid_size_xy(nbs, grid,
1494 dd_zone, n-nmoved, corner0, corner1,
1495 nbs->grid[0].atom_density);
1497 nc_max = grid->cell0 + nc_max_grid;
1499 if (a1 > nbs->cell_nalloc)
1501 nbs->cell_nalloc = over_alloc_large(a1);
1502 srenew(nbs->cell, nbs->cell_nalloc);
1505 /* To avoid conditionals we store the moved particles at the end of a;
1506 * make sure we have enough space.
1508 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1510 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1511 srenew(nbs->a, nbs->a_nalloc);
1514 /* We need padding up to a multiple of the buffer flag size: simply add */
1515 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1517 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1520 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1522 if (dd_zone == 0)
1524 nbat->natoms_local = nbat->natoms;
1527 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1530 /* Calls nbnxn_put_on_grid for all non-local domains */
1531 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1532 const struct gmx_domdec_zones_t *zones,
1533 const int *atinfo,
1534 rvec *x,
1535 int nb_kernel_type,
1536 nbnxn_atomdata_t *nbat)
1538 rvec c0, c1;
1540 for (int zone = 1; zone < zones->n; zone++)
1542 for (int d = 0; d < DIM; d++)
1544 c0[d] = zones->size[zone].bb_x0[d];
1545 c1[d] = zones->size[zone].bb_x1[d];
1548 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1549 zone, c0, c1,
1550 zones->cg_range[zone],
1551 zones->cg_range[zone+1],
1553 atinfo,
1555 0, nullptr,
1556 nb_kernel_type,
1557 nbat);
1561 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1563 *ncx = nbs->grid[0].ncx;
1564 *ncy = nbs->grid[0].ncy;
1567 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1569 const nbnxn_grid_t *grid;
1571 grid = &nbs->grid[0];
1573 /* Return the atom order for the home cell (index 0) */
1574 *a = nbs->a;
1576 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1579 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1581 /* Set the atom order for the home cell (index 0) */
1582 nbnxn_grid_t *grid = &nbs->grid[0];
1584 int ao = 0;
1585 for (int cx = 0; cx < grid->ncx; cx++)
1587 for (int cy = 0; cy < grid->ncy; cy++)
1589 int cxy = cx*grid->ncy + cy;
1590 int j = grid->cxy_ind[cxy]*grid->na_sc;
1591 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1593 nbs->a[j] = ao;
1594 nbs->cell[ao] = j;
1595 ao++;
1596 j++;