Fix nbnxm OpenCL with cluster size 4
[gromacs.git] / src / gromacs / nbnxm / grid.cpp
blob 132ae556e1454a1ecfa50210cb02e7cfae4eaa95
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /*! \internal \file
38 * \brief
39 * Implements the Grid class.
41 * \author Berk Hess <hess@kth.se>
42 * \ingroup module_nbnxm
45 #include "gmxpre.h"
47 #include "grid.h"
49 #include <cmath>
50 #include <cstring>
52 #include <algorithm>
54 #include "gromacs/math/utilities.h"
55 #include "gromacs/math/vec.h"
56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
57 #include "gromacs/mdlib/updategroupscog.h"
58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
59 #include "gromacs/nbnxm/atomdata.h"
60 #include "gromacs/nbnxm/nbnxm_geometry.h"
61 #include "gromacs/simd/simd.h"
62 #include "gromacs/simd/vector_operations.h"
64 namespace Nbnxm
67 Grid::Geometry::Geometry(const PairlistType pairlistType) :
68 isSimple(pairlistType != PairlistType::HierarchicalNxN),
69 numAtomsICluster(IClusterSizePerListType[pairlistType]),
70 numAtomsJCluster(JClusterSizePerListType[pairlistType]),
71 numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell)*numAtomsICluster),
72 numAtomsICluster2Log(get_2log(numAtomsICluster))
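/* For example (assuming the default GPU layout): with numAtomsICluster = 8 and
 * c_gpuNumClusterPerCell = 8 (a 2x2x2 cluster grid), numAtomsPerCell = 64;
 * for a simple CPU grid numAtomsPerCell equals the i-cluster size.
 */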
76 Grid::Grid(const PairlistType pairlistType) :
77 geometry_(pairlistType)
81 /*! \brief Returns the atom density (> 0) of a rectangular grid */
82 static real gridAtomDensity(int numAtoms,
83 const rvec lowerCorner,
84 const rvec upperCorner)
86 rvec size;
88 if (numAtoms == 0)
90 /* To avoid zero density we use a minimum of 1 atom */
91 numAtoms = 1;
94 rvec_sub(upperCorner, lowerCorner, size);
96 return numAtoms/(size[XX]*size[YY]*size[ZZ]);
99 void Grid::setDimensions(const int ddZone,
100 const int numAtoms,
101 const rvec lowerCorner,
102 const rvec upperCorner,
103 real atomDensity,
104 const real maxAtomGroupRadius,
105 const bool haveFep)
107 /* For the home zone we compute the density when not set (=-1) or when =0 */
108 if (ddZone == 0 && atomDensity <= 0)
110 atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
113 dimensions_.atomDensity = atomDensity;
114 dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
116 rvec size;
117 rvec_sub(upperCorner, lowerCorner, size);
119 if (numAtoms > geometry_.numAtomsPerCell)
121 GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
123 /* target cell length */
124 real tlen_x;
125 real tlen_y;
126 if (geometry_.isSimple)
128 /* To minimize the zero interactions, we should make
129 * the largest of the i/j cells cubic.
131 int numAtomsInCell = std::max(geometry_.numAtomsICluster,
132 geometry_.numAtomsJCluster);
134 /* Approximately cubic cells */
135 real tlen = std::cbrt(numAtomsInCell/atomDensity);
136 tlen_x = tlen;
137 tlen_y = tlen;
139 else
141 /* Approximately cubic sub cells */
142 real tlen = std::cbrt(geometry_.numAtomsICluster/atomDensity);
143 tlen_x = tlen*c_gpuNumClusterPerCellX;
144 tlen_y = tlen*c_gpuNumClusterPerCellY;
146 /* We round ncx and ncy down, because we get fewer cell pairs
147 * in the pair list when the fixed cell dimensions (x,y) are
148 * larger than the variable one (z) than the other way around.
150 dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
151 dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
153 else
155 dimensions_.numCells[XX] = 1;
156 dimensions_.numCells[YY] = 1;
159 for (int d = 0; d < DIM - 1; d++)
161 dimensions_.cellSize[d] = size[d]/dimensions_.numCells[d];
162 dimensions_.invCellSize[d] = 1/dimensions_.cellSize[d];
165 if (ddZone > 0)
167 /* This is a non-home zone, add an extra row of cells
168 * for particles communicated for bonded interactions.
169 * These can be beyond the cut-off. It doesn't matter where
170 * they end up on the grid, but for performance it's better
171 * if they don't end up in cells that can be within cut-off range.
173 dimensions_.numCells[XX]++;
174 dimensions_.numCells[YY]++;
177 /* We need one additional cell entry for particles moved by DD */
178 cxy_na_.resize(numColumns() + 1);
179 cxy_ind_.resize(numColumns() + 2);
181 /* Worst case scenario of 1 atom in each last cell */
182 int maxNumCells;
183 if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
185 maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns();
187 else
189 maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns()*geometry_.numAtomsJCluster/geometry_.numAtomsICluster;
192 if (!geometry_.isSimple)
194 numClusters_.resize(maxNumCells);
196 bbcz_.resize(maxNumCells);
198 /* This resize also zeros the contents, which avoids possible
199 * floating-point exceptions in SIMD with the unused bb elements.
201 if (geometry_.isSimple)
203 bb_.resize(maxNumCells);
205 else
207 #if NBNXN_BBXXXX
208 pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell));
209 #else
210 bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
211 #endif
214 if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
216 bbj_ = bb_;
218 else
220 GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
222 bbjStorage_.resize(maxNumCells*geometry_.numAtomsICluster/geometry_.numAtomsJCluster);
223 bbj_ = bbjStorage_;
226 flags_.resize(maxNumCells);
227 if (haveFep)
229 fep_.resize(maxNumCells*geometry_.numAtomsPerCell/geometry_.numAtomsICluster);
232 copy_rvec(lowerCorner, dimensions_.lowerCorner);
233 copy_rvec(upperCorner, dimensions_.upperCorner);
234 copy_rvec(size, dimensions_.gridSize);
237 /* We need to sort particles in grid columns on z-coordinate.
238 * As particles are very often distributed homogeneously, we use a sorting
239 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
240 * by a factor, cast to an int and try to store in that hole. If the hole
241 * is full, we move this or another particle. A second pass is needed to make
242 * contiguous elements. c_sortGridRatio is the ratio of holes to particles.
243 * 4 is the optimal value for a homogeneous particle distribution and allows
244 * for an O(#particles) sort up to distributions where all particles are
245 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
246 * as it can be expensive to detect inhomogeneous particle distributions.
248 /*! \brief Ratio of grid cells to atoms */
249 static constexpr int c_sortGridRatio = 4;
250 /*! \brief Maximum ratio of holes used; in the worst case all particles
251 * end up in the last hole and we need #atoms extra holes at the end.
253 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
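/* A minimal sketch of the pigeonhole step, assuming n_per_h = 16 atoms in a range:
 * with c_sortGridRatio = 4 there are 16*4 = 64 holes, an atom at fraction f of the
 * range is first tried in hole zi = (int)(f*64), and c_sortGridMaxSizeFactor ensures
 * the work array has 64 + 16 entries to absorb the worst-case overflow at the end.
 */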
255 /*! \brief Sorts particle index a on coordinates x along dim.
257 * Backwards tells whether we want decreasing instead of increasing coordinates.
258 * h0 is the minimum of the coordinate range.
259 * invh is the inverse length of the sorting range.
260 * n_per_h (>=n) is the expected average number of particles per 1/invh
261 * sort is the sorting work array.
262 * sort should have a size of at least n_per_h*c_sortGridRatio + n,
263 * or, more simply, allocate at least n*c_sortGridMaxSizeFactor elements.
265 static void sort_atoms(int dim, gmx_bool Backwards,
266 int gmx_unused dd_zone,
267 bool gmx_unused relevantAtomsAreWithinGridBounds,
268 int *a, int n,
269 gmx::ArrayRef<const gmx::RVec> x,
270 real h0, real invh, int n_per_h,
271 gmx::ArrayRef<int> sort)
273 if (n <= 1)
275 /* Nothing to do */
276 return;
279 GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
281 /* Transform the inverse range height into the inverse hole height */
282 invh *= n_per_h*c_sortGridRatio;
284 /* Set nsort to the maximum possible number of holes used.
285 * In the worst case all n elements end up in the last hole.
287 int nsort = n_per_h*c_sortGridRatio + n;
289 /* Determine the index range used, so we can limit it for the second pass */
290 int zi_min = INT_MAX;
291 int zi_max = -1;
293 /* Sort the particles using a simple index sort */
294 for (int i = 0; i < n; i++)
296 /* The cast takes care of floating-point rounding effects below zero.
297 * This code assumes particles are less than 1/c_sortGridRatio
298 * times the box height out of the box.
300 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
302 #ifndef NDEBUG
303 /* As we can have rounding effects, we use > instead of >= here */
304 if (relevantAtomsAreWithinGridBounds &&
305 (zi < 0 || (dd_zone == 0 && zi > n_per_h*c_sortGridRatio)))
307 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
308 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
309 n_per_h, c_sortGridRatio);
311 #endif
312 if (zi < 0)
314 zi = 0;
317 /* In a non-local domain, particles communicated for bonded interactions
318 * can be far beyond the grid size, which is set by the non-bonded
319 * cut-off distance. We sort such particles into the last cell.
321 if (zi > n_per_h*c_sortGridRatio)
323 zi = n_per_h*c_sortGridRatio;
326 /* Ideally this particle should go in sort cell zi,
327 * but that might already be in use,
328 * in that case find the first empty cell higher up
330 if (sort[zi] < 0)
332 sort[zi] = a[i];
333 zi_min = std::min(zi_min, zi);
334 zi_max = std::max(zi_max, zi);
336 else
338 /* We have multiple atoms in the same sorting slot.
339 * Sort on real z for minimal bounding box size.
340 * There is an extra check for identical z to give a
341 * well-defined output order, independent of input order,
342 * which ensures binary reproducibility after restarts.
344 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
345 (x[a[i]][dim] == x[sort[zi]][dim] &&
346 a[i] > sort[zi])))
348 zi++;
351 if (sort[zi] >= 0)
353 /* Shift all elements by one slot until we find an empty slot */
354 int cp = sort[zi];
355 int zim = zi + 1;
356 while (sort[zim] >= 0)
358 int tmp = sort[zim];
359 sort[zim] = cp;
360 cp = tmp;
361 zim++;
363 sort[zim] = cp;
364 zi_max = std::max(zi_max, zim);
366 sort[zi] = a[i];
367 zi_max = std::max(zi_max, zi);
371 int c = 0;
372 if (!Backwards)
374 for (int zi = 0; zi < nsort; zi++)
376 if (sort[zi] >= 0)
378 a[c++] = sort[zi];
379 sort[zi] = -1;
383 else
385 for (int zi = zi_max; zi >= zi_min; zi--)
387 if (sort[zi] >= 0)
389 a[c++] = sort[zi];
390 sort[zi] = -1;
394 if (c < n)
396 gmx_incons("Lost particles while sorting");
400 #if GMX_DOUBLE
401 //! Returns double up to one least significant float bit smaller than x
402 static double R2F_D(const float x)
404 return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
406 //! Returns double up to one least significant float bit larger than x
407 static double R2F_U(const float x)
409 return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS)*x : (1 - GMX_FLOAT_EPS)*x);
411 #else
412 //! Returns x
413 static float R2F_D(const float x)
415 return x;
417 //! Returns x
418 static float R2F_U(const float x)
420 return x;
422 #endif
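/* The bounding boxes are stored in single precision. In double-precision builds
 * R2F_D/R2F_U round the lower/upper corner down/up by one float ULP, so the stored
 * box is guaranteed to still enclose the double-precision coordinates.
 */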
424 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
425 static void calc_bounding_box(int na, int stride, const real *x,
426 BoundingBox *bb)
428 int i;
429 real xl, xh, yl, yh, zl, zh;
431 i = 0;
432 xl = x[i+XX];
433 xh = x[i+XX];
434 yl = x[i+YY];
435 yh = x[i+YY];
436 zl = x[i+ZZ];
437 zh = x[i+ZZ];
438 i += stride;
439 for (int j = 1; j < na; j++)
441 xl = std::min(xl, x[i+XX]);
442 xh = std::max(xh, x[i+XX]);
443 yl = std::min(yl, x[i+YY]);
444 yh = std::max(yh, x[i+YY]);
445 zl = std::min(zl, x[i+ZZ]);
446 zh = std::max(zh, x[i+ZZ]);
447 i += stride;
449 /* Note: possible double to float conversion here */
450 bb->lower.x = R2F_D(xl);
451 bb->lower.y = R2F_D(yl);
452 bb->lower.z = R2F_D(zl);
453 bb->upper.x = R2F_U(xh);
454 bb->upper.y = R2F_U(yh);
455 bb->upper.z = R2F_U(zh);
458 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
459 static void calc_bounding_box_x_x4(int na, const real *x,
460 BoundingBox *bb)
462 real xl, xh, yl, yh, zl, zh;
464 xl = x[XX*c_packX4];
465 xh = x[XX*c_packX4];
466 yl = x[YY*c_packX4];
467 yh = x[YY*c_packX4];
468 zl = x[ZZ*c_packX4];
469 zh = x[ZZ*c_packX4];
470 for (int j = 1; j < na; j++)
472 xl = std::min(xl, x[j+XX*c_packX4]);
473 xh = std::max(xh, x[j+XX*c_packX4]);
474 yl = std::min(yl, x[j+YY*c_packX4]);
475 yh = std::max(yh, x[j+YY*c_packX4]);
476 zl = std::min(zl, x[j+ZZ*c_packX4]);
477 zh = std::max(zh, x[j+ZZ*c_packX4]);
479 /* Note: possible double to float conversion here */
480 bb->lower.x = R2F_D(xl);
481 bb->lower.y = R2F_D(yl);
482 bb->lower.z = R2F_D(zl);
483 bb->upper.x = R2F_U(xh);
484 bb->upper.y = R2F_U(yh);
485 bb->upper.z = R2F_U(zh);
488 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
489 static void calc_bounding_box_x_x8(int na, const real *x,
490 BoundingBox *bb)
492 real xl, xh, yl, yh, zl, zh;
494 xl = x[XX*c_packX8];
495 xh = x[XX*c_packX8];
496 yl = x[YY*c_packX8];
497 yh = x[YY*c_packX8];
498 zl = x[ZZ*c_packX8];
499 zh = x[ZZ*c_packX8];
500 for (int j = 1; j < na; j++)
502 xl = std::min(xl, x[j+XX*c_packX8]);
503 xh = std::max(xh, x[j+XX*c_packX8]);
504 yl = std::min(yl, x[j+YY*c_packX8]);
505 yh = std::max(yh, x[j+YY*c_packX8]);
506 zl = std::min(zl, x[j+ZZ*c_packX8]);
507 zh = std::max(zh, x[j+ZZ*c_packX8]);
509 /* Note: possible double to float conversion here */
510 bb->lower.x = R2F_D(xl);
511 bb->lower.y = R2F_D(yl);
512 bb->lower.z = R2F_D(zl);
513 bb->upper.x = R2F_U(xh);
514 bb->upper.y = R2F_U(yh);
515 bb->upper.z = R2F_U(zh);
518 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
519 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
520 BoundingBox *bb,
521 BoundingBox *bbj)
523 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
524 using namespace gmx;
526 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
528 if (na > 2)
530 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
532 else
534 /* Set the "empty" bounding box to the same as the first one,
535 * so we don't need to treat special cases in the rest of the code.
537 #if NBNXN_SEARCH_BB_SIMD4
538 store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
539 store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
540 #else
541 bbj[1] = bbj[0];
542 #endif
545 #if NBNXN_SEARCH_BB_SIMD4
546 store4(bb->lower.ptr(),
547 min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
548 store4(bb->upper.ptr(),
549 max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
550 #else
552 bb->lower = BoundingBox::Corner::min(bbj[0].lower,
553 bbj[1].lower);
554 bb->upper = BoundingBox::Corner::max(bbj[0].upper,
555 bbj[1].upper);
557 #endif
560 #if NBNXN_BBXXXX
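/* Sketch of the packed layout (assuming a pack size of 4): four consecutive clusters
 * share a pack storing their lower x values contiguously, then lower y, lower z,
 * followed by the upper corners, so each component can be read with one SIMD4 load.
 */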
562 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
563 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
565 int i;
566 real xl, xh, yl, yh, zl, zh;
568 i = 0;
569 xl = x[i+XX];
570 xh = x[i+XX];
571 yl = x[i+YY];
572 yh = x[i+YY];
573 zl = x[i+ZZ];
574 zh = x[i+ZZ];
575 i += stride;
576 for (int j = 1; j < na; j++)
578 xl = std::min(xl, x[i+XX]);
579 xh = std::max(xh, x[i+XX]);
580 yl = std::min(yl, x[i+YY]);
581 yh = std::max(yh, x[i+YY]);
582 zl = std::min(zl, x[i+ZZ]);
583 zh = std::max(zh, x[i+ZZ]);
584 i += stride;
586 /* Note: possible double to float conversion here */
587 bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl);
588 bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl);
589 bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl);
590 bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh);
591 bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh);
592 bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh);
595 #endif /* NBNXN_BBXXXX */
597 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
599 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
600 static void calc_bounding_box_simd4(int na, const float *x,
601 BoundingBox *bb)
603 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
604 using namespace gmx;
606 static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH*sizeof(float),
607 "The Corner struct should hold exactly 4 floats");
609 Simd4Float bb_0_S = load4(x);
610 Simd4Float bb_1_S = bb_0_S;
612 for (int i = 1; i < na; i++)
614 Simd4Float x_S = load4(x + i*GMX_SIMD4_WIDTH);
615 bb_0_S = min(bb_0_S, x_S);
616 bb_1_S = max(bb_1_S, x_S);
619 store4(bb->lower.ptr(), bb_0_S);
620 store4(bb->upper.ptr(), bb_1_S);
623 #if NBNXN_BBXXXX
625 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
626 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
627 BoundingBox *bb_work_aligned,
628 real *bb)
630 calc_bounding_box_simd4(na, x, bb_work_aligned);
632 bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
633 bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
634 bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
635 bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
636 bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
637 bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
640 #endif /* NBNXN_BBXXXX */
642 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
645 /*! \brief Combines pairs of consecutive bounding boxes */
646 static void combine_bounding_box_pairs(const Grid &grid,
647 gmx::ArrayRef<const BoundingBox> bb,
648 gmx::ArrayRef<BoundingBox> bbj)
650 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
651 using namespace gmx;
653 for (int i = 0; i < grid.numColumns(); i++)
655 /* Starting bb in a column is expected to be 2-aligned */
656 const int sc2 = grid.firstCellInColumn(i) >> 1;
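/* (numAtomsInColumn + 3) >> 2 counts the i-cluster bounding boxes (assuming the
 * simple-grid i-cluster size of 4); the extra shift by 1 pairs them, rounding down.
 */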
657 /* For odd numbers skip the last bb here */
658 const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
659 int c2;
660 for (c2 = sc2; c2 < sc2 + nc2; c2++)
662 #if NBNXN_SEARCH_BB_SIMD4
663 Simd4Float min_S, max_S;
665 min_S = min(load4(bb[c2*2+0].lower.ptr()),
666 load4(bb[c2*2+1].lower.ptr()));
667 max_S = max(load4(bb[c2*2+0].upper.ptr()),
668 load4(bb[c2*2+1].upper.ptr()));
669 store4(bbj[c2].lower.ptr(), min_S);
670 store4(bbj[c2].upper.ptr(), max_S);
671 #else
672 bbj[c2].lower = BoundingBox::Corner::min(bb[c2*2 + 0].lower,
673 bb[c2*2 + 1].lower);
674 bbj[c2].upper = BoundingBox::Corner::max(bb[c2*2 + 0].upper,
675 bb[c2*2 + 1].upper);
676 #endif
678 if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
680 /* The bb count in this column is odd: duplicate the last bb */
681 bbj[c2].lower = bb[c2*2].lower;
682 bbj[c2].upper = bb[c2*2].upper;
688 /*! \brief Prints the average bb size, used for debug output */
689 static void print_bbsizes_simple(FILE *fp,
690 const Grid &grid)
692 dvec ba = { 0 };
693 for (int c = 0; c < grid.numCells(); c++)
695 const BoundingBox &bb = grid.iBoundingBoxes()[c];
696 ba[XX] += bb.upper.x - bb.lower.x;
697 ba[YY] += bb.upper.y - bb.lower.y;
698 ba[ZZ] += bb.upper.z - bb.lower.z;
700 dsvmul(1.0/grid.numCells(), ba, ba);
702 const Grid::Dimensions &dims = grid.dimensions();
703 real avgCellSizeZ = (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]) : 0.0);
705 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
706 dims.cellSize[XX],
707 dims.cellSize[YY],
708 avgCellSizeZ,
709 ba[XX], ba[YY], ba[ZZ],
710 ba[XX]*dims.invCellSize[XX],
711 ba[YY]*dims.invCellSize[YY],
712 dims.atomDensity > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
715 /*! \brief Prints the average bb size, used for debug output */
716 static void print_bbsizes_supersub(FILE *fp,
717 const Grid &grid)
719 int ns;
720 dvec ba;
722 clear_dvec(ba);
723 ns = 0;
724 for (int c = 0; c < grid.numCells(); c++)
726 #if NBNXN_BBXXXX
727 for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
729 int cs_w = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize;
730 auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
731 for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
733 for (int d = 0; d < DIM; d++)
735 ba[d] +=
736 boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] -
737 boundingBoxes[(0 + d)*c_packedBoundingBoxesDimSize + i];
741 #else
742 for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
744 const BoundingBox &bb = grid.iBoundingBoxes()[c*c_gpuNumClusterPerCell + s];
745 ba[XX] += bb.upper.x - bb.lower.x;
746 ba[YY] += bb.upper.y - bb.lower.y;
747 ba[ZZ] += bb.upper.z - bb.lower.z;
749 #endif
750 ns += grid.numClustersPerCell()[c];
752 dsvmul(1.0/ns, ba, ba);
754 const Grid::Dimensions &dims = grid.dimensions();
755 const real avgClusterSizeZ =
756 (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
758 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
759 dims.cellSize[XX]/c_gpuNumClusterPerCellX,
760 dims.cellSize[YY]/c_gpuNumClusterPerCellY,
761 avgClusterSizeZ,
762 ba[XX], ba[YY], ba[ZZ],
763 ba[XX]*c_gpuNumClusterPerCellX*dims.invCellSize[XX],
764 ba[YY]*c_gpuNumClusterPerCellY*dims.invCellSize[YY],
765 dims.atomDensity > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
768 /*! \brief Set non-bonded interaction flags for the current cluster.
770 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
772 static void sort_cluster_on_flag(int numAtomsInCluster,
773 int atomStart,
774 int atomEnd,
775 const int *atinfo,
776 gmx::ArrayRef<int> order,
777 int *flags)
779 constexpr int c_maxNumAtomsInCluster = 8;
780 int sort1[c_maxNumAtomsInCluster];
781 int sort2[c_maxNumAtomsInCluster];
783 GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
785 *flags = 0;
787 int subc = 0;
788 for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
790 /* Make lists for this (sub-)cell of atoms with and without LJ */
791 int n1 = 0;
792 int n2 = 0;
793 gmx_bool haveQ = FALSE;
794 int a_lj_max = -1;
795 for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
797 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
799 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
801 sort1[n1++] = order[a];
802 a_lj_max = a;
804 else
806 sort2[n2++] = order[a];
810 /* If we don't have atoms with LJ, there's nothing to sort */
811 if (n1 > 0)
813 *flags |= NBNXN_CI_DO_LJ(subc);
815 if (2*n1 <= numAtomsInCluster)
817 /* Only sort when strictly necessary. Ordering particles
818 * can lead to less accurate summation
819 * due to rounding, both for LJ and Coulomb interactions.
821 if (2*(a_lj_max - s) >= numAtomsInCluster)
823 for (int i = 0; i < n1; i++)
825 order[atomStart + i] = sort1[i];
827 for (int j = 0; j < n2; j++)
829 order[atomStart + n1 + j] = sort2[j];
833 *flags |= NBNXN_CI_HALF_LJ(subc);
836 if (haveQ)
838 *flags |= NBNXN_CI_DO_COUL(subc);
840 subc++;
844 /*! \brief Fill a pair search cell with atoms.
846 * Potentially sorts atoms and sets the interaction flags.
848 void Grid::fillCell(const GridSetData &gridSetData,
849 nbnxn_atomdata_t *nbat,
850 int atomStart,
851 int atomEnd,
852 const int *atinfo,
853 gmx::ArrayRef<const gmx::RVec> x,
854 BoundingBox gmx_unused *bb_work_aligned)
856 const int numAtoms = atomEnd - atomStart;
858 const gmx::ArrayRef<int> &cells = gridSetData.cells;
859 const gmx::ArrayRef<int> &atomIndices = gridSetData.atomIndices;
861 if (geometry_.isSimple)
863 /* Note that non-local grids are already sorted.
864 * Then sort_cluster_on_flag will only set the flags and the sorting
865 * will not affect the atom order.
867 sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
868 flags_.data() + atomToCluster(atomStart) - cellOffset_);
871 if (gridSetData.haveFep)
873 /* Set the fep flag for perturbed atoms in this (sub-)cell */
875 /* The grid-local cluster/(sub-)cell index */
876 int cell = atomToCluster(atomStart) - cellOffset_*(geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
877 fep_[cell] = 0;
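/* Each bit of fep_[cell] marks whether the corresponding atom in this cluster is perturbed */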
878 for (int at = atomStart; at < atomEnd; at++)
880 if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
882 fep_[cell] |= (1 << (at - atomStart));
887 /* Now we have sorted the atoms, set the cell indices */
888 for (int at = atomStart; at < atomEnd; at++)
890 cells[atomIndices[at]] = at;
893 copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
894 as_rvec_array(x.data()),
895 nbat->XFormat, nbat->x().data(), atomStart);
897 if (nbat->XFormat == nbatX4)
899 /* Store the bounding boxes as xyz.xyz. */
900 size_t offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
901 BoundingBox *bb_ptr = bb_.data() + offset;
903 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
904 if (2*geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
906 calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
907 bbj_.data() + offset*2);
909 else
910 #endif
912 calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
915 else if (nbat->XFormat == nbatX8)
917 /* Store the bounding boxes as xyz.xyz. */
918 size_t offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
919 BoundingBox *bb_ptr = bb_.data() + offset;
921 calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
923 #if NBNXN_BBXXXX
924 else if (!geometry_.isSimple)
926 /* Store the bounding boxes in a format convenient
927 * for SIMD4 calculations: xxxxyyyyzzzz...
929 const int clusterIndex = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log);
930 float *pbb_ptr =
931 pbb_.data() +
932 packedBoundingBoxesIndex(clusterIndex) +
933 (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
935 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
936 if (nbat->XFormat == nbatXYZQ)
938 calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
939 bb_work_aligned, pbb_ptr);
941 else
942 #endif
944 calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
945 pbb_ptr);
947 if (gmx_debug_at)
949 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
950 atomToCluster(atomStart),
951 pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize],
952 pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize],
953 pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]);
956 #endif
957 else
959 /* Store the bounding boxes as xyz.xyz. */
960 BoundingBox *bb_ptr = bb_.data() + atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
962 calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
963 bb_ptr);
965 if (gmx_debug_at)
967 int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
968 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
969 atomToCluster(atomStart),
970 bb_[bbo].lower.x,
971 bb_[bbo].lower.y,
972 bb_[bbo].lower.z,
973 bb_[bbo].upper.x,
974 bb_[bbo].upper.y,
975 bb_[bbo].upper.z);
980 void Grid::sortColumnsCpuGeometry(const GridSetData &gridSetData,
981 int dd_zone,
982 int atomStart, int atomEnd,
983 const int *atinfo,
984 gmx::ArrayRef<const gmx::RVec> x,
985 nbnxn_atomdata_t *nbat,
986 int cxy_start, int cxy_end,
987 gmx::ArrayRef<int> sort_work)
989 if (debug)
991 fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
992 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
995 const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
997 const int numAtomsPerCell = geometry_.numAtomsPerCell;
999 /* Sort the atoms within each x,y column in 3 dimensions */
1000 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1002 const int numAtoms = numAtomsInColumn(cxy);
1003 const int numCellsZ = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1004 const int atomOffset = firstAtomInColumn(cxy);
1006 /* Sort the atoms within each x,y column on z coordinate */
1007 sort_atoms(ZZ, FALSE, dd_zone,
1008 relevantAtomsAreWithinGridBounds,
1009 gridSetData.atomIndices.data() + atomOffset, numAtoms, x,
1010 dimensions_.lowerCorner[ZZ],
1011 1.0/dimensions_.gridSize[ZZ], numCellsZ*numAtomsPerCell,
1012 sort_work);
1014 /* Fill the ncz cells in this column */
1015 const int firstCell = firstCellInColumn(cxy);
1016 int cellFilled = firstCell;
1017 for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1019 const int cell = firstCell + cellZ;
1021 const int atomOffsetCell = atomOffset + cellZ*numAtomsPerCell;
1022 const int numAtomsCell = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1024 fillCell(gridSetData, nbat,
1025 atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x,
1026 nullptr);
1028 /* This copy to bbcz is not really necessary.
1029 * But it allows us to use the same grid search code
1030 * for the simple and supersub cell setups.
1032 if (numAtomsCell > 0)
1034 cellFilled = cell;
1036 bbcz_[cell].lower = bb_[cellFilled].lower.z;
1037 bbcz_[cell].upper = bb_[cellFilled].upper.z;
1040 /* Set the unused atom indices to -1 */
1041 for (int ind = numAtoms; ind < numCellsZ*numAtomsPerCell; ind++)
1043 gridSetData.atomIndices[atomOffset + ind] = -1;
1048 /* Spatially sort the atoms within one grid column */
1049 void Grid::sortColumnsGpuGeometry(const GridSetData &gridSetData,
1050 int dd_zone,
1051 int atomStart, int atomEnd,
1052 const int *atinfo,
1053 gmx::ArrayRef<const gmx::RVec> x,
1054 nbnxn_atomdata_t *nbat,
1055 int cxy_start, int cxy_end,
1056 gmx::ArrayRef<int> sort_work)
1058 BoundingBox bb_work_array[2];
1059 BoundingBox *bb_work_aligned = reinterpret_cast<BoundingBox *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1061 if (debug)
1063 fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1064 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1067 const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1069 const int numAtomsPerCell = geometry_.numAtomsPerCell;
1071 const int subdiv_x = geometry_.numAtomsICluster;
1072 const int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1073 const int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
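/* E.g. for a 2x2x2 cluster grid with an i-cluster size of 8 (an assumption; the
 * actual sizes depend on the pairlist setup): subdiv_x = 8, subdiv_y = 16, subdiv_z = 32.
 */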
1075 /* Extract the atom index array that will be filled here */
1076 const gmx::ArrayRef<int> &atomIndices = gridSetData.atomIndices;
1078 /* Sort the atoms within each x,y column in 3 dimensions.
1079 * Loop over all columns on the x/y grid.
1081 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1083 const int gridX = cxy/dimensions_.numCells[YY];
1084 const int gridY = cxy - gridX*dimensions_.numCells[YY];
1086 const int numAtomsInColumn = cxy_na_[cxy];
1087 const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1088 const int atomOffset = firstAtomInColumn(cxy);
1090 /* Sort the atoms within each x,y column on z coordinate */
1091 sort_atoms(ZZ, FALSE, dd_zone,
1092 relevantAtomsAreWithinGridBounds,
1093 atomIndices.data() + atomOffset, numAtomsInColumn, x,
1094 dimensions_.lowerCorner[ZZ],
1095 1.0/dimensions_.gridSize[ZZ], numCellsInColumn*numAtomsPerCell,
1096 sort_work);
1098 /* This loop goes over the cells and clusters along z at once */
1099 for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1101 const int atomOffsetZ = atomOffset + sub_z*subdiv_z;
1102 const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1103 int cz = -1;
1104 /* We have already sorted on z */
1106 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1108 cz = sub_z/c_gpuNumClusterPerCellZ;
1109 const int cell = cxy_ind_[cxy] + cz;
1111 /* The number of atoms in this cell/super-cluster */
1112 const int numAtoms = std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1114 numClusters_[cell] = std::min(c_gpuNumClusterPerCell,
1115 (numAtoms + geometry_.numAtomsICluster - 1)/ geometry_.numAtomsICluster);
1117 /* Store the z-boundaries of the bounding box of the cell */
1118 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1119 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1122 if (c_gpuNumClusterPerCellY > 1)
1124 /* Sort the atoms along y */
1125 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1126 relevantAtomsAreWithinGridBounds,
1127 atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1128 dimensions_.lowerCorner[YY] + gridY*dimensions_.cellSize[YY],
1129 dimensions_.invCellSize[YY], subdiv_z,
1130 sort_work);
1133 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1135 const int atomOffsetY = atomOffsetZ + sub_y*subdiv_y;
1136 const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1138 if (c_gpuNumClusterPerCellX > 1)
1140 /* Sort the atoms along x */
1141 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1142 relevantAtomsAreWithinGridBounds,
1143 atomIndices.data() + atomOffsetY, numAtomsY, x,
1144 dimensions_.lowerCorner[XX] + gridX*dimensions_.cellSize[XX],
1145 dimensions_.invCellSize[XX], subdiv_y,
1146 sort_work);
1149 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1151 const int atomOffsetX = atomOffsetY + sub_x*subdiv_x;
1152 const int numAtomsX = std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1154 fillCell(gridSetData, nbat,
1155 atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1156 bb_work_aligned);
1161 /* Set the unused atom indices to -1 */
1162 for (int ind = numAtomsInColumn; ind < numCellsInColumn*numAtomsPerCell; ind++)
1164 atomIndices[atomOffset + ind] = -1;
1169 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1170 static void setCellAndAtomCount(gmx::ArrayRef<int> cell,
1171 int cellIndex,
1172 gmx::ArrayRef<int> cxy_na,
1173 int atomIndex)
1175 cell[atomIndex] = cellIndex;
1176 cxy_na[cellIndex] += 1;
1179 void Grid::calcColumnIndices(const Grid::Dimensions &gridDims,
1180 const gmx::UpdateGroupsCog *updateGroupsCog,
1181 const int atomStart,
1182 const int atomEnd,
1183 gmx::ArrayRef<const gmx::RVec> x,
1184 const int dd_zone,
1185 const int *move,
1186 const int thread,
1187 const int nthread,
1188 gmx::ArrayRef<int> cell,
1189 gmx::ArrayRef<int> cxy_na)
1191 const int numColumns = gridDims.numCells[XX]*gridDims.numCells[YY];
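/* Columns are indexed as cx*numCells[YY] + cy; e.g. with a 3x2 cell grid,
 * (cx = 2, cy = 1) maps to column index 5.
 */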
1193 /* We add one extra cell for particles which moved during DD */
1194 for (int i = 0; i < numColumns; i++)
1196 cxy_na[i] = 0;
1199 int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1200 int taskAtomEnd = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1202 if (dd_zone == 0)
1204 /* Home zone */
1205 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1207 if (move == nullptr || move[i] >= 0)
1210 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1212 /* We need to be careful with rounding,
1213 * particles might be a few bits outside the local zone.
1214 * The int cast takes care of the lower bound,
1215 * we will explicitly take care of the upper bound.
1217 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1218 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1220 #ifndef NDEBUG
1221 if (cx < 0 || cx > gridDims.numCells[XX] ||
1222 cy < 0 || cy > gridDims.numCells[YY])
1224 gmx_fatal(FARGS,
1225 "grid cell cx %d cy %d out of range (max %d %d)\n"
1226 "atom %f %f %f, grid->c0 %f %f",
1227 cx, cy,
1228 gridDims.numCells[XX],
1229 gridDims.numCells[YY],
1230 x[i][XX], x[i][YY], x[i][ZZ],
1231 gridDims.lowerCorner[XX],
1232 gridDims.lowerCorner[YY]);
1234 #endif
1235 /* Take care of potential rounding issues */
1236 cx = std::min(cx, gridDims.numCells[XX] - 1);
1237 cy = std::min(cy, gridDims.numCells[YY] - 1);
1239 /* For the moment, cell will contain only the grid-local
1240 * x and y indices, not z.
1242 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1244 else
1246 /* Put this moved particle after the end of the grid,
1247 * so we can process it later without using conditionals.
1249 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1253 else
1255 /* Non-home zone */
1256 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1258 int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1259 int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1261 /* For non-home zones there could be particles outside
1262 * the non-bonded cut-off range, which have been communicated
1263 * for bonded interactions only. For the result it doesn't
1264 * matter where these end up on the grid. For performance
1265 * we put them in an extra row at the border.
1267 cx = std::max(cx, 0);
1268 cx = std::min(cx, gridDims.numCells[XX] - 1);
1269 cy = std::max(cy, 0);
1270 cy = std::min(cy, gridDims.numCells[YY] - 1);
1272 /* For the moment, cell will contain only the grid-local
1273 * x and y indices, not z.
1275 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1280 /*! \brief Resizes grid and atom data which depend on the number of cells */
1281 static void resizeForNumberOfCells(const int numNbnxnAtoms,
1282 const int numAtomsMoved,
1283 GridSetData *gridSetData,
1284 nbnxn_atomdata_t *nbat)
1286 /* Note: gridSetData->cells was already resized before */
1288 /* To avoid conditionals we store the moved particles at the end of the
1289 * atom index array, so make sure we have enough space.
1291 gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1293 /* Make space in nbat for storing the atom coordinates */
1294 nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1297 void Grid::setCellIndices(int ddZone,
1298 int cellOffset,
1299 GridSetData *gridSetData,
1300 gmx::ArrayRef<GridWork> gridWork,
1301 int atomStart,
1302 int atomEnd,
1303 const int *atinfo,
1304 gmx::ArrayRef<const gmx::RVec> x,
1305 const int numAtomsMoved,
1306 nbnxn_atomdata_t *nbat)
1308 cellOffset_ = cellOffset;
1310 const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1312 const int numAtomsPerCell = geometry_.numAtomsPerCell;
1314 /* Make the cell index as a function of x and y */
1315 int ncz_max = 0;
1316 int ncz = 0;
1317 cxy_ind_[0] = 0;
1318 for (int i = 0; i < numColumns() + 1; i++)
1320 /* We set ncz_max at the beginning of the loop instead of at the end,
1321 * so we skip i = numColumns(), which holds moved particles
1322 * that do not need to be ordered on the grid.
1324 if (ncz > ncz_max)
1326 ncz_max = ncz;
1328 int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1329 for (int thread = 1; thread < nthread; thread++)
1331 cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1333 ncz = (cxy_na_i + numAtomsPerCell - 1)/numAtomsPerCell;
1334 if (nbat->XFormat == nbatX8)
1336 /* Make the number of cells a multiple of 2 */
1337 ncz = (ncz + 1) & ~1;
1339 cxy_ind_[i+1] = cxy_ind_[i] + ncz;
1340 /* Clear cxy_na_, so we can reuse the array below */
1341 cxy_na_[i] = 0;
1343 numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
1345 /* Resize grid and atom data which depend on the number of cells */
1346 resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1348 if (debug)
1350 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1351 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_,
1352 dimensions_.numCells[XX], dimensions_.numCells[YY],
1353 numCellsTotal_/(static_cast<double>(numColumns())),
1354 ncz_max);
1355 if (gmx_debug_at)
1357 int i = 0;
1358 for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1360 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1362 fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1363 i++;
1365 fprintf(debug, "\n");
1370 /* Make sure the work array for sorting is large enough */
1371 const int worstCaseSortBufferSize = ncz_max*numAtomsPerCell*c_sortGridMaxSizeFactor;
1372 if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1374 for (GridWork &work : gridWork)
1376 /* Elements not in use should be -1 */
1377 work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1381 /* Now that we know the dimensions we can fill the grid.
1382 * This is the first, unsorted fill. We sort the columns after this.
1384 gmx::ArrayRef<int> cells = gridSetData->cells;
1385 gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1386 for (int i = atomStart; i < atomEnd; i++)
1388 /* At this point cells contains the local grid x,y indices */
1389 const int cxy = cells[i];
1390 atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1393 if (ddZone == 0)
1395 /* Set the cell indices for the moved particles */
1396 int n0 = numCellsTotal_*numAtomsPerCell;
1397 int n1 = numCellsTotal_*numAtomsPerCell + cxy_na_[numColumns()];
1398 if (ddZone == 0)
1400 for (int i = n0; i < n1; i++)
1402 cells[atomIndices[i]] = i;
1407 /* Sort the super-cell columns along z into the sub-cells. */
1408 #pragma omp parallel for num_threads(nthread) schedule(static)
1409 for (int thread = 0; thread < nthread; thread++)
1413 int columnStart = ((thread + 0)*numColumns())/nthread;
1414 int columnEnd = ((thread + 1)*numColumns())/nthread;
1415 if (geometry_.isSimple)
1417 sortColumnsCpuGeometry(*gridSetData, ddZone,
1418 atomStart, atomEnd, atinfo, x, nbat,
1419 columnStart, columnEnd,
1420 gridWork[thread].sortBuffer);
1422 else
1424 sortColumnsGpuGeometry(*gridSetData, ddZone,
1425 atomStart, atomEnd, atinfo, x, nbat,
1426 columnStart, columnEnd,
1427 gridWork[thread].sortBuffer);
1430 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1433 if (geometry_.isSimple && nbat->XFormat == nbatX8)
1435 combine_bounding_box_pairs(*this, bb_, bbj_);
1438 if (!geometry_.isSimple)
1440 numClustersTotal_ = 0;
1441 for (int i = 0; i < numCellsTotal_; i++)
1443 numClustersTotal_ += numClusters_[i];
1447 if (debug)
1449 if (geometry_.isSimple)
1451 print_bbsizes_simple(debug, *this);
1453 else
1455 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1456 numClustersTotal_,
1457 (atomEnd - atomStart)/static_cast<double>(numClustersTotal_));
1459 print_bbsizes_supersub(debug, *this);
1464 } // namespace Nbnxm