Use std::vector in nbnxn_grid_t
[gromacs.git] / src / gromacs / mdlib / nbnxn_grid.cpp
blob 81623c07456cdb42e412ccaff32ea65b7aa9c399
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include <string.h>
42 #include <cmath>
44 #include <algorithm>
46 #include "gromacs/domdec/domdec_struct.h"
47 #include "gromacs/math/utilities.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/mdlib/gmx_omp_nthreads.h"
50 #include "gromacs/mdlib/nb_verlet.h"
51 #include "gromacs/mdlib/nbnxn_atomdata.h"
52 #include "gromacs/mdlib/nbnxn_consts.h"
53 #include "gromacs/mdlib/nbnxn_internal.h"
54 #include "gromacs/mdlib/nbnxn_search.h"
55 #include "gromacs/mdlib/nbnxn_util.h"
56 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/simd/vector_operations.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/smalloc.h"
62 struct gmx_domdec_zones_t;
64 static real grid_atom_density(int numAtoms,
65 const rvec lowerCorner,
66 const rvec upperCorner)
68 rvec size;
70 if (numAtoms == 0)
72 /* To avoid zero density we use a minimum of 1 atom */
73 numAtoms = 1;
76 rvec_sub(upperCorner, lowerCorner, size);
78 return numAtoms/(size[XX]*size[YY]*size[ZZ]);
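/* Example for grid_atom_density above (illustration only): 3000 atoms in a
 * 3 nm x 3 nm x 3.33 nm zone give a density of roughly 100 atoms/nm^3,
 * which is about the atom density of liquid water.
 */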
81 static int set_grid_size_xy(const nbnxn_search_t nbs,
82 nbnxn_grid_t *grid,
83 int ddZone,
84 int numAtoms,
85 const rvec lowerCorner,
86 const rvec upperCorner,
87 real atomDensity)
89 rvec size;
90 real tlen, tlen_x, tlen_y;
92 rvec_sub(upperCorner, lowerCorner, size);
94 if (numAtoms > grid->na_sc)
96 GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
98 /* target cell length */
99 if (grid->bSimple)
101 /* To minimize the zero interactions, we should make
102 * the largest of the i/j cells cubic.
104 int numAtomsInCell = std::max(grid->na_c, grid->na_cj);
106 /* Approximately cubic cells */
107 tlen = std::cbrt(numAtomsInCell/atomDensity);
108 tlen_x = tlen;
109 tlen_y = tlen;
111 else
113 /* Approximately cubic sub cells */
114 tlen = std::cbrt(grid->na_c/atomDensity);
115 tlen_x = tlen*c_gpuNumClusterPerCellX;
116 tlen_y = tlen*c_gpuNumClusterPerCellY;
118 /* We round ncx and ncy down, because we get fewer cell pairs
119 * in the pair list when the fixed cell dimensions (x,y) are
120 * larger than the variable one (z) than the other way around.
122 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
123 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
125 else
127 grid->ncx = 1;
128 grid->ncy = 1;
131 grid->sx = size[XX]/grid->ncx;
132 grid->sy = size[YY]/grid->ncy;
133 grid->inv_sx = 1/grid->sx;
134 grid->inv_sy = 1/grid->sy;
136 if (ddZone > 0)
138 /* This is a non-home zone, add an extra row of cells
139 * for particles communicated for bonded interactions.
140 * These can be beyond the cut-off. It doesn't matter where
141 * they end up on the grid, but for performance it's better
142 * if they don't end up in cells that can be within cut-off range.
144 grid->ncx++;
145 grid->ncy++;
148 /* We need one additional cell entry for particles moved by DD */
149 grid->cxy_na.resize(grid->ncx*grid->ncy + 1);
150 grid->cxy_ind.resize(grid->ncx*grid->ncy + 2);
152 for (int t = 0; t < nbs->nthread_max; t++)
154 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
156 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
157 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
161 /* Worst case scenario of 1 atom in each last cell */
162 int maxNumCells;
163 if (grid->na_cj <= grid->na_c)
165 maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy;
167 else
169 maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
172 grid->nsubc.resize(maxNumCells);
173 grid->bbcz.resize(maxNumCells*NNBSBB_D);
175 /* This resize also zeros the contents, which avoids possible
176 * floating-point exceptions in SIMD with the unused bb elements.
178 if (grid->bSimple)
180 grid->bb.resize(maxNumCells);
182 else
184 #if NBNXN_BBXXXX
185 grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
186 #else
187 grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
188 #endif
191 if (grid->bSimple)
193 if (grid->na_cj == grid->na_c)
195 grid->bbj = grid->bb;
197 else
199 grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
200 grid->bbj = grid->bbjStorage;
204 grid->flags.resize(maxNumCells);
205 if (nbs->bFEP)
207 grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
210 copy_rvec(lowerCorner, grid->c0);
211 copy_rvec(upperCorner, grid->c1);
212 copy_rvec(size, grid->size);
214 return maxNumCells;
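/* Sizing sketch for set_grid_size_xy above (illustrative numbers only, not
 * used by the code): with na_c = na_cj = 4 atoms per cluster and an atom
 * density of 100 nm^-3, tlen = cbrt(4/100) is about 0.34 nm, so a
 * 3 nm x 3 nm extent in x,y is rounded down to ncx = ncy = 8 columns.
 */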
217 /* We need to sort particles in grid columns on z-coordinate.
218 * As particles are very often distributed homogeneously, we use a sorting
219 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
220 * by a factor, cast to an int and try to store in that hole. If the hole
221 * is full, we move this or another particle. A second pass is needed to make
222 * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
223 * 4 is the optimal value for a homogeneous particle distribution and allows
224 * for an O(#particles) sort up to distributions where all particles are
225 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
226 * as it can be expensive to detect inhomogeneous particle distributions.
227 * SGSF is the maximum ratio of holes used; in the worst case all particles
228 * end up in the last hole and we need #particles extra holes at the end.
230 #define SORT_GRID_OVERSIZE 4
231 #define SGSF (SORT_GRID_OVERSIZE + 1)
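/* Illustrative sizing example (not part of the algorithm itself): with
 * n = n_per_h = 16 particles in a column and SORT_GRID_OVERSIZE = 4 there
 * are 64 primary holes; in the worst case all 16 particles collide into the
 * last hole, so the sort work array needs n_per_h*SORT_GRID_OVERSIZE + n,
 * i.e. n*SGSF = 80, slots.
 */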
233 /* Sort particle index a on coordinates x along dim.
234 * Backwards tells if we want decreasing instead of increasing coordinates.
235 * h0 is the minimum of the coordinate range.
236 * invh is the 1/length of the sorting range.
237 * n_per_h (>=n) is the expected average number of particles per 1/invh
238 * sort is the sorting work array.
239 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
240 * or easier, allocate at least n*SGSF elements.
242 static void sort_atoms(int dim, gmx_bool Backwards,
243 int gmx_unused dd_zone,
244 int *a, int n, const rvec *x,
245 real h0, real invh, int n_per_h,
246 int *sort)
248 if (n <= 1)
250 /* Nothing to do */
251 return;
254 #ifndef NDEBUG
255 if (n > n_per_h)
257 gmx_incons("n > n_per_h");
259 #endif
261 /* Transform the inverse range height into the inverse hole height */
262 invh *= n_per_h*SORT_GRID_OVERSIZE;
264 /* Set nsort to the maximum possible number of holes used.
265 * In the worst case all n elements end up in the last bin.
267 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
269 /* Determine the index range used, so we can limit it for the second pass */
270 int zi_min = INT_MAX;
271 int zi_max = -1;
273 /* Sort the particles using a simple index sort */
274 for (int i = 0; i < n; i++)
276 /* The cast takes care of floating-point rounding effects below zero.
277 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
278 * times the box height out of the box.
280 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
282 #ifndef NDEBUG
283 /* As we can have rounding effects, we use > instead of >= here */
284 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
286 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
287 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
288 n_per_h, SORT_GRID_OVERSIZE);
290 #endif
292 /* In a non-local domain, particles communicated for bonded interactions
293 * can be far beyond the grid size, which is set by the non-bonded
294 * cut-off distance. We sort such particles into the last cell.
296 if (zi > n_per_h*SORT_GRID_OVERSIZE)
298 zi = n_per_h*SORT_GRID_OVERSIZE;
301 /* Ideally this particle should go in sort cell zi,
302 * but that might already be in use,
303 * in that case find the first empty cell higher up
305 if (sort[zi] < 0)
307 sort[zi] = a[i];
308 zi_min = std::min(zi_min, zi);
309 zi_max = std::max(zi_max, zi);
311 else
313 /* We have multiple atoms in the same sorting slot.
314 * Sort on real z for minimal bounding box size.
315 * There is an extra check for identical z to ensure
316 * well-defined output order, independent of input order,
317 * which gives binary reproducibility after restarts.
319 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
320 (x[a[i]][dim] == x[sort[zi]][dim] &&
321 a[i] > sort[zi])))
323 zi++;
326 if (sort[zi] >= 0)
328 /* Shift all elements by one slot until we find an empty slot */
329 int cp = sort[zi];
330 int zim = zi + 1;
331 while (sort[zim] >= 0)
333 int tmp = sort[zim];
334 sort[zim] = cp;
335 cp = tmp;
336 zim++;
338 sort[zim] = cp;
339 zi_max = std::max(zi_max, zim);
341 sort[zi] = a[i];
342 zi_max = std::max(zi_max, zi);
346 int c = 0;
347 if (!Backwards)
349 for (int zi = 0; zi < nsort; zi++)
351 if (sort[zi] >= 0)
353 a[c++] = sort[zi];
354 sort[zi] = -1;
358 else
360 for (int zi = zi_max; zi >= zi_min; zi--)
362 if (sort[zi] >= 0)
364 a[c++] = sort[zi];
365 sort[zi] = -1;
369 if (c < n)
371 gmx_incons("Lost particles while sorting");
375 #if GMX_DOUBLE
376 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
377 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
378 #else
379 #define R2F_D(x) (x)
380 #define R2F_U(x) (x)
381 #endif
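/* Intent of R2F_D/R2F_U, illustrated: in double precision builds the bounding
 * boxes are stored in single precision, so lower bounds are scaled down and
 * upper bounds up by one float epsilon, i.e. R2F_D(x) <= x <= R2F_U(x), which
 * keeps the float boxes conservative with respect to the double coordinates.
 */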
383 /* Coordinate order x,y,z, bb order xyz0 */
384 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
386 int i;
387 real xl, xh, yl, yh, zl, zh;
389 i = 0;
390 xl = x[i+XX];
391 xh = x[i+XX];
392 yl = x[i+YY];
393 yh = x[i+YY];
394 zl = x[i+ZZ];
395 zh = x[i+ZZ];
396 i += stride;
397 for (int j = 1; j < na; j++)
399 xl = std::min(xl, x[i+XX]);
400 xh = std::max(xh, x[i+XX]);
401 yl = std::min(yl, x[i+YY]);
402 yh = std::max(yh, x[i+YY]);
403 zl = std::min(zl, x[i+ZZ]);
404 zh = std::max(zh, x[i+ZZ]);
405 i += stride;
407 /* Note: possible double to float conversion here */
408 bb->lower[BB_X] = R2F_D(xl);
409 bb->lower[BB_Y] = R2F_D(yl);
410 bb->lower[BB_Z] = R2F_D(zl);
411 bb->upper[BB_X] = R2F_U(xh);
412 bb->upper[BB_Y] = R2F_U(yh);
413 bb->upper[BB_Z] = R2F_U(zh);
416 /* Packed coordinates, bb order xyz0 */
417 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
419 real xl, xh, yl, yh, zl, zh;
421 xl = x[XX*c_packX4];
422 xh = x[XX*c_packX4];
423 yl = x[YY*c_packX4];
424 yh = x[YY*c_packX4];
425 zl = x[ZZ*c_packX4];
426 zh = x[ZZ*c_packX4];
427 for (int j = 1; j < na; j++)
429 xl = std::min(xl, x[j+XX*c_packX4]);
430 xh = std::max(xh, x[j+XX*c_packX4]);
431 yl = std::min(yl, x[j+YY*c_packX4]);
432 yh = std::max(yh, x[j+YY*c_packX4]);
433 zl = std::min(zl, x[j+ZZ*c_packX4]);
434 zh = std::max(zh, x[j+ZZ*c_packX4]);
436 /* Note: possible double to float conversion here */
437 bb->lower[BB_X] = R2F_D(xl);
438 bb->lower[BB_Y] = R2F_D(yl);
439 bb->lower[BB_Z] = R2F_D(zl);
440 bb->upper[BB_X] = R2F_U(xh);
441 bb->upper[BB_Y] = R2F_U(yh);
442 bb->upper[BB_Z] = R2F_U(zh);
445 /* Packed coordinates, bb order xyz0 */
446 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
448 real xl, xh, yl, yh, zl, zh;
450 xl = x[XX*c_packX8];
451 xh = x[XX*c_packX8];
452 yl = x[YY*c_packX8];
453 yh = x[YY*c_packX8];
454 zl = x[ZZ*c_packX8];
455 zh = x[ZZ*c_packX8];
456 for (int j = 1; j < na; j++)
458 xl = std::min(xl, x[j+XX*c_packX8]);
459 xh = std::max(xh, x[j+XX*c_packX8]);
460 yl = std::min(yl, x[j+YY*c_packX8]);
461 yh = std::max(yh, x[j+YY*c_packX8]);
462 zl = std::min(zl, x[j+ZZ*c_packX8]);
463 zh = std::max(zh, x[j+ZZ*c_packX8]);
465 /* Note: possible double to float conversion here */
466 bb->lower[BB_X] = R2F_D(xl);
467 bb->lower[BB_Y] = R2F_D(yl);
468 bb->lower[BB_Z] = R2F_D(zl);
469 bb->upper[BB_X] = R2F_U(xh);
470 bb->upper[BB_Y] = R2F_U(yh);
471 bb->upper[BB_Z] = R2F_U(zh);
474 /* Packed coordinates, bb order xyz0 */
475 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
476 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
478 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
479 using namespace gmx;
481 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
483 if (na > 2)
485 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
487 else
489 /* Set the "empty" bounding box to the same as the first one,
490 * so we don't need to treat special cases in the rest of the code.
492 #if NBNXN_SEARCH_BB_SIMD4
493 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
494 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
495 #else
496 bbj[1] = bbj[0];
497 #endif
500 #if NBNXN_SEARCH_BB_SIMD4
501 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
502 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
503 #else
505 int i;
507 for (i = 0; i < NNBSBB_C; i++)
509 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
510 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
513 #endif
516 #if NBNXN_SEARCH_BB_SIMD4
518 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
519 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
521 int i;
522 real xl, xh, yl, yh, zl, zh;
524 i = 0;
525 xl = x[i+XX];
526 xh = x[i+XX];
527 yl = x[i+YY];
528 yh = x[i+YY];
529 zl = x[i+ZZ];
530 zh = x[i+ZZ];
531 i += stride;
532 for (int j = 1; j < na; j++)
534 xl = std::min(xl, x[i+XX]);
535 xh = std::max(xh, x[i+XX]);
536 yl = std::min(yl, x[i+YY]);
537 yh = std::max(yh, x[i+YY]);
538 zl = std::min(zl, x[i+ZZ]);
539 zh = std::max(zh, x[i+ZZ]);
540 i += stride;
542 /* Note: possible double to float conversion here */
543 bb[0*STRIDE_PBB] = R2F_D(xl);
544 bb[1*STRIDE_PBB] = R2F_D(yl);
545 bb[2*STRIDE_PBB] = R2F_D(zl);
546 bb[3*STRIDE_PBB] = R2F_U(xh);
547 bb[4*STRIDE_PBB] = R2F_U(yh);
548 bb[5*STRIDE_PBB] = R2F_U(zh);
551 #endif /* NBNXN_SEARCH_BB_SIMD4 */
553 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
555 /* Coordinate order xyz?, bb order xyz0 */
556 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
558 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
559 using namespace gmx;
561 Simd4Float bb_0_S, bb_1_S;
562 Simd4Float x_S;
564 bb_0_S = load4(x);
565 bb_1_S = bb_0_S;
567 for (int i = 1; i < na; i++)
569 x_S = load4(x+i*NNBSBB_C);
570 bb_0_S = min(bb_0_S, x_S);
571 bb_1_S = max(bb_1_S, x_S);
574 store4(&bb->lower[0], bb_0_S);
575 store4(&bb->upper[0], bb_1_S);
578 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
579 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
580 nbnxn_bb_t *bb_work_aligned,
581 real *bb)
583 calc_bounding_box_simd4(na, x, bb_work_aligned);
585 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
586 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
587 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
588 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
589 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
590 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
593 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
596 /* Combines pairs of consecutive bounding boxes */
597 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,
598 gmx::ArrayRef<const nbnxn_bb_t> bb)
600 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
601 using namespace gmx;
603 for (int i = 0; i < grid->ncx*grid->ncy; i++)
605 /* Starting bb in a column is expected to be 2-aligned */
606 int sc2 = grid->cxy_ind[i]>>1;
607 /* For odd numbers skip the last bb here */
608 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
609 int c2;
610 for (c2 = sc2; c2 < sc2+nc2; c2++)
612 #if NBNXN_SEARCH_BB_SIMD4
613 Simd4Float min_S, max_S;
615 min_S = min(load4(&bb[c2*2+0].lower[0]),
616 load4(&bb[c2*2+1].lower[0]));
617 max_S = max(load4(&bb[c2*2+0].upper[0]),
618 load4(&bb[c2*2+1].upper[0]));
619 store4(&grid->bbj[c2].lower[0], min_S);
620 store4(&grid->bbj[c2].upper[0], max_S);
621 #else
622 for (int j = 0; j < NNBSBB_C; j++)
624 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
625 bb[c2*2+1].lower[j]);
626 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
627 bb[c2*2+1].upper[j]);
629 #endif
631 if (((grid->cxy_na[i]+3)>>2) & 1)
633 /* The bb count in this column is odd: duplicate the last bb */
634 for (int j = 0; j < NNBSBB_C; j++)
636 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
637 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
644 /* Prints the average bb size, used for debug output */
645 static void print_bbsizes_simple(FILE *fp,
646 const nbnxn_grid_t *grid)
648 dvec ba;
650 clear_dvec(ba);
651 for (int c = 0; c < grid->nc; c++)
653 for (int d = 0; d < DIM; d++)
655 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
658 dsvmul(1.0/grid->nc, ba, ba);
660 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
661 grid->sx,
662 grid->sy,
663 grid->atom_density > 0 ?
664 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
665 ba[XX], ba[YY], ba[ZZ],
666 ba[XX]/grid->sx,
667 ba[YY]/grid->sy,
668 grid->atom_density > 0 ?
669 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
672 /* Prints the average bb size, used for debug output */
673 static void print_bbsizes_supersub(FILE *fp,
674 const nbnxn_grid_t *grid)
676 int ns;
677 dvec ba;
679 clear_dvec(ba);
680 ns = 0;
681 for (int c = 0; c < grid->nc; c++)
683 #if NBNXN_BBXXXX
684 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
686 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
687 for (int i = 0; i < STRIDE_PBB; i++)
689 for (int d = 0; d < DIM; d++)
691 ba[d] +=
692 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
693 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
697 #else
698 for (int s = 0; s < grid->nsubc[c]; s++)
700 int cs = c*c_gpuNumClusterPerCell + s;
701 for (int d = 0; d < DIM; d++)
703 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
706 #endif
707 ns += grid->nsubc[c];
709 dsvmul(1.0/ns, ba, ba);
711 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
712 grid->sx/c_gpuNumClusterPerCellX,
713 grid->sy/c_gpuNumClusterPerCellY,
714 grid->atom_density > 0 ?
715 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
716 ba[XX], ba[YY], ba[ZZ],
717 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
718 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
719 grid->atom_density > 0 ?
720 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
723 /* Set non-bonded interaction flags for the current cluster.
724 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
726 static void sort_cluster_on_flag(int numAtomsInCluster,
727 int atomStart,
728 int atomEnd,
729 const int *atinfo,
730 int *order,
731 int *flags)
733 constexpr int c_maxNumAtomsInCluster = 8;
734 int sort1[c_maxNumAtomsInCluster];
735 int sort2[c_maxNumAtomsInCluster];
737 GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
739 *flags = 0;
741 int subc = 0;
742 for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
744 /* Make lists for this (sub-)cell on atoms with and without LJ */
745 int n1 = 0;
746 int n2 = 0;
747 gmx_bool haveQ = FALSE;
748 int a_lj_max = -1;
749 for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
751 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
753 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
755 sort1[n1++] = order[a];
756 a_lj_max = a;
758 else
760 sort2[n2++] = order[a];
764 /* If we don't have atoms with LJ, there's nothing to sort */
765 if (n1 > 0)
767 *flags |= NBNXN_CI_DO_LJ(subc);
769 if (2*n1 <= numAtomsInCluster)
771 /* Only sort when strictly necessary. Ordering particles
772 * can lead to less accurate summation due to rounding,
773 * both for LJ and Coulomb interactions.
775 if (2*(a_lj_max - s) >= numAtomsInCluster)
777 for (int i = 0; i < n1; i++)
779 order[atomStart + i] = sort1[i];
781 for (int j = 0; j < n2; j++)
783 order[atomStart + n1 + j] = sort2[j];
787 *flags |= NBNXN_CI_HALF_LJ(subc);
790 if (haveQ)
792 *flags |= NBNXN_CI_DO_COUL(subc);
794 subc++;
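/* Flag semantics sketch (descriptive only): NBNXN_CI_DO_LJ marks clusters with
 * at least one LJ atom and NBNXN_CI_DO_COUL those with at least one charged
 * atom. When at most half of the atoms have LJ, the LJ atoms are packed to the
 * front (when not already there) and NBNXN_CI_HALF_LJ is set, so kernels can
 * skip LJ work for the second half of the i-atoms.
 */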
798 /* Fill a pair search cell with atoms.
799 * Potentially sorts atoms and sets the interaction flags.
801 static void fill_cell(const nbnxn_search_t nbs,
802 nbnxn_grid_t *grid,
803 nbnxn_atomdata_t *nbat,
804 int atomStart,
805 int atomEnd,
806 const int *atinfo,
807 const rvec *x,
808 nbnxn_bb_t gmx_unused *bb_work_aligned)
810 const int numAtoms = atomEnd - atomStart;
812 if (grid->bSimple)
814 /* Note that non-local grids are already sorted.
815 * Then sort_cluster_on_flag will only set the flags and the sorting
816 * will not affect the atom order.
818 sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
819 grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
822 if (nbs->bFEP)
824 /* Set the fep flag for perturbed atoms in this (sub-)cell */
826 /* The grid-local cluster/(sub-)cell index */
827 int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
828 grid->fep[cell] = 0;
829 for (int at = atomStart; at < atomEnd; at++)
831 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
833 grid->fep[cell] |= (1 << (at - atomStart));
838 /* Now that we have sorted the atoms, set the cell indices */
839 for (int at = atomStart; at < atomEnd; at++)
841 nbs->cell[nbs->a[at]] = at;
844 copy_rvec_to_nbat_real(nbs->a + atomStart, numAtoms, grid->na_c, x,
845 nbat->XFormat, nbat->x, atomStart);
847 if (nbat->XFormat == nbatX4)
849 /* Store the bounding boxes as xyz.xyz. */
850 size_t offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
851 nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
853 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
854 if (2*grid->na_cj == grid->na_c)
856 calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
857 grid->bbj.data() + offset*2);
859 else
860 #endif
862 calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
865 else if (nbat->XFormat == nbatX8)
867 /* Store the bounding boxes as xyz.xyz. */
868 size_t offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
869 nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
871 calc_bounding_box_x_x8(numAtoms, nbat->x + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
873 #if NBNXN_BBXXXX
874 else if (!grid->bSimple)
876 /* Store the bounding boxes in a format convenient
877 * for SIMD4 calculations: xxxxyyyyzzzz...
879 float *pbb_ptr =
880 grid->pbb.data() +
881 ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
882 (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));
884 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
885 if (nbat->XFormat == nbatXYZQ)
887 calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
888 bb_work_aligned, pbb_ptr);
890 else
891 #endif
893 calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
894 pbb_ptr);
896 if (gmx_debug_at)
898 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
899 atomStart >> grid->na_c_2log,
900 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
901 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
902 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
905 #endif
906 else
908 /* Store the bounding boxes as xyz.xyz. */
909 nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);
911 calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
912 bb_ptr);
914 if (gmx_debug_at)
916 int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
917 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
918 atomStart >> grid->na_c_2log,
919 grid->bb[bbo].lower[BB_X],
920 grid->bb[bbo].lower[BB_Y],
921 grid->bb[bbo].lower[BB_Z],
922 grid->bb[bbo].upper[BB_X],
923 grid->bb[bbo].upper[BB_Y],
924 grid->bb[bbo].upper[BB_Z]);
929 /* Spatially sort the atoms within each grid column in the given range */
930 static void sort_columns_simple(const nbnxn_search_t nbs,
931 int dd_zone,
932 nbnxn_grid_t *grid,
933 int atomStart, int atomEnd,
934 const int *atinfo,
935 const rvec *x,
936 nbnxn_atomdata_t *nbat,
937 int cxy_start, int cxy_end,
938 int *sort_work)
940 if (debug)
942 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
943 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
946 /* Sort the atoms within each x,y column in 3 dimensions */
947 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
949 int na = grid->cxy_na[cxy];
950 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
951 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
953 /* Sort the atoms within each x,y column on z coordinate */
954 sort_atoms(ZZ, FALSE, dd_zone,
955 nbs->a+ash, na, x,
956 grid->c0[ZZ],
957 1.0/grid->size[ZZ], ncz*grid->na_sc,
958 sort_work);
960 /* Fill the ncz cells in this column */
961 int cfilled = grid->cxy_ind[cxy];
962 for (int cz = 0; cz < ncz; cz++)
964 int c = grid->cxy_ind[cxy] + cz;
966 int ash_c = ash + cz*grid->na_sc;
967 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
969 fill_cell(nbs, grid, nbat,
970 ash_c, ash_c+na_c, atinfo, x,
971 nullptr);
973 /* This copy to bbcz is not really necessary.
974 * But it allows us to use the same grid search code
975 * for the simple and supersub cell setups.
977 if (na_c > 0)
979 cfilled = c;
981 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
982 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
985 /* Set the unused atom indices to -1 */
986 for (int ind = na; ind < ncz*grid->na_sc; ind++)
988 nbs->a[ash+ind] = -1;
993 /* Spatially sort the atoms within each grid column in the given range */
994 static void sort_columns_supersub(const nbnxn_search_t nbs,
995 int dd_zone,
996 nbnxn_grid_t *grid,
997 int atomStart, int atomEnd,
998 const int *atinfo,
999 const rvec *x,
1000 nbnxn_atomdata_t *nbat,
1001 int cxy_start, int cxy_end,
1002 int *sort_work)
1004 nbnxn_bb_t bb_work_array[2];
1005 nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1007 if (debug)
1009 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1010 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
1013 int subdiv_x = grid->na_c;
1014 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1015 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1017 /* Sort the atoms within each x,y column in 3 dimensions.
1018 * Loop over all columns on the x/y grid.
1020 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1022 int gridX = cxy/grid->ncy;
1023 int gridY = cxy - gridX*grid->ncy;
1025 int numAtomsInColumn = grid->cxy_na[cxy];
1026 int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
1027 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1029 /* Sort the atoms within each x,y column on z coordinate */
1030 sort_atoms(ZZ, FALSE, dd_zone,
1031 nbs->a + ash, numAtomsInColumn, x,
1032 grid->c0[ZZ],
1033 1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
1034 sort_work);
1036 /* This loop goes over the cells and clusters along z at once */
1037 for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1039 int ash_z = ash + sub_z*subdiv_z;
1040 int na_z = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
1041 int cz = -1;
1042 /* We have already sorted on z */
1044 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1046 cz = sub_z/c_gpuNumClusterPerCellZ;
1047 int cell = grid->cxy_ind[cxy] + cz;
1049 /* The number of atoms in this cell/super-cluster */
1050 int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));
1052 grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
1053 (numAtoms + grid->na_c - 1)/grid->na_c);
1055 /* Store the z-boundaries of the bounding box of the cell */
1056 grid->bbcz[cell*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1057 grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
1060 if (c_gpuNumClusterPerCellY > 1)
1062 /* Sort the atoms along y */
1063 sort_atoms(YY, (sub_z & 1), dd_zone,
1064 nbs->a + ash_z, na_z, x,
1065 grid->c0[YY] + gridY*grid->sy,
1066 grid->inv_sy, subdiv_z,
1067 sort_work);
1070 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1072 int ash_y = ash_z + sub_y*subdiv_y;
1073 int na_y = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));
1075 if (c_gpuNumClusterPerCellX > 1)
1077 /* Sort the atoms along x */
1078 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1079 nbs->a + ash_y, na_y, x,
1080 grid->c0[XX] + gridX*grid->sx,
1081 grid->inv_sx, subdiv_y,
1082 sort_work);
1085 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1087 int ash_x = ash_y + sub_x*subdiv_x;
1088 int na_x = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));
1090 fill_cell(nbs, grid, nbat,
1091 ash_x, ash_x + na_x, atinfo, x,
1092 bb_work_aligned);
1097 /* Set the unused atom indices to -1 */
1098 for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
1100 nbs->a[ash + ind] = -1;
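/* Subdivision sketch for sort_columns_supersub above (example numbers only):
 * with na_c = 8 and 2x2x2 clusters per cell, subdiv_x = 8, subdiv_y = 16 and
 * subdiv_z = 32, so each z-slab of up to 32 atoms is split along y and then x
 * into four 8-atom clusters, and two such slabs make one 64-atom super-cell.
 */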
1105 /* Determine in which grid column atoms should go */
1106 static void calc_column_indices(nbnxn_grid_t *grid,
1107 int atomStart, int atomEnd,
1108 const rvec *x,
1109 int dd_zone, const int *move,
1110 int thread, int nthread,
1111 int *cell,
1112 int *cxy_na)
1114 /* We add one extra cell for particles which moved during DD */
1115 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1117 cxy_na[i] = 0;
1120 int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1121 int taskAtomEnd = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1122 if (dd_zone == 0)
1124 /* Home zone */
1125 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1127 if (move == nullptr || move[i] >= 0)
1129 /* We need to be careful with rounding,
1130 * particles might be a few bits outside the local zone.
1131 * The int cast takes care of the lower bound,
1132 * we will explicitly take care of the upper bound.
1134 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1135 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1137 #ifndef NDEBUG
1138 if (cx < 0 || cx > grid->ncx ||
1139 cy < 0 || cy > grid->ncy)
1141 gmx_fatal(FARGS,
1142 "grid cell cx %d cy %d out of range (max %d %d)\n"
1143 "atom %f %f %f, grid->c0 %f %f",
1144 cx, cy, grid->ncx, grid->ncy,
1145 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1147 #endif
1148 /* Take care of potential rounding issues */
1149 cx = std::min(cx, grid->ncx - 1);
1150 cy = std::min(cy, grid->ncy - 1);
1152 /* For the moment, cell will contain only the grid-local
1153 * x and y indices, not z.
1155 cell[i] = cx*grid->ncy + cy;
1157 else
1159 /* Put this moved particle after the end of the grid,
1160 * so we can process it later without using conditionals.
1162 cell[i] = grid->ncx*grid->ncy;
1165 cxy_na[cell[i]]++;
1168 else
1170 /* Non-home zone */
1171 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1173 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1174 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1176 /* For non-home zones there could be particles outside
1177 * the non-bonded cut-off range, which have been communicated
1178 * for bonded interactions only. For the result it doesn't
1179 * matter where these end up on the grid. For performance
1180 * we put them in an extra row at the border.
1182 cx = std::max(cx, 0);
1183 cx = std::min(cx, grid->ncx - 1);
1184 cy = std::max(cy, 0);
1185 cy = std::min(cy, grid->ncy - 1);
1187 /* For the moment, cell will contain only the grid-local
1188 * x and y indices, not z.
1190 cell[i] = cx*grid->ncy + cy;
1192 cxy_na[cell[i]]++;
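/* Column index sketch for calc_column_indices above (illustration only): with
 * grid->ncx = 3 and grid->ncy = 2 the columns are numbered 0..5 as
 * cx*ncy + cy, and index 6 (= ncx*ncy) is the extra slot that collects
 * particles that moved to another domain during DD.
 */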
1197 /* Determine in which grid cells the atoms should go */
1198 static void calc_cell_indices(const nbnxn_search_t nbs,
1199 int ddZone,
1200 nbnxn_grid_t *grid,
1201 int atomStart,
1202 int atomEnd,
1203 const int *atinfo,
1204 const rvec *x,
1205 const int *move,
1206 nbnxn_atomdata_t *nbat)
1208 const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1210 #pragma omp parallel for num_threads(nthread) schedule(static)
1211 for (int thread = 0; thread < nthread; thread++)
1215 calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
1216 nbs->cell, nbs->work[thread].cxy_na);
1218 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1221 /* Make the cell index as a function of x and y */
1222 int ncz_max = 0;
1223 int ncz = 0;
1224 grid->cxy_ind[0] = 0;
1225 for (int i = 0; i < grid->ncx*grid->ncy + 1; i++)
1227 /* We set ncz_max at the beginning of the loop instead of at the end
1228 * to skip i=grid->ncx*grid->ncy, which holds the moved particles
1229 * that do not need to be ordered on the grid.
1231 if (ncz > ncz_max)
1233 ncz_max = ncz;
1235 int cxy_na_i = nbs->work[0].cxy_na[i];
1236 for (int thread = 1; thread < nthread; thread++)
1238 cxy_na_i += nbs->work[thread].cxy_na[i];
1240 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1241 if (nbat->XFormat == nbatX8)
1243 /* Make the number of cells a multiple of 2 */
1244 ncz = (ncz + 1) & ~1;
1246 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1247 /* Clear cxy_na, so we can reuse the array below */
1248 grid->cxy_na[i] = 0;
1250 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1252 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1254 if (debug)
1256 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1257 grid->na_sc, grid->na_c, grid->nc,
1258 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1259 ncz_max);
1260 if (gmx_debug_at)
1262 int i = 0;
1263 for (int cy = 0; cy < grid->ncy; cy++)
1265 for (int cx = 0; cx < grid->ncx; cx++)
1267 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1268 i++;
1270 fprintf(debug, "\n");
1275 /* Make sure the work array for sorting is large enough */
1276 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1278 for (int thread = 0; thread < nbs->nthread_max; thread++)
1280 nbs->work[thread].sort_work_nalloc =
1281 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1282 srenew(nbs->work[thread].sort_work,
1283 nbs->work[thread].sort_work_nalloc);
1284 /* When not in use, all elements should be -1 */
1285 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1287 nbs->work[thread].sort_work[i] = -1;
1292 /* Now that we know the dimensions, we can fill the grid.
1293 * This is the first, unsorted fill. We sort the columns after this.
1295 for (int i = atomStart; i < atomEnd; i++)
1297 /* At this point nbs->cell contains the local grid x,y indices */
1298 int cxy = nbs->cell[i];
1299 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1302 if (ddZone == 0)
1304 /* Set the cell indices for the moved particles */
1305 int n0 = grid->nc*grid->na_sc;
1306 int n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1307 if (ddZone == 0)
1309 for (int i = n0; i < n1; i++)
1311 nbs->cell[nbs->a[i]] = i;
1316 /* Sort the super-cell columns along z into the sub-cells. */
1317 #pragma omp parallel for num_threads(nthread) schedule(static)
1318 for (int thread = 0; thread < nthread; thread++)
1322 if (grid->bSimple)
1324 sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1325 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1326 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1327 nbs->work[thread].sort_work);
1329 else
1331 sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1332 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1333 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1334 nbs->work[thread].sort_work);
1337 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1340 if (grid->bSimple && nbat->XFormat == nbatX8)
1342 combine_bounding_box_pairs(grid, grid->bb);
1345 if (!grid->bSimple)
1347 grid->nsubc_tot = 0;
1348 for (int i = 0; i < grid->nc; i++)
1350 grid->nsubc_tot += grid->nsubc[i];
1354 if (debug)
1356 if (grid->bSimple)
1358 print_bbsizes_simple(debug, grid);
1360 else
1362 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1363 grid->nsubc_tot, (atomEnd - atomStart)/(double)grid->nsubc_tot);
1365 print_bbsizes_supersub(debug, grid);
1370 /* Sets up a grid and puts the atoms on the grid.
1371 * This function only operates on one domain of the domain decomposition.
1372 * Note that without domain decomposition there is only one domain.
1374 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1375 int ePBC,
1376 const matrix box,
1377 int ddZone,
1378 const rvec lowerCorner,
1379 const rvec upperCorner,
1380 int atomStart,
1381 int atomEnd,
1382 real atomDensity,
1383 const int *atinfo,
1384 const rvec *x,
1385 int numAtomsMoved,
1386 const int *move,
1387 int nb_kernel_type,
1388 nbnxn_atomdata_t *nbat)
1390 nbnxn_grid_t *grid = &nbs->grid[ddZone];
1392 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1394 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1396 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1397 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1398 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1399 grid->na_c_2log = get_2log(grid->na_c);
1401 nbat->na_c = grid->na_c;
1403 if (ddZone == 0)
1405 grid->cell0 = 0;
1407 else
1409 grid->cell0 =
1410 (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
1411 nbs->grid[ddZone- 1].na_sc/grid->na_sc;
1414 const int n = atomEnd - atomStart;
1416 if (ddZone == 0)
1418 nbs->ePBC = ePBC;
1419 copy_mat(box, nbs->box);
1421 /* Avoid zero density */
1422 if (atomDensity > 0)
1424 grid->atom_density = atomDensity;
1426 else
1428 grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
1431 grid->cell0 = 0;
1433 nbs->natoms_local = atomEnd - numAtomsMoved;
1434 /* We assume that nbnxn_put_on_grid is called first
1435 * for the local atoms (ddZone=0).
1437 nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1439 if (debug)
1441 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1442 nbs->natoms_local, grid->atom_density);
1445 else
1447 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1450 /* We always use the home zone (grid[0]) for setting the cell size,
1451 * since determining densities for non-local zones is difficult.
1453 int maxNumCellsOnThisGrid =
1454 set_grid_size_xy(nbs, grid,
1455 ddZone, n - numAtomsMoved,
1456 lowerCorner, upperCorner,
1457 nbs->grid[0].atom_density);
1459 int maxNumCells = grid->cell0 + maxNumCellsOnThisGrid;
1461 if (atomEnd > nbs->cell_nalloc)
1463 nbs->cell_nalloc = over_alloc_large(atomEnd);
1464 srenew(nbs->cell, nbs->cell_nalloc);
1467 /* To avoid conditionals we store the moved particles at the end of nbs->a;
1468 * make sure we have enough space.
1470 if (maxNumCells*grid->na_sc + numAtomsMoved > nbs->a_nalloc)
1472 nbs->a_nalloc = over_alloc_large(maxNumCells*grid->na_sc + numAtomsMoved);
1473 srenew(nbs->a, nbs->a_nalloc);
1476 /* We need padding up to a multiple of the buffer flag size: simply add */
1477 if (maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1479 nbnxn_atomdata_realloc(nbat,
1480 maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE);
1483 calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, move, nbat);
1485 if (ddZone == 0)
1487 nbat->natoms_local = nbat->natoms;
1490 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1493 /* Calls nbnxn_put_on_grid for all non-local domains */
1494 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1495 const struct gmx_domdec_zones_t *zones,
1496 const int *atinfo,
1497 const rvec *x,
1498 int nb_kernel_type,
1499 nbnxn_atomdata_t *nbat)
1501 for (int zone = 1; zone < zones->n; zone++)
1503 rvec c0, c1;
1504 for (int d = 0; d < DIM; d++)
1506 c0[d] = zones->size[zone].bb_x0[d];
1507 c1[d] = zones->size[zone].bb_x1[d];
1510 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1511 zone, c0, c1,
1512 zones->cg_range[zone],
1513 zones->cg_range[zone+1],
1515 atinfo,
1517 0, nullptr,
1518 nb_kernel_type,
1519 nbat);
1523 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1525 *ncx = nbs->grid[0].ncx;
1526 *ncy = nbs->grid[0].ncy;
1529 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1531 const nbnxn_grid_t *grid;
1533 grid = &nbs->grid[0];
1535 /* Return the atom order for the home cell (index 0) */
1536 *a = nbs->a;
1538 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1541 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1543 /* Set the atom order for the home cell (index 0) */
1544 nbnxn_grid_t *grid = &nbs->grid[0];
1546 int ao = 0;
1547 for (int cx = 0; cx < grid->ncx; cx++)
1549 for (int cy = 0; cy < grid->ncy; cy++)
1551 int cxy = cx*grid->ncy + cy;
1552 int j = grid->cxy_ind[cxy]*grid->na_sc;
1553 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1555 nbs->a[j] = ao;
1556 nbs->cell[ao] = j;
1557 ao++;
1558 j++;