Change nbnxn grid reallocation
[gromacs.git] / src / gromacs / mdlib / nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include <string.h>
42 #include <cmath>
44 #include <algorithm>
46 #include "gromacs/domdec/domdec_struct.h"
47 #include "gromacs/math/utilities.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/mdlib/gmx_omp_nthreads.h"
50 #include "gromacs/mdlib/nb_verlet.h"
51 #include "gromacs/mdlib/nbnxn_atomdata.h"
52 #include "gromacs/mdlib/nbnxn_consts.h"
53 #include "gromacs/mdlib/nbnxn_internal.h"
54 #include "gromacs/mdlib/nbnxn_search.h"
55 #include "gromacs/mdlib/nbnxn_util.h"
56 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/simd/vector_operations.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/smalloc.h"
62 struct gmx_domdec_zones_t;
static real grid_atom_density(int numAtoms,
                              const rvec lowerCorner,
                              const rvec upperCorner)
{
    rvec size;

    if (numAtoms == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        numAtoms = 1;
    }

    rvec_sub(upperCorner, lowerCorner, size);

    return numAtoms/(size[XX]*size[YY]*size[ZZ]);
}
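/* Illustrative sketch only, not used by the search code: given the density
 * computed above and a cluster of numAtomsInCell atoms, the target cell
 * length chosen in set_grid_size_xy() below is approximately the edge of a
 * cube holding that many atoms at that density.
 */
gmx_unused static inline real exampleTargetCellLength(int  numAtomsInCell,
                                                      real atomDensity)
{
    return std::cbrt(numAtomsInCell/atomDensity);
}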
81 static void set_grid_size_xy(const nbnxn_search_t nbs,
82 nbnxn_grid_t *grid,
83 int ddZone,
84 int numAtoms,
85 const rvec lowerCorner,
86 const rvec upperCorner,
87 real atomDensity)
89 rvec size;
90 real tlen, tlen_x, tlen_y;
92 rvec_sub(upperCorner, lowerCorner, size);
94 if (numAtoms > grid->na_sc)
96 GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
98 /* target cell length */
99 if (grid->bSimple)
/* To minimize the number of zero interactions, we should make
 * the largest of the i/j cells cubic.
 */
104 int numAtomsInCell = std::max(grid->na_c, grid->na_cj);
106 /* Approximately cubic cells */
107 tlen = std::cbrt(numAtomsInCell/atomDensity);
108 tlen_x = tlen;
109 tlen_y = tlen;
111 else
113 /* Approximately cubic sub cells */
114 tlen = std::cbrt(grid->na_c/atomDensity);
115 tlen_x = tlen*c_gpuNumClusterPerCellX;
116 tlen_y = tlen*c_gpuNumClusterPerCellY;
/* We round ncx and ncy down, because we get fewer cell pairs
 * in the pair list when the fixed cell dimensions (x,y) are
 * larger than the variable one (z) than the other way around.
 */
122 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
123 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
125 else
127 grid->ncx = 1;
128 grid->ncy = 1;
131 grid->sx = size[XX]/grid->ncx;
132 grid->sy = size[YY]/grid->ncy;
133 grid->inv_sx = 1/grid->sx;
134 grid->inv_sy = 1/grid->sy;
136 if (ddZone > 0)
138 /* This is a non-home zone, add an extra row of cells
139 * for particles communicated for bonded interactions.
140 * These can be beyond the cut-off. It doesn't matter where
141 * they end up on the grid, but for performance it's better
142 * if they don't end up in cells that can be within cut-off range.
144 grid->ncx++;
145 grid->ncy++;
148 /* We need one additional cell entry for particles moved by DD */
149 grid->cxy_na.resize(grid->ncx*grid->ncy + 1);
150 grid->cxy_ind.resize(grid->ncx*grid->ncy + 2);
152 for (int t = 0; t < nbs->nthread_max; t++)
154 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
156 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
157 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
161 /* Worst case scenario of 1 atom in each last cell */
162 int maxNumCells;
163 if (grid->na_cj <= grid->na_c)
165 maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy;
167 else
169 maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
172 grid->nsubc.resize(maxNumCells);
173 grid->bbcz.resize(maxNumCells*NNBSBB_D);
/* This resize also zeros the contents, which avoids possible
 * floating-point exceptions in SIMD with the unused bb elements.
 */
178 if (grid->bSimple)
180 grid->bb.resize(maxNumCells);
182 else
184 #if NBNXN_BBXXXX
185 grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
186 #else
187 grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
188 #endif
191 if (grid->bSimple)
193 if (grid->na_cj == grid->na_c)
195 grid->bbj = grid->bb;
197 else
199 grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
200 grid->bbj = grid->bbjStorage;
204 grid->flags.resize(maxNumCells);
205 if (nbs->bFEP)
207 grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
210 copy_rvec(lowerCorner, grid->c0);
211 copy_rvec(upperCorner, grid->c1);
212 copy_rvec(size, grid->size);
/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distributions and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used: in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
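/* Illustrative sketch only, not used by the search code: the pigeonhole slot
 * that sort_atoms() below assigns to a coordinate, assuming h0 and invh
 * describe the sorting range as documented there and invh has already been
 * scaled by n_per_h*SORT_GRID_OVERSIZE.
 */
gmx_unused static inline int exampleSortSlot(real coordinate, real h0, real invh, int n_per_h)
{
    int slot = static_cast<int>((coordinate - h0)*invh);

    /* Clamp coordinates beyond the range into the last slot, as sort_atoms()
     * does for particles in non-local zones.
     */
    return std::min(slot, n_per_h*SORT_GRID_OVERSIZE);
}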
/* Sort particle index array a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
240 static void sort_atoms(int dim, gmx_bool Backwards,
241 int gmx_unused dd_zone,
242 int *a, int n, const rvec *x,
243 real h0, real invh, int n_per_h,
244 int *sort)
246 if (n <= 1)
248 /* Nothing to do */
249 return;
252 #ifndef NDEBUG
253 if (n > n_per_h)
255 gmx_incons("n > n_per_h");
257 #endif
259 /* Transform the inverse range height into the inverse hole height */
260 invh *= n_per_h*SORT_GRID_OVERSIZE;
/* Set nsort to the maximum possible number of holes used.
 * In the worst case all n elements end up in the last bin.
 */
265 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
267 /* Determine the index range used, so we can limit it for the second pass */
268 int zi_min = INT_MAX;
269 int zi_max = -1;
271 /* Sort the particles using a simple index sort */
272 for (int i = 0; i < n; i++)
/* The cast takes care of floating-point rounding effects below zero.
 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 * times the box height out of the box.
 */
278 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
280 #ifndef NDEBUG
/* As we can have rounding effects, we use > instead of >= here */
282 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
284 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
285 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
286 n_per_h, SORT_GRID_OVERSIZE);
288 #endif
/* In a non-local domain, particles communicated for bonded interactions
 * can be far beyond the grid size, which is set by the non-bonded
 * cut-off distance. We sort such particles into the last cell.
 */
294 if (zi > n_per_h*SORT_GRID_OVERSIZE)
296 zi = n_per_h*SORT_GRID_OVERSIZE;
/* Ideally this particle should go in sort cell zi,
 * but that might already be in use. In that case we find
 * the first empty cell higher up.
 */
303 if (sort[zi] < 0)
305 sort[zi] = a[i];
306 zi_min = std::min(zi_min, zi);
307 zi_max = std::max(zi_max, zi);
309 else
311 /* We have multiple atoms in the same sorting slot.
312 * Sort on real z for minimal bounding box size.
313 * There is an extra check for identical z to ensure
314 * well-defined output order, independent of input order
315 * to ensure binary reproducibility after restarts.
317 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
318 (x[a[i]][dim] == x[sort[zi]][dim] &&
319 a[i] > sort[zi])))
321 zi++;
324 if (sort[zi] >= 0)
326 /* Shift all elements by one slot until we find an empty slot */
327 int cp = sort[zi];
328 int zim = zi + 1;
329 while (sort[zim] >= 0)
331 int tmp = sort[zim];
332 sort[zim] = cp;
333 cp = tmp;
334 zim++;
336 sort[zim] = cp;
337 zi_max = std::max(zi_max, zim);
339 sort[zi] = a[i];
340 zi_max = std::max(zi_max, zi);
344 int c = 0;
345 if (!Backwards)
347 for (int zi = 0; zi < nsort; zi++)
349 if (sort[zi] >= 0)
351 a[c++] = sort[zi];
352 sort[zi] = -1;
356 else
358 for (int zi = zi_max; zi >= zi_min; zi--)
360 if (sort[zi] >= 0)
362 a[c++] = sort[zi];
363 sort[zi] = -1;
367 if (c < n)
369 gmx_incons("Lost particles while sorting");
373 #if GMX_DOUBLE
374 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
375 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
376 #else
377 #define R2F_D(x) (x)
378 #define R2F_U(x) (x)
379 #endif
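/* Illustrative sketch only, not used elsewhere: in double precision the
 * R2F_D/R2F_U macros above nudge a value slightly down or up before the
 * float conversion, so the single-precision bounding box still encloses
 * the double-precision coordinates.
 */
gmx_unused static inline void exampleConservativeBounds(real lower, real upper,
                                                        float *lowerF, float *upperF)
{
    *lowerF = R2F_D(lower); /* lower bound nudged down before conversion */
    *upperF = R2F_U(upper); /* upper bound nudged up before conversion */
}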
381 /* Coordinate order x,y,z, bb order xyz0 */
382 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
384 int i;
385 real xl, xh, yl, yh, zl, zh;
387 i = 0;
388 xl = x[i+XX];
389 xh = x[i+XX];
390 yl = x[i+YY];
391 yh = x[i+YY];
392 zl = x[i+ZZ];
393 zh = x[i+ZZ];
394 i += stride;
395 for (int j = 1; j < na; j++)
397 xl = std::min(xl, x[i+XX]);
398 xh = std::max(xh, x[i+XX]);
399 yl = std::min(yl, x[i+YY]);
400 yh = std::max(yh, x[i+YY]);
401 zl = std::min(zl, x[i+ZZ]);
402 zh = std::max(zh, x[i+ZZ]);
403 i += stride;
405 /* Note: possible double to float conversion here */
406 bb->lower[BB_X] = R2F_D(xl);
407 bb->lower[BB_Y] = R2F_D(yl);
408 bb->lower[BB_Z] = R2F_D(zl);
409 bb->upper[BB_X] = R2F_U(xh);
410 bb->upper[BB_Y] = R2F_U(yh);
411 bb->upper[BB_Z] = R2F_U(zh);
414 /* Packed coordinates, bb order xyz0 */
415 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
417 real xl, xh, yl, yh, zl, zh;
419 xl = x[XX*c_packX4];
420 xh = x[XX*c_packX4];
421 yl = x[YY*c_packX4];
422 yh = x[YY*c_packX4];
423 zl = x[ZZ*c_packX4];
424 zh = x[ZZ*c_packX4];
425 for (int j = 1; j < na; j++)
427 xl = std::min(xl, x[j+XX*c_packX4]);
428 xh = std::max(xh, x[j+XX*c_packX4]);
429 yl = std::min(yl, x[j+YY*c_packX4]);
430 yh = std::max(yh, x[j+YY*c_packX4]);
431 zl = std::min(zl, x[j+ZZ*c_packX4]);
432 zh = std::max(zh, x[j+ZZ*c_packX4]);
434 /* Note: possible double to float conversion here */
435 bb->lower[BB_X] = R2F_D(xl);
436 bb->lower[BB_Y] = R2F_D(yl);
437 bb->lower[BB_Z] = R2F_D(zl);
438 bb->upper[BB_X] = R2F_U(xh);
439 bb->upper[BB_Y] = R2F_U(yh);
440 bb->upper[BB_Z] = R2F_U(zh);
443 /* Packed coordinates, bb order xyz0 */
444 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
446 real xl, xh, yl, yh, zl, zh;
448 xl = x[XX*c_packX8];
449 xh = x[XX*c_packX8];
450 yl = x[YY*c_packX8];
451 yh = x[YY*c_packX8];
452 zl = x[ZZ*c_packX8];
453 zh = x[ZZ*c_packX8];
454 for (int j = 1; j < na; j++)
456 xl = std::min(xl, x[j+XX*c_packX8]);
457 xh = std::max(xh, x[j+XX*c_packX8]);
458 yl = std::min(yl, x[j+YY*c_packX8]);
459 yh = std::max(yh, x[j+YY*c_packX8]);
460 zl = std::min(zl, x[j+ZZ*c_packX8]);
461 zh = std::max(zh, x[j+ZZ*c_packX8]);
463 /* Note: possible double to float conversion here */
464 bb->lower[BB_X] = R2F_D(xl);
465 bb->lower[BB_Y] = R2F_D(yl);
466 bb->lower[BB_Z] = R2F_D(zl);
467 bb->upper[BB_X] = R2F_U(xh);
468 bb->upper[BB_Y] = R2F_U(yh);
469 bb->upper[BB_Z] = R2F_U(zh);
472 /* Packed coordinates, bb order xyz0 */
473 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
474 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
476 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
477 using namespace gmx;
479 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
481 if (na > 2)
483 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
485 else
487 /* Set the "empty" bounding box to the same as the first one,
488 * so we don't need to treat special cases in the rest of the code.
490 #if NBNXN_SEARCH_BB_SIMD4
491 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
492 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
493 #else
494 bbj[1] = bbj[0];
495 #endif
498 #if NBNXN_SEARCH_BB_SIMD4
499 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
500 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
501 #else
503 int i;
505 for (i = 0; i < NNBSBB_C; i++)
507 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
508 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
511 #endif
514 #if NBNXN_SEARCH_BB_SIMD4
516 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
517 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
519 int i;
520 real xl, xh, yl, yh, zl, zh;
522 i = 0;
523 xl = x[i+XX];
524 xh = x[i+XX];
525 yl = x[i+YY];
526 yh = x[i+YY];
527 zl = x[i+ZZ];
528 zh = x[i+ZZ];
529 i += stride;
530 for (int j = 1; j < na; j++)
532 xl = std::min(xl, x[i+XX]);
533 xh = std::max(xh, x[i+XX]);
534 yl = std::min(yl, x[i+YY]);
535 yh = std::max(yh, x[i+YY]);
536 zl = std::min(zl, x[i+ZZ]);
537 zh = std::max(zh, x[i+ZZ]);
538 i += stride;
540 /* Note: possible double to float conversion here */
541 bb[0*STRIDE_PBB] = R2F_D(xl);
542 bb[1*STRIDE_PBB] = R2F_D(yl);
543 bb[2*STRIDE_PBB] = R2F_D(zl);
544 bb[3*STRIDE_PBB] = R2F_U(xh);
545 bb[4*STRIDE_PBB] = R2F_U(yh);
546 bb[5*STRIDE_PBB] = R2F_U(zh);
549 #endif /* NBNXN_SEARCH_BB_SIMD4 */
551 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
553 /* Coordinate order xyz?, bb order xyz0 */
554 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
556 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
557 using namespace gmx;
559 Simd4Float bb_0_S, bb_1_S;
560 Simd4Float x_S;
562 bb_0_S = load4(x);
563 bb_1_S = bb_0_S;
565 for (int i = 1; i < na; i++)
567 x_S = load4(x+i*NNBSBB_C);
568 bb_0_S = min(bb_0_S, x_S);
569 bb_1_S = max(bb_1_S, x_S);
572 store4(&bb->lower[0], bb_0_S);
573 store4(&bb->upper[0], bb_1_S);
576 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
577 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
578 nbnxn_bb_t *bb_work_aligned,
579 real *bb)
581 calc_bounding_box_simd4(na, x, bb_work_aligned);
583 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
584 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
585 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
586 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
587 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
588 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
591 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
594 /* Combines pairs of consecutive bounding boxes */
595 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,
596 gmx::ArrayRef<const nbnxn_bb_t> bb)
598 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
599 using namespace gmx;
601 for (int i = 0; i < grid->ncx*grid->ncy; i++)
603 /* Starting bb in a column is expected to be 2-aligned */
604 int sc2 = grid->cxy_ind[i]>>1;
605 /* For odd numbers skip the last bb here */
606 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
607 int c2;
608 for (c2 = sc2; c2 < sc2+nc2; c2++)
610 #if NBNXN_SEARCH_BB_SIMD4
611 Simd4Float min_S, max_S;
613 min_S = min(load4(&bb[c2*2+0].lower[0]),
614 load4(&bb[c2*2+1].lower[0]));
615 max_S = max(load4(&bb[c2*2+0].upper[0]),
616 load4(&bb[c2*2+1].upper[0]));
617 store4(&grid->bbj[c2].lower[0], min_S);
618 store4(&grid->bbj[c2].upper[0], max_S);
619 #else
620 for (int j = 0; j < NNBSBB_C; j++)
622 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
623 bb[c2*2+1].lower[j]);
624 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
625 bb[c2*2+1].upper[j]);
627 #endif
629 if (((grid->cxy_na[i]+3)>>2) & 1)
631 /* The bb count in this column is odd: duplicate the last bb */
632 for (int j = 0; j < NNBSBB_C; j++)
634 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
635 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
642 /* Prints the average bb size, used for debug output */
643 static void print_bbsizes_simple(FILE *fp,
644 const nbnxn_grid_t *grid)
646 dvec ba;
648 clear_dvec(ba);
649 for (int c = 0; c < grid->nc; c++)
651 for (int d = 0; d < DIM; d++)
653 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
656 dsvmul(1.0/grid->nc, ba, ba);
658 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
659 grid->sx,
660 grid->sy,
661 grid->atom_density > 0 ?
662 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
663 ba[XX], ba[YY], ba[ZZ],
664 ba[XX]/grid->sx,
665 ba[YY]/grid->sy,
666 grid->atom_density > 0 ?
667 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
670 /* Prints the average bb size, used for debug output */
671 static void print_bbsizes_supersub(FILE *fp,
672 const nbnxn_grid_t *grid)
674 int ns;
675 dvec ba;
677 clear_dvec(ba);
678 ns = 0;
679 for (int c = 0; c < grid->nc; c++)
681 #if NBNXN_BBXXXX
682 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
684 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
685 for (int i = 0; i < STRIDE_PBB; i++)
687 for (int d = 0; d < DIM; d++)
689 ba[d] +=
690 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
691 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
695 #else
696 for (int s = 0; s < grid->nsubc[c]; s++)
698 int cs = c*c_gpuNumClusterPerCell + s;
699 for (int d = 0; d < DIM; d++)
701 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
704 #endif
705 ns += grid->nsubc[c];
707 dsvmul(1.0/ns, ba, ba);
709 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
710 grid->sx/c_gpuNumClusterPerCellX,
711 grid->sy/c_gpuNumClusterPerCellY,
712 grid->atom_density > 0 ?
713 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
714 ba[XX], ba[YY], ba[ZZ],
715 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
716 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
717 grid->atom_density > 0 ?
718 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
721 /* Set non-bonded interaction flags for the current cluster.
722 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
724 static void sort_cluster_on_flag(int numAtomsInCluster,
725 int atomStart,
726 int atomEnd,
727 const int *atinfo,
728 gmx::ArrayRef<int> order,
729 int *flags)
731 constexpr int c_maxNumAtomsInCluster = 8;
732 int sort1[c_maxNumAtomsInCluster];
733 int sort2[c_maxNumAtomsInCluster];
735 GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
737 *flags = 0;
739 int subc = 0;
740 for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
742 /* Make lists for this (sub-)cell on atoms with and without LJ */
743 int n1 = 0;
744 int n2 = 0;
745 gmx_bool haveQ = FALSE;
746 int a_lj_max = -1;
747 for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
749 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
751 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
753 sort1[n1++] = order[a];
754 a_lj_max = a;
756 else
758 sort2[n2++] = order[a];
762 /* If we don't have atoms with LJ, there's nothing to sort */
763 if (n1 > 0)
765 *flags |= NBNXN_CI_DO_LJ(subc);
767 if (2*n1 <= numAtomsInCluster)
/* Only sort when strictly necessary. Ordering particles
 * can lead to less accurate summation due to rounding,
 * both for LJ and Coulomb interactions.
 */
773 if (2*(a_lj_max - s) >= numAtomsInCluster)
775 for (int i = 0; i < n1; i++)
777 order[atomStart + i] = sort1[i];
779 for (int j = 0; j < n2; j++)
781 order[atomStart + n1 + j] = sort2[j];
785 *flags |= NBNXN_CI_HALF_LJ(subc);
788 if (haveQ)
790 *flags |= NBNXN_CI_DO_COUL(subc);
792 subc++;
796 /* Fill a pair search cell with atoms.
797 * Potentially sorts atoms and sets the interaction flags.
799 static void fill_cell(const nbnxn_search_t nbs,
800 nbnxn_grid_t *grid,
801 nbnxn_atomdata_t *nbat,
802 int atomStart,
803 int atomEnd,
804 const int *atinfo,
805 const rvec *x,
806 nbnxn_bb_t gmx_unused *bb_work_aligned)
808 const int numAtoms = atomEnd - atomStart;
810 if (grid->bSimple)
812 /* Note that non-local grids are already sorted.
813 * Then sort_cluster_on_flag will only set the flags and the sorting
814 * will not affect the atom order.
816 sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
817 grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
820 if (nbs->bFEP)
822 /* Set the fep flag for perturbed atoms in this (sub-)cell */
824 /* The grid-local cluster/(sub-)cell index */
825 int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
826 grid->fep[cell] = 0;
827 for (int at = atomStart; at < atomEnd; at++)
829 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
831 grid->fep[cell] |= (1 << (at - atomStart));
836 /* Now we have sorted the atoms, set the cell indices */
837 for (int at = atomStart; at < atomEnd; at++)
839 nbs->cell[nbs->a[at]] = at;
842 copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c, x,
843 nbat->XFormat, nbat->x, atomStart);
845 if (nbat->XFormat == nbatX4)
847 /* Store the bounding boxes as xyz.xyz. */
848 size_t offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
849 nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
851 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
852 if (2*grid->na_cj == grid->na_c)
854 calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
855 grid->bbj.data() + offset*2);
857 else
858 #endif
860 calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
863 else if (nbat->XFormat == nbatX8)
865 /* Store the bounding boxes as xyz.xyz. */
866 size_t offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
867 nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
869 calc_bounding_box_x_x8(numAtoms, nbat->x + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
871 #if NBNXN_BBXXXX
872 else if (!grid->bSimple)
874 /* Store the bounding boxes in a format convenient
875 * for SIMD4 calculations: xxxxyyyyzzzz...
877 float *pbb_ptr =
878 grid->pbb.data() +
879 ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
880 (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));
882 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
883 if (nbat->XFormat == nbatXYZQ)
885 calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
886 bb_work_aligned, pbb_ptr);
888 else
889 #endif
891 calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
892 pbb_ptr);
894 if (gmx_debug_at)
896 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
897 atomStart >> grid->na_c_2log,
898 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
899 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
900 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
903 #endif
904 else
906 /* Store the bounding boxes as xyz.xyz. */
907 nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);
909 calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
910 bb_ptr);
912 if (gmx_debug_at)
914 int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
915 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
916 atomStart >> grid->na_c_2log,
917 grid->bb[bbo].lower[BB_X],
918 grid->bb[bbo].lower[BB_Y],
919 grid->bb[bbo].lower[BB_Z],
920 grid->bb[bbo].upper[BB_X],
921 grid->bb[bbo].upper[BB_Y],
922 grid->bb[bbo].upper[BB_Z]);
927 /* Spatially sort the atoms within one grid column */
928 static void sort_columns_simple(const nbnxn_search_t nbs,
929 int dd_zone,
930 nbnxn_grid_t *grid,
931 int atomStart, int atomEnd,
932 const int *atinfo,
933 const rvec *x,
934 nbnxn_atomdata_t *nbat,
935 int cxy_start, int cxy_end,
936 int *sort_work)
938 if (debug)
940 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
941 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
944 /* Sort the atoms within each x,y column in 3 dimensions */
945 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
947 int na = grid->cxy_na[cxy];
948 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
949 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
951 /* Sort the atoms within each x,y column on z coordinate */
952 sort_atoms(ZZ, FALSE, dd_zone,
953 nbs->a.data() + ash, na, x,
954 grid->c0[ZZ],
955 1.0/grid->size[ZZ], ncz*grid->na_sc,
956 sort_work);
958 /* Fill the ncz cells in this column */
959 int cfilled = grid->cxy_ind[cxy];
960 for (int cz = 0; cz < ncz; cz++)
962 int c = grid->cxy_ind[cxy] + cz;
964 int ash_c = ash + cz*grid->na_sc;
965 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
967 fill_cell(nbs, grid, nbat,
968 ash_c, ash_c+na_c, atinfo, x,
969 nullptr);
/* This copy to bbcz is not really necessary.
 * But it allows using the same grid search code
 * for the simple and supersub cell setups.
 */
975 if (na_c > 0)
977 cfilled = c;
979 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
980 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
983 /* Set the unused atom indices to -1 */
984 for (int ind = na; ind < ncz*grid->na_sc; ind++)
986 nbs->a[ash+ind] = -1;
991 /* Spatially sort the atoms within one grid column */
992 static void sort_columns_supersub(const nbnxn_search_t nbs,
993 int dd_zone,
994 nbnxn_grid_t *grid,
995 int atomStart, int atomEnd,
996 const int *atinfo,
997 const rvec *x,
998 nbnxn_atomdata_t *nbat,
999 int cxy_start, int cxy_end,
1000 int *sort_work)
1002 nbnxn_bb_t bb_work_array[2];
1003 nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1005 if (debug)
1007 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1008 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
1011 int subdiv_x = grid->na_c;
1012 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1013 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1015 /* Sort the atoms within each x,y column in 3 dimensions.
1016 * Loop over all columns on the x/y grid.
1018 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1020 int gridX = cxy/grid->ncy;
1021 int gridY = cxy - gridX*grid->ncy;
1023 int numAtomsInColumn = grid->cxy_na[cxy];
1024 int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
1025 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1027 /* Sort the atoms within each x,y column on z coordinate */
1028 sort_atoms(ZZ, FALSE, dd_zone,
1029 nbs->a.data() + ash, numAtomsInColumn, x,
1030 grid->c0[ZZ],
1031 1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
1032 sort_work);
1034 /* This loop goes over the cells and clusters along z at once */
1035 for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1037 int ash_z = ash + sub_z*subdiv_z;
1038 int na_z = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
1039 int cz = -1;
1040 /* We have already sorted on z */
1042 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1044 cz = sub_z/c_gpuNumClusterPerCellZ;
1045 int cell = grid->cxy_ind[cxy] + cz;
1047 /* The number of atoms in this cell/super-cluster */
1048 int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));
1050 grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
1051 (numAtoms + grid->na_c - 1)/grid->na_c);
1053 /* Store the z-boundaries of the bounding box of the cell */
1054 grid->bbcz[cell*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1055 grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
1058 if (c_gpuNumClusterPerCellY > 1)
1060 /* Sort the atoms along y */
1061 sort_atoms(YY, (sub_z & 1), dd_zone,
1062 nbs->a.data() + ash_z, na_z, x,
1063 grid->c0[YY] + gridY*grid->sy,
1064 grid->inv_sy, subdiv_z,
1065 sort_work);
1068 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1070 int ash_y = ash_z + sub_y*subdiv_y;
1071 int na_y = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));
1073 if (c_gpuNumClusterPerCellX > 1)
1075 /* Sort the atoms along x */
1076 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1077 nbs->a.data() + ash_y, na_y, x,
1078 grid->c0[XX] + gridX*grid->sx,
1079 grid->inv_sx, subdiv_y,
1080 sort_work);
1083 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1085 int ash_x = ash_y + sub_x*subdiv_x;
1086 int na_x = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));
1088 fill_cell(nbs, grid, nbat,
1089 ash_x, ash_x + na_x, atinfo, x,
1090 bb_work_aligned);
1095 /* Set the unused atom indices to -1 */
1096 for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
1098 nbs->a[ash + ind] = -1;
1103 /* Determine in which grid column atoms should go */
1104 static void calc_column_indices(nbnxn_grid_t *grid,
1105 int atomStart, int atomEnd,
1106 const rvec *x,
1107 int dd_zone, const int *move,
1108 int thread, int nthread,
1109 gmx::ArrayRef<int> cell,
1110 int *cxy_na)
1112 /* We add one extra cell for particles which moved during DD */
1113 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1115 cxy_na[i] = 0;
1118 int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1119 int taskAtomEnd = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1120 if (dd_zone == 0)
1122 /* Home zone */
1123 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1125 if (move == nullptr || move[i] >= 0)
1127 /* We need to be careful with rounding,
1128 * particles might be a few bits outside the local zone.
1129 * The int cast takes care of the lower bound,
1130 * we will explicitly take care of the upper bound.
1132 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1133 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1135 #ifndef NDEBUG
1136 if (cx < 0 || cx > grid->ncx ||
1137 cy < 0 || cy > grid->ncy)
1139 gmx_fatal(FARGS,
1140 "grid cell cx %d cy %d out of range (max %d %d)\n"
1141 "atom %f %f %f, grid->c0 %f %f",
1142 cx, cy, grid->ncx, grid->ncy,
1143 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1145 #endif
/* Take care of potential rounding issues */
1147 cx = std::min(cx, grid->ncx - 1);
1148 cy = std::min(cy, grid->ncy - 1);
/* For the moment cell will contain only the grid-local
 * x and y indices, not z.
 */
1153 cell[i] = cx*grid->ncy + cy;
1155 else
1157 /* Put this moved particle after the end of the grid,
1158 * so we can process it later without using conditionals.
1160 cell[i] = grid->ncx*grid->ncy;
1163 cxy_na[cell[i]]++;
1166 else
1168 /* Non-home zone */
1169 for (int i = taskAtomStart; i < taskAtomEnd; i++)
1171 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1172 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1174 /* For non-home zones there could be particles outside
1175 * the non-bonded cut-off range, which have been communicated
1176 * for bonded interactions only. For the result it doesn't
1177 * matter where these end up on the grid. For performance
1178 * we put them in an extra row at the border.
1180 cx = std::max(cx, 0);
1181 cx = std::min(cx, grid->ncx - 1);
1182 cy = std::max(cy, 0);
1183 cy = std::min(cy, grid->ncy - 1);
/* For the moment cell will contain only the grid-local
 * x and y indices, not z.
 */
1188 cell[i] = cx*grid->ncy + cy;
1190 cxy_na[cell[i]]++;
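/* Illustrative sketch only, not used by the search code: the x,y column index
 * assigned by calc_column_indices() above, assuming c0 and the inverse cell
 * sizes describe the grid as in nbnxn_grid_t.
 */
gmx_unused static inline int exampleColumnIndex(const rvec x, const rvec c0,
                                                real invSx, real invSy, int ncy)
{
    int cx = static_cast<int>((x[XX] - c0[XX])*invSx);
    int cy = static_cast<int>((x[YY] - c0[YY])*invSy);

    return cx*ncy + cy;
}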
1195 /* Resizes grid and atom data which depend on the number of cells */
1196 static void resizeForNumberOfCells(const nbnxn_grid_t &grid,
1197 int numAtomsMoved,
1198 nbnxn_search *nbs,
1199 nbnxn_atomdata_t *nbat)
1201 int numNbnxnAtoms = (grid.cell0 + grid.nc)*grid.na_sc;
1203 /* Note: nbs->cell was already resized before */
/* To avoid conditionals we store the moved particles at the end of a,
 * so make sure we have enough space.
 */
1208 nbs->a.resize(numNbnxnAtoms + numAtomsMoved);
1210 /* We need padding up to a multiple of the buffer flag size: simply add */
1211 if (numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1213 nbnxn_atomdata_realloc(nbat, numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE);
1216 nbat->natoms = numNbnxnAtoms;
1219 /* Determine in which grid cells the atoms should go */
1220 static void calc_cell_indices(const nbnxn_search_t nbs,
1221 int ddZone,
1222 nbnxn_grid_t *grid,
1223 int atomStart,
1224 int atomEnd,
1225 const int *atinfo,
1226 const rvec *x,
1227 int numAtomsMoved,
1228 const int *move,
1229 nbnxn_atomdata_t *nbat)
1231 /* First compute all grid/column indices and store them in nbs->cell */
1232 nbs->cell.resize(atomEnd);
1234 const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1236 #pragma omp parallel for num_threads(nthread) schedule(static)
1237 for (int thread = 0; thread < nthread; thread++)
1241 calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
1242 nbs->cell, nbs->work[thread].cxy_na);
1244 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1247 /* Make the cell index as a function of x and y */
1248 int ncz_max = 0;
1249 int ncz = 0;
1250 grid->cxy_ind[0] = 0;
1251 for (int i = 0; i < grid->ncx*grid->ncy + 1; i++)
/* We set ncz_max at the beginning of the loop instead of at the end
 * to skip i=grid->ncx*grid->ncy, which contains the moved particles
 * that do not need to be ordered on the grid.
 */
1257 if (ncz > ncz_max)
1259 ncz_max = ncz;
1261 int cxy_na_i = nbs->work[0].cxy_na[i];
1262 for (int thread = 1; thread < nthread; thread++)
1264 cxy_na_i += nbs->work[thread].cxy_na[i];
1266 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1267 if (nbat->XFormat == nbatX8)
/* Make the number of cells a multiple of 2 */
1270 ncz = (ncz + 1) & ~1;
1272 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1273 /* Clear cxy_na, so we can reuse the array below */
1274 grid->cxy_na[i] = 0;
1276 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1278 resizeForNumberOfCells(*grid, numAtomsMoved, nbs, nbat);
1280 if (debug)
1282 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1283 grid->na_sc, grid->na_c, grid->nc,
1284 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1285 ncz_max);
1286 if (gmx_debug_at)
1288 int i = 0;
1289 for (int cy = 0; cy < grid->ncy; cy++)
1291 for (int cx = 0; cx < grid->ncx; cx++)
1293 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1294 i++;
1296 fprintf(debug, "\n");
1301 /* Make sure the work array for sorting is large enough */
1302 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1304 for (int thread = 0; thread < nbs->nthread_max; thread++)
1306 nbs->work[thread].sort_work_nalloc =
1307 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1308 srenew(nbs->work[thread].sort_work,
1309 nbs->work[thread].sort_work_nalloc);
1310 /* When not in use, all elements should be -1 */
1311 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1313 nbs->work[thread].sort_work[i] = -1;
/* Now that we know the dimensions we can fill the grid.
 * This is the first, unsorted fill. We sort the columns after this.
 */
1321 for (int i = atomStart; i < atomEnd; i++)
1323 /* At this point nbs->cell contains the local grid x,y indices */
1324 int cxy = nbs->cell[i];
1325 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1328 if (ddZone == 0)
1330 /* Set the cell indices for the moved particles */
1331 int n0 = grid->nc*grid->na_sc;
1332 int n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1333 if (ddZone == 0)
1335 for (int i = n0; i < n1; i++)
1337 nbs->cell[nbs->a[i]] = i;
1342 /* Sort the super-cell columns along z into the sub-cells. */
1343 #pragma omp parallel for num_threads(nthread) schedule(static)
1344 for (int thread = 0; thread < nthread; thread++)
1348 if (grid->bSimple)
1350 sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1351 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1352 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1353 nbs->work[thread].sort_work);
1355 else
1357 sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1358 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1359 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1360 nbs->work[thread].sort_work);
1363 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1366 if (grid->bSimple && nbat->XFormat == nbatX8)
1368 combine_bounding_box_pairs(grid, grid->bb);
1371 if (!grid->bSimple)
1373 grid->nsubc_tot = 0;
1374 for (int i = 0; i < grid->nc; i++)
1376 grid->nsubc_tot += grid->nsubc[i];
1380 if (debug)
1382 if (grid->bSimple)
1384 print_bbsizes_simple(debug, grid);
1386 else
1388 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1389 grid->nsubc_tot, (atomEnd - atomStart)/(double)grid->nsubc_tot);
1391 print_bbsizes_supersub(debug, grid);
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
1400 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1401 int ePBC,
1402 const matrix box,
1403 int ddZone,
1404 const rvec lowerCorner,
1405 const rvec upperCorner,
1406 int atomStart,
1407 int atomEnd,
1408 real atomDensity,
1409 const int *atinfo,
1410 const rvec *x,
1411 int numAtomsMoved,
1412 const int *move,
1413 int nb_kernel_type,
1414 nbnxn_atomdata_t *nbat)
1416 nbnxn_grid_t *grid = &nbs->grid[ddZone];
1418 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1420 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1422 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1423 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1424 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1425 grid->na_c_2log = get_2log(grid->na_c);
1427 nbat->na_c = grid->na_c;
1429 if (ddZone == 0)
1431 grid->cell0 = 0;
1433 else
1435 grid->cell0 =
1436 (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
1437 nbs->grid[ddZone- 1].na_sc/grid->na_sc;
1440 const int n = atomEnd - atomStart;
1442 if (ddZone == 0)
1444 nbs->ePBC = ePBC;
1445 copy_mat(box, nbs->box);
1447 /* Avoid zero density */
1448 if (atomDensity > 0)
1450 grid->atom_density = atomDensity;
1452 else
1454 grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
1457 grid->cell0 = 0;
1459 nbs->natoms_local = atomEnd - numAtomsMoved;
1460 /* We assume that nbnxn_put_on_grid is called first
1461 * for the local atoms (ddZone=0).
1463 nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1465 if (debug)
1467 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1468 nbs->natoms_local, grid->atom_density);
1471 else
1473 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1476 /* We always use the home zone (grid[0]) for setting the cell size,
1477 * since determining densities for non-local zones is difficult.
1479 set_grid_size_xy(nbs, grid,
1480 ddZone, n - numAtomsMoved,
1481 lowerCorner, upperCorner,
1482 nbs->grid[0].atom_density);
1484 calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);
1486 if (ddZone == 0)
1488 nbat->natoms_local = nbat->natoms;
1491 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1494 /* Calls nbnxn_put_on_grid for all non-local domains */
1495 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1496 const struct gmx_domdec_zones_t *zones,
1497 const int *atinfo,
1498 const rvec *x,
1499 int nb_kernel_type,
1500 nbnxn_atomdata_t *nbat)
1502 for (int zone = 1; zone < zones->n; zone++)
1504 rvec c0, c1;
1505 for (int d = 0; d < DIM; d++)
1507 c0[d] = zones->size[zone].bb_x0[d];
1508 c1[d] = zones->size[zone].bb_x1[d];
1511 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1512 zone, c0, c1,
1513 zones->cg_range[zone],
1514 zones->cg_range[zone+1],
1516 atinfo,
1518 0, nullptr,
1519 nb_kernel_type,
1520 nbat);
1524 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1526 *ncx = nbs->grid[0].ncx;
1527 *ncy = nbs->grid[0].ncy;
1530 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1532 const nbnxn_grid_t *grid;
1534 grid = &nbs->grid[0];
1536 /* Return the atom order for the home cell (index 0) */
1537 *a = nbs->a.data();
1539 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1542 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1544 /* Set the atom order for the home cell (index 0) */
1545 nbnxn_grid_t *grid = &nbs->grid[0];
1547 int ao = 0;
1548 for (int cx = 0; cx < grid->ncx; cx++)
1550 for (int cy = 0; cy < grid->ncy; cy++)
1552 int cxy = cx*grid->ncy + cy;
1553 int j = grid->cxy_ind[cxy]*grid->na_sc;
1554 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1556 nbs->a[j] = ao;
1557 nbs->cell[ao] = j;
1558 ao++;
1559 j++;