src/gromacs/mdlib/nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <string.h>
45 #include <cmath>
47 #include <algorithm>
49 #include "gromacs/domdec/domdec_struct.h"
50 #include "gromacs/math/utilities.h"
51 #include "gromacs/math/vec.h"
52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
53 #include "gromacs/mdlib/nb_verlet.h"
54 #include "gromacs/mdlib/nbnxn_atomdata.h"
55 #include "gromacs/mdlib/nbnxn_consts.h"
56 #include "gromacs/mdlib/nbnxn_internal.h"
57 #include "gromacs/mdlib/nbnxn_search.h"
58 #include "gromacs/mdlib/nbnxn_util.h"
59 #include "gromacs/simd/simd.h"
60 #include "gromacs/simd/vector_operations.h"
61 #include "gromacs/utility/exceptions.h"
62 #include "gromacs/utility/smalloc.h"
64 struct gmx_domdec_zones_t;
66 static void nbnxn_grid_init(nbnxn_grid_t * grid)
68 grid->cxy_na = nullptr;
69 grid->cxy_ind = nullptr;
70 grid->cxy_nalloc = 0;
71 grid->bb = nullptr;
72 grid->bbj = nullptr;
73 grid->nc_nalloc = 0;
76 void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
78 int g;
80 nbs->ngrid = ngrid;
82 snew(nbs->grid, nbs->ngrid);
83 for (g = 0; g < nbs->ngrid; g++)
85 nbnxn_grid_init(&nbs->grid[g]);
89 static real grid_atom_density(int n, rvec corner0, rvec corner1)
91 rvec size;
93 if (n == 0)
95 /* To avoid zero density we use a minimum of 1 atom */
96 n = 1;
99 rvec_sub(corner1, corner0, size);
101 return n/(size[XX]*size[YY]*size[ZZ]);
104 static int set_grid_size_xy(const nbnxn_search_t nbs,
105 nbnxn_grid_t *grid,
106 int dd_zone,
107 int n, rvec corner0, rvec corner1,
108 real atom_density)
110 rvec size;
111 int na_c;
112 int nc_max;
113 real tlen, tlen_x, tlen_y;
115 rvec_sub(corner1, corner0, size);
117 if (n > grid->na_sc)
119 assert(atom_density > 0);
121 /* target cell length */
122 if (grid->bSimple)
124 /* To minimize the number of zero interactions, we should make
125 * the largest of the i/j cells cubic.
127 na_c = std::max(grid->na_c, grid->na_cj);
129 /* Approximately cubic cells */
130 tlen = std::cbrt(na_c/atom_density);
131 tlen_x = tlen;
132 tlen_y = tlen;
134 else
136 /* Approximately cubic sub cells */
137 tlen = std::cbrt(grid->na_c/atom_density);
138 tlen_x = tlen*c_gpuNumClusterPerCellX;
139 tlen_y = tlen*c_gpuNumClusterPerCellY;
141 /* We round ncx and ncy down, because we get fewer cell pairs
142 * in the pair list when the fixed cell dimensions (x,y) are
143 * larger than the variable one (z), rather than the other way around.
145 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
146 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
148 else
150 grid->ncx = 1;
151 grid->ncy = 1;
154 grid->sx = size[XX]/grid->ncx;
155 grid->sy = size[YY]/grid->ncy;
156 grid->inv_sx = 1/grid->sx;
157 grid->inv_sy = 1/grid->sy;
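/* Illustrative numbers for the cell sizing above (assumed values, not
 * from the original source): with an atom density of ~100 atoms/nm^3
 * (roughly atomistic water) and na_c = 8, the target cell length is
 * tlen = cbrt(8/100) ~= 0.43 nm, so a 5 nm box edge gives
 * ncx = ncy = (int)(5/0.43) = 11 columns and sx = sy = 5/11 ~= 0.45 nm.
 */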
159 if (dd_zone > 0)
161 /* This is a non-home zone, add an extra row of cells
162 * for particles communicated for bonded interactions.
163 * These can be beyond the cut-off. It doesn't matter where
164 * they end up on the grid, but for performance it's better
165 * if they don't end up in cells that can be within cut-off range.
167 grid->ncx++;
168 grid->ncy++;
171 /* We need one additional cell entry for particles moved by DD */
172 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
174 grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
175 srenew(grid->cxy_na, grid->cxy_nalloc);
176 srenew(grid->cxy_ind, grid->cxy_nalloc+1);
178 for (int t = 0; t < nbs->nthread_max; t++)
180 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
182 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
183 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
187 /* Worst case scenario of 1 atom in each last cell */
188 if (grid->na_cj <= grid->na_c)
190 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
192 else
194 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
197 if (nc_max > grid->nc_nalloc)
199 grid->nc_nalloc = over_alloc_large(nc_max);
200 srenew(grid->nsubc, grid->nc_nalloc);
201 srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
203 sfree_aligned(grid->bb);
204 /* This snew also zeros the contents, which avoids possible
205 * floating-point exceptions in SIMD with the unused bb elements.
207 if (grid->bSimple)
209 snew_aligned(grid->bb, grid->nc_nalloc, 16);
211 else
213 #if NBNXN_BBXXXX
214 int pbb_nalloc;
216 pbb_nalloc = grid->nc_nalloc*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX;
217 snew_aligned(grid->pbb, pbb_nalloc, 16);
218 #else
219 snew_aligned(grid->bb, grid->nc_nalloc*c_gpuNumClusterPerCell, 16);
220 #endif
223 if (grid->bSimple)
225 if (grid->na_cj == grid->na_c)
227 grid->bbj = grid->bb;
229 else
231 sfree_aligned(grid->bbj);
232 snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
236 srenew(grid->flags, grid->nc_nalloc);
237 if (nbs->bFEP)
239 srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
243 copy_rvec(corner0, grid->c0);
244 copy_rvec(corner1, grid->c1);
245 copy_rvec(size, grid->size);
247 return nc_max;
250 /* We need to sort particles in grid columns on the z-coordinate.
251 * As particles are very often distributed homogeneously, we use a sorting
252 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
253 * by a factor, cast to an int and try to store in that hole. If the hole
254 * is full, we move this or another particle. A second pass is needed to make
255 * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
256 * 4 is the optimal value for a homogeneous particle distribution and allows
257 * for an O(#particles) sort up to distributions where all particles are
258 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
259 * as it can be expensive to detect inhomogeneous particle distributions.
260 * SGSF is the maximum ratio of holes used; in the worst case all particles
261 * end up in the last hole and we need #particles extra holes at the end.
263 #define SORT_GRID_OVERSIZE 4
264 #define SGSF (SORT_GRID_OVERSIZE + 1)
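/* Minimal sketch of the scheme above (illustrative numbers only):
 * sorting n = 4 atoms over a 2.0 nm column with n_per_h = 4 gives
 *   invh  = (1/2.0)*n_per_h*SORT_GRID_OVERSIZE = 8 holes per nm
 *   nsort = n_per_h*SORT_GRID_OVERSIZE + n     = 20 holes in total
 * so an atom at z = 0.53 nm above h0 targets hole (int)(0.53*8) = 4.
 * If that hole is taken, the insertion loop below shifts entries up
 * until a free hole is found; the second pass then compacts the
 * occupied holes back into the index array a.
 */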
266 /* Sort particle index a on coordinates x along dim.
267 * Backwards tells if we want decreasing instead of increasing coordinates.
268 * h0 is the minimum of the coordinate range.
269 * invh is the 1/length of the sorting range.
270 * n_per_h (>=n) is the expected average number of particles per 1/invh
271 * sort is the sorting work array.
272 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
273 * or easier, allocate at least n*SGSF elements.
275 static void sort_atoms(int dim, gmx_bool Backwards,
276 int gmx_unused dd_zone,
277 int *a, int n, rvec *x,
278 real h0, real invh, int n_per_h,
279 int *sort)
281 if (n <= 1)
283 /* Nothing to do */
284 return;
287 #ifndef NDEBUG
288 if (n > n_per_h)
290 gmx_incons("n > n_per_h");
292 #endif
294 /* Transform the inverse range height into the inverse hole height */
295 invh *= n_per_h*SORT_GRID_OVERSIZE;
297 /* Set nsort to the maximum possible number of holes used.
298 * In the worst case all n elements end up in the last bin.
300 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
302 /* Determine the index range used, so we can limit it for the second pass */
303 int zi_min = INT_MAX;
304 int zi_max = -1;
306 /* Sort the particles using a simple index sort */
307 for (int i = 0; i < n; i++)
309 /* The cast takes care of floating-point rounding effects below zero.
310 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
311 * times the box height out of the box.
313 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
315 #ifndef NDEBUG
316 /* As we can have rounding effects, we use > instead of >= here */
317 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
319 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
320 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
321 n_per_h, SORT_GRID_OVERSIZE);
323 #endif
325 /* In a non-local domain, particles communicated for bonded interactions
326 * can be far beyond the grid size, which is set by the non-bonded
327 * cut-off distance. We sort such particles into the last cell.
329 if (zi > n_per_h*SORT_GRID_OVERSIZE)
331 zi = n_per_h*SORT_GRID_OVERSIZE;
334 /* Ideally this particle should go in sort cell zi,
335 * but that might already be in use,
336 * in that case find the first empty cell higher up
338 if (sort[zi] < 0)
340 sort[zi] = a[i];
341 zi_min = std::min(zi_min, zi);
342 zi_max = std::max(zi_max, zi);
344 else
346 /* We have multiple atoms in the same sorting slot.
347 * Sort on real z for minimal bounding box size.
348 * There is an extra check for identical z to ensure a
349 * well-defined output order, independent of input order,
350 * which gives binary reproducibility after restarts.
352 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
353 (x[a[i]][dim] == x[sort[zi]][dim] &&
354 a[i] > sort[zi])))
356 zi++;
359 if (sort[zi] >= 0)
361 /* Shift all elements by one slot until we find an empty slot */
362 int cp = sort[zi];
363 int zim = zi + 1;
364 while (sort[zim] >= 0)
366 int tmp = sort[zim];
367 sort[zim] = cp;
368 cp = tmp;
369 zim++;
371 sort[zim] = cp;
372 zi_max = std::max(zi_max, zim);
374 sort[zi] = a[i];
375 zi_max = std::max(zi_max, zi);
379 int c = 0;
380 if (!Backwards)
382 for (int zi = 0; zi < nsort; zi++)
384 if (sort[zi] >= 0)
386 a[c++] = sort[zi];
387 sort[zi] = -1;
391 else
393 for (int zi = zi_max; zi >= zi_min; zi--)
395 if (sort[zi] >= 0)
397 a[c++] = sort[zi];
398 sort[zi] = -1;
402 if (c < n)
404 gmx_incons("Lost particles while sorting");
408 #if GMX_DOUBLE
409 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
410 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
411 #else
412 #define R2F_D(x) (x)
413 #define R2F_U(x) (x)
414 #endif
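/* Rationale for the R2F macros above (a sketch, not from the original
 * comments): in double precision the coordinates are rounded to float
 * when stored in the bounding boxes. Scaling lower bounds down by
 * (1 - GMX_FLOAT_EPS) and upper bounds up by (1 + GMX_FLOAT_EPS),
 * with the sign handled separately, guarantees that the single-precision
 * box still encloses the double-precision coordinates instead of
 * possibly rounding past them.
 */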
416 /* Coordinate order x,y,z, bb order xyz0 */
417 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
419 int i;
420 real xl, xh, yl, yh, zl, zh;
422 i = 0;
423 xl = x[i+XX];
424 xh = x[i+XX];
425 yl = x[i+YY];
426 yh = x[i+YY];
427 zl = x[i+ZZ];
428 zh = x[i+ZZ];
429 i += stride;
430 for (int j = 1; j < na; j++)
432 xl = std::min(xl, x[i+XX]);
433 xh = std::max(xh, x[i+XX]);
434 yl = std::min(yl, x[i+YY]);
435 yh = std::max(yh, x[i+YY]);
436 zl = std::min(zl, x[i+ZZ]);
437 zh = std::max(zh, x[i+ZZ]);
438 i += stride;
440 /* Note: possible double to float conversion here */
441 bb->lower[BB_X] = R2F_D(xl);
442 bb->lower[BB_Y] = R2F_D(yl);
443 bb->lower[BB_Z] = R2F_D(zl);
444 bb->upper[BB_X] = R2F_U(xh);
445 bb->upper[BB_Y] = R2F_U(yh);
446 bb->upper[BB_Z] = R2F_U(zh);
449 /* Packed coordinates, bb order xyz0 */
450 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
452 real xl, xh, yl, yh, zl, zh;
454 xl = x[XX*c_packX4];
455 xh = x[XX*c_packX4];
456 yl = x[YY*c_packX4];
457 yh = x[YY*c_packX4];
458 zl = x[ZZ*c_packX4];
459 zh = x[ZZ*c_packX4];
460 for (int j = 1; j < na; j++)
462 xl = std::min(xl, x[j+XX*c_packX4]);
463 xh = std::max(xh, x[j+XX*c_packX4]);
464 yl = std::min(yl, x[j+YY*c_packX4]);
465 yh = std::max(yh, x[j+YY*c_packX4]);
466 zl = std::min(zl, x[j+ZZ*c_packX4]);
467 zh = std::max(zh, x[j+ZZ*c_packX4]);
469 /* Note: possible double to float conversion here */
470 bb->lower[BB_X] = R2F_D(xl);
471 bb->lower[BB_Y] = R2F_D(yl);
472 bb->lower[BB_Z] = R2F_D(zl);
473 bb->upper[BB_X] = R2F_U(xh);
474 bb->upper[BB_Y] = R2F_U(yh);
475 bb->upper[BB_Z] = R2F_U(zh);
478 /* Packed coordinates, bb order xyz0 */
479 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
481 real xl, xh, yl, yh, zl, zh;
483 xl = x[XX*c_packX8];
484 xh = x[XX*c_packX8];
485 yl = x[YY*c_packX8];
486 yh = x[YY*c_packX8];
487 zl = x[ZZ*c_packX8];
488 zh = x[ZZ*c_packX8];
489 for (int j = 1; j < na; j++)
491 xl = std::min(xl, x[j+XX*c_packX8]);
492 xh = std::max(xh, x[j+XX*c_packX8]);
493 yl = std::min(yl, x[j+YY*c_packX8]);
494 yh = std::max(yh, x[j+YY*c_packX8]);
495 zl = std::min(zl, x[j+ZZ*c_packX8]);
496 zh = std::max(zh, x[j+ZZ*c_packX8]);
498 /* Note: possible double to float conversion here */
499 bb->lower[BB_X] = R2F_D(xl);
500 bb->lower[BB_Y] = R2F_D(yl);
501 bb->lower[BB_Z] = R2F_D(zl);
502 bb->upper[BB_X] = R2F_U(xh);
503 bb->upper[BB_Y] = R2F_U(yh);
504 bb->upper[BB_Z] = R2F_U(zh);
507 /* Packed coordinates, bb order xyz0 */
508 static void calc_bounding_box_x_x4_halves(int na, const real *x,
509 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
511 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
512 using namespace gmx;
514 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
516 if (na > 2)
518 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
520 else
522 /* Set the "empty" bounding box to the same as the first one,
523 * so we don't need to treat special cases in the rest of the code.
525 #if NBNXN_SEARCH_BB_SIMD4
526 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
527 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
528 #else
529 bbj[1] = bbj[0];
530 #endif
533 #if NBNXN_SEARCH_BB_SIMD4
534 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
535 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
536 #else
538 int i;
540 for (i = 0; i < NNBSBB_C; i++)
542 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
543 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
546 #endif
549 #if NBNXN_SEARCH_BB_SIMD4
551 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
552 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
554 int i;
555 real xl, xh, yl, yh, zl, zh;
557 i = 0;
558 xl = x[i+XX];
559 xh = x[i+XX];
560 yl = x[i+YY];
561 yh = x[i+YY];
562 zl = x[i+ZZ];
563 zh = x[i+ZZ];
564 i += stride;
565 for (int j = 1; j < na; j++)
567 xl = std::min(xl, x[i+XX]);
568 xh = std::max(xh, x[i+XX]);
569 yl = std::min(yl, x[i+YY]);
570 yh = std::max(yh, x[i+YY]);
571 zl = std::min(zl, x[i+ZZ]);
572 zh = std::max(zh, x[i+ZZ]);
573 i += stride;
575 /* Note: possible double to float conversion here */
576 bb[0*STRIDE_PBB] = R2F_D(xl);
577 bb[1*STRIDE_PBB] = R2F_D(yl);
578 bb[2*STRIDE_PBB] = R2F_D(zl);
579 bb[3*STRIDE_PBB] = R2F_U(xh);
580 bb[4*STRIDE_PBB] = R2F_U(yh);
581 bb[5*STRIDE_PBB] = R2F_U(zh);
584 #endif /* NBNXN_SEARCH_BB_SIMD4 */
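/* Layout sketch for the packed bounding boxes above (assuming
 * STRIDE_PBB == 4): each group of 4 clusters stores
 *   bb[ 0.. 3] = x-lower of clusters 0..3
 *   bb[ 4.. 7] = y-lower,  bb[ 8..11] = z-lower
 *   bb[12..15] = x-upper,  bb[16..19] = y-upper,  bb[20..23] = z-upper
 * so one SIMD4 load fetches the same component of 4 clusters at once.
 */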
586 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
588 /* Coordinate order xyz?, bb order xyz0 */
589 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
591 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
592 using namespace gmx;
594 Simd4Float bb_0_S, bb_1_S;
595 Simd4Float x_S;
597 bb_0_S = load4(x);
598 bb_1_S = bb_0_S;
600 for (int i = 1; i < na; i++)
602 x_S = load4(x+i*NNBSBB_C);
603 bb_0_S = min(bb_0_S, x_S);
604 bb_1_S = max(bb_1_S, x_S);
607 store4(&bb->lower[0], bb_0_S);
608 store4(&bb->upper[0], bb_1_S);
611 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
612 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
613 nbnxn_bb_t *bb_work_aligned,
614 real *bb)
616 calc_bounding_box_simd4(na, x, bb_work_aligned);
618 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
619 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
620 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
621 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
622 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
623 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
626 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
629 /* Combines pairs of consecutive bounding boxes */
630 static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
632 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
633 using namespace gmx;
635 for (int i = 0; i < grid->ncx*grid->ncy; i++)
637 /* Starting bb in a column is expected to be 2-aligned */
638 int sc2 = grid->cxy_ind[i]>>1;
639 /* For odd numbers skip the last bb here */
640 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
641 int c2;
642 for (c2 = sc2; c2 < sc2+nc2; c2++)
644 #if NBNXN_SEARCH_BB_SIMD4
645 Simd4Float min_S, max_S;
647 min_S = min(load4(&bb[c2*2+0].lower[0]),
648 load4(&bb[c2*2+1].lower[0]));
649 max_S = max(load4(&bb[c2*2+0].upper[0]),
650 load4(&bb[c2*2+1].upper[0]));
651 store4(&grid->bbj[c2].lower[0], min_S);
652 store4(&grid->bbj[c2].upper[0], max_S);
653 #else
654 for (int j = 0; j < NNBSBB_C; j++)
656 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
657 bb[c2*2+1].lower[j]);
658 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
659 bb[c2*2+1].upper[j]);
661 #endif
663 if (((grid->cxy_na[i]+3)>>2) & 1)
665 /* The bb count in this column is odd: duplicate the last bb */
666 for (int j = 0; j < NNBSBB_C; j++)
668 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
669 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
676 /* Prints the average bb size, used for debug output */
677 static void print_bbsizes_simple(FILE *fp,
678 const nbnxn_grid_t *grid)
680 dvec ba;
682 clear_dvec(ba);
683 for (int c = 0; c < grid->nc; c++)
685 for (int d = 0; d < DIM; d++)
687 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
690 dsvmul(1.0/grid->nc, ba, ba);
692 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
693 grid->sx,
694 grid->sy,
695 grid->atom_density > 0 ?
696 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
697 ba[XX], ba[YY], ba[ZZ],
698 ba[XX]/grid->sx,
699 ba[YY]/grid->sy,
700 grid->atom_density > 0 ?
701 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
704 /* Prints the average bb size, used for debug output */
705 static void print_bbsizes_supersub(FILE *fp,
706 const nbnxn_grid_t *grid)
708 int ns;
709 dvec ba;
711 clear_dvec(ba);
712 ns = 0;
713 for (int c = 0; c < grid->nc; c++)
715 #if NBNXN_BBXXXX
716 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
718 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
719 for (int i = 0; i < STRIDE_PBB; i++)
721 for (int d = 0; d < DIM; d++)
723 ba[d] +=
724 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
725 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
729 #else
730 for (int s = 0; s < grid->nsubc[c]; s++)
732 int cs = c*c_gpuNumClusterPerCell + s;
733 for (int d = 0; d < DIM; d++)
735 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
738 #endif
739 ns += grid->nsubc[c];
741 dsvmul(1.0/ns, ba, ba);
743 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
744 grid->sx/c_gpuNumClusterPerCellX,
745 grid->sy/c_gpuNumClusterPerCellY,
746 grid->atom_density > 0 ?
747 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
748 ba[XX], ba[YY], ba[ZZ],
749 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
750 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
751 grid->atom_density > 0 ?
752 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
755 /* Set non-bonded interaction flags for the current cluster.
756 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
758 static void sort_cluster_on_flag(int natoms_cluster,
759 int a0, int a1, const int *atinfo,
760 int *order,
761 int *flags)
763 const int natoms_cluster_max = 8;
764 int sort1[natoms_cluster_max];
765 int sort2[natoms_cluster_max];
767 assert(natoms_cluster <= natoms_cluster_max);
769 *flags = 0;
771 int subc = 0;
772 for (int s = a0; s < a1; s += natoms_cluster)
774 /* Make lists for this (sub-)cell on atoms with and without LJ */
775 int n1 = 0;
776 int n2 = 0;
777 gmx_bool haveQ = FALSE;
778 int a_lj_max = -1;
779 for (int a = s; a < std::min(s + natoms_cluster, a1); a++)
781 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
783 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
785 sort1[n1++] = order[a];
786 a_lj_max = a;
788 else
790 sort2[n2++] = order[a];
794 /* If we don't have atoms with LJ, there's nothing to sort */
795 if (n1 > 0)
797 *flags |= NBNXN_CI_DO_LJ(subc);
799 if (2*n1 <= natoms_cluster)
801 /* Only sort when strictly necessary. Ordering particles
802 * can lead to less accurate summation due to rounding,
803 * both for LJ and Coulomb interactions.
805 if (2*(a_lj_max - s) >= natoms_cluster)
807 for (int i = 0; i < n1; i++)
809 order[a0 + i] = sort1[i];
811 for (int j = 0; j < n2; j++)
813 order[a0 + n1 + j] = sort2[j];
817 *flags |= NBNXN_CI_HALF_LJ(subc);
820 if (haveQ)
822 *flags |= NBNXN_CI_DO_COUL(subc);
824 subc++;
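/* Example of the half-LJ optimization above (illustrative numbers):
 * for a cluster of 8 atoms of which 3 have LJ parameters, 2*n1 = 6 <= 8,
 * so the LJ atoms are moved to the front (unless they already sit in the
 * first half) and NBNXN_CI_HALF_LJ is set, allowing the kernels to skip
 * LJ interactions for the second half of the cluster.
 */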
828 /* Fill a pair search cell with atoms.
829 * Potentially sorts atoms and sets the interaction flags.
831 static void fill_cell(const nbnxn_search_t nbs,
832 nbnxn_grid_t *grid,
833 nbnxn_atomdata_t *nbat,
834 int a0, int a1,
835 const int *atinfo,
836 const rvec *x,
837 nbnxn_bb_t gmx_unused *bb_work_aligned)
839 int na, a;
840 size_t offset;
841 nbnxn_bb_t *bb_ptr;
842 #if NBNXN_BBXXXX
843 float *pbb_ptr;
844 #endif
846 na = a1 - a0;
848 if (grid->bSimple)
850 /* Note that non-local grids are already sorted.
851 * Then sort_cluster_on_flag will only set the flags and the sorting
852 * will not affect the atom order.
854 sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
855 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
858 if (nbs->bFEP)
860 /* Set the fep flag for perturbed atoms in this (sub-)cell */
861 int c;
863 /* The grid-local cluster/(sub-)cell index */
864 c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
865 grid->fep[c] = 0;
866 for (int at = a0; at < a1; at++)
868 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
870 grid->fep[c] |= (1 << (at - a0));
875 /* Now we have sorted the atoms, set the cell indices */
876 for (a = a0; a < a1; a++)
878 nbs->cell[nbs->a[a]] = a;
881 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
882 nbat->XFormat, nbat->x, a0);
884 if (nbat->XFormat == nbatX4)
886 /* Store the bounding boxes as xyz.xyz. */
887 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
888 bb_ptr = grid->bb + offset;
890 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
891 if (2*grid->na_cj == grid->na_c)
893 calc_bounding_box_x_x4_halves(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr,
894 grid->bbj+offset*2);
896 else
897 #endif
899 calc_bounding_box_x_x4(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr);
902 else if (nbat->XFormat == nbatX8)
904 /* Store the bounding boxes as xyz.xyz. */
905 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
906 bb_ptr = grid->bb + offset;
908 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(a0), bb_ptr);
910 #if NBNXN_BBXXXX
911 else if (!grid->bSimple)
913 /* Store the bounding boxes in a format convenient
914 * for SIMD4 calculations: xxxxyyyyzzzz...
916 pbb_ptr =
917 grid->pbb +
918 ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
919 (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
921 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
922 if (nbat->XFormat == nbatXYZQ)
924 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
925 bb_work_aligned, pbb_ptr);
927 else
928 #endif
930 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
931 pbb_ptr);
933 if (gmx_debug_at)
935 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
936 a0 >> grid->na_c_2log,
937 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
938 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
939 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
942 #endif
943 else
945 /* Store the bounding boxes as xyz.xyz. */
946 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
948 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
949 bb_ptr);
951 if (gmx_debug_at)
953 int bbo;
954 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
955 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
956 a0 >> grid->na_c_2log,
957 grid->bb[bbo].lower[BB_X],
958 grid->bb[bbo].lower[BB_Y],
959 grid->bb[bbo].lower[BB_Z],
960 grid->bb[bbo].upper[BB_X],
961 grid->bb[bbo].upper[BB_Y],
962 grid->bb[bbo].upper[BB_Z]);
967 /* Spatially sort the atoms within one grid column */
968 static void sort_columns_simple(const nbnxn_search_t nbs,
969 int dd_zone,
970 nbnxn_grid_t *grid,
971 int a0, int a1,
972 const int *atinfo,
973 rvec *x,
974 nbnxn_atomdata_t *nbat,
975 int cxy_start, int cxy_end,
976 int *sort_work)
978 int cfilled, c;
980 if (debug)
982 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
983 grid->cell0, cxy_start, cxy_end, a0, a1);
986 /* Sort the atoms within each x,y column in 3 dimensions */
987 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
989 int na = grid->cxy_na[cxy];
990 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
991 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
993 /* Sort the atoms within each x,y column on z coordinate */
994 sort_atoms(ZZ, FALSE, dd_zone,
995 nbs->a+ash, na, x,
996 grid->c0[ZZ],
997 1.0/grid->size[ZZ], ncz*grid->na_sc,
998 sort_work);
1000 /* Fill the ncz cells in this column */
1001 cfilled = grid->cxy_ind[cxy];
1002 for (int cz = 0; cz < ncz; cz++)
1004 c = grid->cxy_ind[cxy] + cz;
1006 int ash_c = ash + cz*grid->na_sc;
1007 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
1009 fill_cell(nbs, grid, nbat,
1010 ash_c, ash_c+na_c, atinfo, x,
1011 nullptr);
1013 /* This copy to bbcz is not really necessary.
1014 * But it allows us to use the same grid search code
1015 * for the simple and supersub cell setups.
1017 if (na_c > 0)
1019 cfilled = c;
1021 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
1022 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1025 /* Set the unused atom indices to -1 */
1026 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1028 nbs->a[ash+ind] = -1;
1033 /* Spatially sort the atoms within one grid column */
1034 static void sort_columns_supersub(const nbnxn_search_t nbs,
1035 int dd_zone,
1036 nbnxn_grid_t *grid,
1037 int a0, int a1,
1038 const int *atinfo,
1039 rvec *x,
1040 nbnxn_atomdata_t *nbat,
1041 int cxy_start, int cxy_end,
1042 int *sort_work)
1044 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1046 bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));
1048 if (debug)
1050 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1051 grid->cell0, cxy_start, cxy_end, a0, a1);
1054 int subdiv_x = grid->na_c;
1055 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1056 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1058 /* Sort the atoms within each x,y column in 3 dimensions */
1059 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1061 int cx = cxy/grid->ncy;
1062 int cy = cxy - cx*grid->ncy;
1064 int na = grid->cxy_na[cxy];
1065 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1066 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1068 /* Sort the atoms within each x,y column on z coordinate */
1069 sort_atoms(ZZ, FALSE, dd_zone,
1070 nbs->a + ash, na, x,
1071 grid->c0[ZZ],
1072 1.0/grid->size[ZZ], ncz*grid->na_sc,
1073 sort_work);
1075 /* This loop goes over the supercells and subcells along z at once */
1076 for (int sub_z = 0; sub_z < ncz*c_gpuNumClusterPerCellZ; sub_z++)
1078 int ash_z = ash + sub_z*subdiv_z;
1079 int na_z = std::min(subdiv_z, na - (ash_z - ash));
1080 int cz = -1;
1081 /* We have already sorted on z */
1083 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1085 cz = sub_z/c_gpuNumClusterPerCellZ;
1086 int c = grid->cxy_ind[cxy] + cz;
1088 /* The number of atoms in this supercell */
1089 int na_c = std::min(grid->na_sc, na - (ash_z - ash));
1091 grid->nsubc[c] = std::min(c_gpuNumClusterPerCell, (na_c + grid->na_c - 1)/grid->na_c);
1093 /* Store the z-boundaries of the super cell */
1094 grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1095 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z + na_c - 1]][ZZ];
1098 if (c_gpuNumClusterPerCellY > 1)
1100 /* Sort the atoms along y */
1101 sort_atoms(YY, (sub_z & 1), dd_zone,
1102 nbs->a+ash_z, na_z, x,
1103 grid->c0[YY] + cy*grid->sy,
1104 grid->inv_sy, subdiv_z,
1105 sort_work);
1108 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1110 int ash_y = ash_z + sub_y*subdiv_y;
1111 int na_y = std::min(subdiv_y, na - (ash_y - ash));
1113 if (c_gpuNumClusterPerCellX > 1)
1115 /* Sort the atoms along x */
1116 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1117 nbs->a + ash_y, na_y, x,
1118 grid->c0[XX] + cx*grid->sx,
1119 grid->inv_sx, subdiv_y,
1120 sort_work);
1123 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1125 int ash_x = ash_y + sub_x*subdiv_x;
1126 int na_x = std::min(subdiv_x, na - (ash_x - ash));
1128 fill_cell(nbs, grid, nbat,
1129 ash_x, ash_x + na_x, atinfo, x,
1130 bb_work_aligned);
1135 /* Set the unused atom indices to -1 */
1136 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1138 nbs->a[ash+ind] = -1;
1143 /* Determine in which grid column atoms should go */
1144 static void calc_column_indices(nbnxn_grid_t *grid,
1145 int a0, int a1,
1146 rvec *x,
1147 int dd_zone, const int *move,
1148 int thread, int nthread,
1149 int *cell,
1150 int *cxy_na)
1152 /* We add one extra cell for particles which moved during DD */
1153 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1155 cxy_na[i] = 0;
1158 int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
1159 int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;
1160 if (dd_zone == 0)
1162 /* Home zone */
1163 for (int i = n0; i < n1; i++)
1165 if (move == nullptr || move[i] >= 0)
1167 /* We need to be careful with rounding,
1168 * particles might be a few bits outside the local zone.
1169 * The int cast takes care of the lower bound,
1170 * we will explicitly take care of the upper bound.
1172 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1173 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1175 #ifndef NDEBUG
1176 if (cx < 0 || cx > grid->ncx ||
1177 cy < 0 || cy > grid->ncy)
1179 gmx_fatal(FARGS,
1180 "grid cell cx %d cy %d out of range (max %d %d)\n"
1181 "atom %f %f %f, grid->c0 %f %f",
1182 cx, cy, grid->ncx, grid->ncy,
1183 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1185 #endif
1186 /* Take care of potential rounding issues */
1187 cx = std::min(cx, grid->ncx - 1);
1188 cy = std::min(cy, grid->ncy - 1);
1190 /* For the moment cell will contain only the grid-local
1191 * x and y indices, not z.
1193 cell[i] = cx*grid->ncy + cy;
1195 else
1197 /* Put this moved particle after the end of the grid,
1198 * so we can process it later without using conditionals.
1200 cell[i] = grid->ncx*grid->ncy;
1203 cxy_na[cell[i]]++;
1206 else
1208 /* Non-home zone */
1209 for (int i = n0; i < n1; i++)
1211 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1212 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1214 /* For non-home zones there could be particles outside
1215 * the non-bonded cut-off range, which have been communicated
1216 * for bonded interactions only. For the result it doesn't
1217 * matter where these end up on the grid. For performance
1218 * we put them in an extra row at the border.
1220 cx = std::max(cx, 0);
1221 cx = std::min(cx, grid->ncx - 1);
1222 cy = std::max(cy, 0);
1223 cy = std::min(cy, grid->ncy - 1);
1225 /* For the moment cell will contain only the grid-local
1226 * x and y indices, not z.
1228 cell[i] = cx*grid->ncy + cy;
1230 cxy_na[cell[i]]++;
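/* Worked example of the column mapping above (illustrative numbers):
 * with grid->c0[XX] = grid->c0[YY] = 0, inv_sx = inv_sy = 2.0
 * (0.5 nm columns) and ncy = 10, an atom at x = 1.3 nm, y = 0.7 nm
 * gets cx = (int)(1.3*2) = 2, cy = (int)(0.7*2) = 1 and thus
 * cell[i] = cx*ncy + cy = 21.
 */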
1235 /* Determine in which grid cells the atoms should go */
1236 static void calc_cell_indices(const nbnxn_search_t nbs,
1237 int dd_zone,
1238 nbnxn_grid_t *grid,
1239 int a0, int a1,
1240 const int *atinfo,
1241 rvec *x,
1242 const int *move,
1243 nbnxn_atomdata_t *nbat)
1245 int n0, n1;
1246 int cx, cy, cxy, ncz_max, ncz;
1247 int nthread;
1248 int cxy_na_i;
1250 nthread = gmx_omp_nthreads_get(emntPairsearch);
1252 #pragma omp parallel for num_threads(nthread) schedule(static)
1253 for (int thread = 0; thread < nthread; thread++)
1257 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1258 nbs->cell, nbs->work[thread].cxy_na);
1260 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1263 /* Make the cell index as a function of x and y */
1264 ncz_max = 0;
1265 ncz = 0;
1266 grid->cxy_ind[0] = 0;
1267 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1269 /* We set ncz_max at the beginning of the loop instead of at the end
1270 * to skip i=grid->ncx*grid->ncy, which holds moved particles
1271 * that do not need to be ordered on the grid.
1273 if (ncz > ncz_max)
1275 ncz_max = ncz;
1277 cxy_na_i = nbs->work[0].cxy_na[i];
1278 for (int thread = 1; thread < nthread; thread++)
1280 cxy_na_i += nbs->work[thread].cxy_na[i];
1282 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1283 if (nbat->XFormat == nbatX8)
1285 /* Make the number of cells a multiple of 2 */
1286 ncz = (ncz + 1) & ~1;
1288 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1289 /* Clear cxy_na, so we can reuse the array below */
1290 grid->cxy_na[i] = 0;
1292 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1294 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1296 if (debug)
1298 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1299 grid->na_sc, grid->na_c, grid->nc,
1300 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1301 ncz_max);
1302 if (gmx_debug_at)
1304 int i = 0;
1305 for (cy = 0; cy < grid->ncy; cy++)
1307 for (cx = 0; cx < grid->ncx; cx++)
1309 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1310 i++;
1312 fprintf(debug, "\n");
1317 /* Make sure the work array for sorting is large enough */
1318 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1320 for (int thread = 0; thread < nbs->nthread_max; thread++)
1322 nbs->work[thread].sort_work_nalloc =
1323 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1324 srenew(nbs->work[thread].sort_work,
1325 nbs->work[thread].sort_work_nalloc);
1326 /* When not in use, all elements should be -1 */
1327 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1329 nbs->work[thread].sort_work[i] = -1;
1334 /* Now that we know the dimensions we can fill the grid.
1335 * This is the first, unsorted fill. We sort the columns after this.
1337 for (int i = a0; i < a1; i++)
1339 /* At this point nbs->cell contains the local grid x,y indices */
1340 cxy = nbs->cell[i];
1341 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1344 if (dd_zone == 0)
1346 /* Set the cell indices for the moved particles */
1347 n0 = grid->nc*grid->na_sc;
1348 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1349 if (dd_zone == 0)
1351 for (int i = n0; i < n1; i++)
1353 nbs->cell[nbs->a[i]] = i;
1358 /* Sort the super-cell columns along z into the sub-cells. */
1359 #pragma omp parallel for num_threads(nthread) schedule(static)
1360 for (int thread = 0; thread < nthread; thread++)
1364 if (grid->bSimple)
1366 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1367 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1368 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1369 nbs->work[thread].sort_work);
1371 else
1373 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1374 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1375 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1376 nbs->work[thread].sort_work);
1379 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1382 if (grid->bSimple && nbat->XFormat == nbatX8)
1384 combine_bounding_box_pairs(grid, grid->bb);
1387 if (!grid->bSimple)
1389 grid->nsubc_tot = 0;
1390 for (int i = 0; i < grid->nc; i++)
1392 grid->nsubc_tot += grid->nsubc[i];
1396 if (debug)
1398 if (grid->bSimple)
1400 print_bbsizes_simple(debug, grid);
1402 else
1404 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1405 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1407 print_bbsizes_supersub(debug, grid);
1412 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1413 int natoms)
1415 flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
1416 if (flags->nflag > flags->flag_nalloc)
1418 flags->flag_nalloc = over_alloc_large(flags->nflag);
1419 srenew(flags->flag, flags->flag_nalloc);
1421 for (int b = 0; b < flags->nflag; b++)
1423 bitmask_clear(&(flags->flag[b]));
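/* The nflag computation above is a ceiling division: assuming, say,
 * NBNXN_BUFFERFLAG_SIZE = 16, natoms = 1000 gives (1000 + 15)/16 = 63
 * flag blocks, each marking whether a block of 16 atoms in the force
 * output buffers was touched by a thread.
 */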
1427 /* Sets up a grid and puts the atoms on the grid.
1428 * This function only operates on one domain of the domain decomposition.
1429 * Note that without domain decomposition there is only one domain.
1431 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1432 int ePBC, matrix box,
1433 int dd_zone,
1434 rvec corner0, rvec corner1,
1435 int a0, int a1,
1436 real atom_density,
1437 const int *atinfo,
1438 rvec *x,
1439 int nmoved, int *move,
1440 int nb_kernel_type,
1441 nbnxn_atomdata_t *nbat)
1443 nbnxn_grid_t *grid;
1444 int n;
1445 int nc_max_grid, nc_max;
1447 grid = &nbs->grid[dd_zone];
1449 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1451 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1453 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1454 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1455 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1456 grid->na_c_2log = get_2log(grid->na_c);
1458 nbat->na_c = grid->na_c;
1460 if (dd_zone == 0)
1462 grid->cell0 = 0;
1464 else
1466 grid->cell0 =
1467 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1468 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1471 n = a1 - a0;
1473 if (dd_zone == 0)
1475 nbs->ePBC = ePBC;
1476 copy_mat(box, nbs->box);
1478 /* Avoid zero density */
1479 if (atom_density > 0)
1481 grid->atom_density = atom_density;
1483 else
1485 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1488 grid->cell0 = 0;
1490 nbs->natoms_local = a1 - nmoved;
1491 /* We assume that nbnxn_put_on_grid is called first
1492 * for the local atoms (dd_zone=0).
1494 nbs->natoms_nonlocal = a1 - nmoved;
1496 if (debug)
1498 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1499 nbs->natoms_local, grid->atom_density);
1502 else
1504 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
1507 /* We always use the home zone (grid[0]) for setting the cell size,
1508 * since determining densities for non-local zones is difficult.
1510 nc_max_grid = set_grid_size_xy(nbs, grid,
1511 dd_zone, n-nmoved, corner0, corner1,
1512 nbs->grid[0].atom_density);
1514 nc_max = grid->cell0 + nc_max_grid;
1516 if (a1 > nbs->cell_nalloc)
1518 nbs->cell_nalloc = over_alloc_large(a1);
1519 srenew(nbs->cell, nbs->cell_nalloc);
1522 /* To avoid conditionals we store the moved particles at the end of a,
1523 * make sure we have enough space.
1525 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1527 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1528 srenew(nbs->a, nbs->a_nalloc);
1531 /* We need padding up to a multiple of the buffer flag size: simply add */
1532 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1534 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1537 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1539 if (dd_zone == 0)
1541 nbat->natoms_local = nbat->natoms;
1544 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1547 /* Calls nbnxn_put_on_grid for all non-local domains */
1548 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1549 const struct gmx_domdec_zones_t *zones,
1550 const int *atinfo,
1551 rvec *x,
1552 int nb_kernel_type,
1553 nbnxn_atomdata_t *nbat)
1555 rvec c0, c1;
1557 for (int zone = 1; zone < zones->n; zone++)
1559 for (int d = 0; d < DIM; d++)
1561 c0[d] = zones->size[zone].bb_x0[d];
1562 c1[d] = zones->size[zone].bb_x1[d];
1565 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1566 zone, c0, c1,
1567 zones->cg_range[zone],
1568 zones->cg_range[zone+1],
1570 atinfo,
1572 0, nullptr,
1573 nb_kernel_type,
1574 nbat);
1578 /* Add simple grid type information to the local super/sub grid */
1579 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1580 nbnxn_atomdata_t *nbat)
1582 nbnxn_grid_t *grid;
1583 float *bbcz;
1584 nbnxn_bb_t *bb;
1585 int ncd;
1587 grid = &nbs->grid[0];
1589 if (grid->bSimple)
1591 gmx_incons("nbnxn_grid_add_simple called with a simple grid");
1594 ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1596 if (grid->nc*ncd > grid->nc_nalloc_simple)
1598 grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1599 srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
1600 srenew(grid->bb_simple, grid->nc_nalloc_simple);
1601 srenew(grid->flags_simple, grid->nc_nalloc_simple);
1602 if (nbat->XFormat)
1604 sfree_aligned(grid->bbj);
1605 snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
1609 bbcz = grid->bbcz_simple;
1610 bb = grid->bb_simple;
1612 #if GMX_OPENMP && !(defined __clang_analyzer__)
1613 // cppcheck-suppress unreadVariable
1614 int nthreads = gmx_omp_nthreads_get(emntPairsearch);
1615 #endif
1617 #pragma omp parallel for num_threads(nthreads) schedule(static)
1618 for (int sc = 0; sc < grid->nc; sc++)
1622 for (int c = 0; c < ncd; c++)
1624 int tx = sc*ncd + c;
1625 int na = NBNXN_CPU_CLUSTER_I_SIZE;
1626 while (na > 0 &&
1627 nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1629 na--;
1632 if (na > 0)
1634 switch (nbat->XFormat)
1636 case nbatX4:
1637 /* c_packX4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1638 calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
1639 bb+tx);
1640 break;
1641 case nbatX8:
1642 /* c_packX8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1643 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1644 bb+tx);
1645 break;
1646 default:
1647 calc_bounding_box(na, nbat->xstride,
1648 nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1649 bb+tx);
1650 break;
1652 bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
1653 bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];
1655 /* No interaction optimization yet here */
1656 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1658 else
1660 grid->flags_simple[tx] = 0;
1664 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1667 if (grid->bSimple && nbat->XFormat == nbatX8)
1669 combine_bounding_box_pairs(grid, grid->bb_simple);
1673 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1675 *ncx = nbs->grid[0].ncx;
1676 *ncy = nbs->grid[0].ncy;
1679 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1681 const nbnxn_grid_t *grid;
1683 grid = &nbs->grid[0];
1685 /* Return the atom order for the home cell (index 0) */
1686 *a = nbs->a;
1688 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
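/* Note: nbs->a maps a grid slot to an atom index while nbs->cell maps
 * an atom index back to its grid slot, i.e. they are inverse mappings,
 * with unused padding slots in a set to -1.
 */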
1691 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1693 /* Set the atom order for the home cell (index 0) */
1694 nbnxn_grid_t *grid = &nbs->grid[0];
1696 int ao = 0;
1697 for (int cx = 0; cx < grid->ncx; cx++)
1699 for (int cy = 0; cy < grid->ncy; cy++)
1701 int cxy = cx*grid->ncy + cy;
1702 int j = grid->cxy_ind[cxy]*grid->na_sc;
1703 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1705 nbs->a[j] = ao;
1706 nbs->cell[ao] = j;
1707 ao++;
1708 j++;