src/gromacs/mdlib/nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include <assert.h>
41 #include <string.h>
43 #include <cmath>
45 #include <algorithm>
47 #include "gromacs/domdec/domdec_struct.h"
48 #include "gromacs/math/utilities.h"
49 #include "gromacs/math/vec.h"
50 #include "gromacs/mdlib/gmx_omp_nthreads.h"
51 #include "gromacs/mdlib/nb_verlet.h"
52 #include "gromacs/mdlib/nbnxn_atomdata.h"
53 #include "gromacs/mdlib/nbnxn_consts.h"
54 #include "gromacs/mdlib/nbnxn_internal.h"
55 #include "gromacs/mdlib/nbnxn_search.h"
56 #include "gromacs/mdlib/nbnxn_util.h"
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/simd/vector_operations.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/smalloc.h"
62 struct gmx_domdec_zones_t;
64 static void nbnxn_grid_init(nbnxn_grid_t * grid)
66 grid->cxy_na = nullptr;
67 grid->cxy_ind = nullptr;
68 grid->cxy_nalloc = 0;
69 grid->bb = nullptr;
70 grid->bbj = nullptr;
71 grid->nc_nalloc = 0;
74 void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
76 int g;
78 nbs->ngrid = ngrid;
80 snew(nbs->grid, nbs->ngrid);
81 for (g = 0; g < nbs->ngrid; g++)
83 nbnxn_grid_init(&nbs->grid[g]);
87 static real grid_atom_density(int n, rvec corner0, rvec corner1)
89 rvec size;
91 if (n == 0)
93 /* To avoid zero density we use a minimum of 1 atom */
94 n = 1;
97 rvec_sub(corner1, corner0, size);
99 return n/(size[XX]*size[YY]*size[ZZ]);
102 static int set_grid_size_xy(const nbnxn_search_t nbs,
103 nbnxn_grid_t *grid,
104 int dd_zone,
105 int n, rvec corner0, rvec corner1,
106 real atom_density)
108 rvec size;
109 int na_c;
110 int nc_max;
111 real tlen, tlen_x, tlen_y;
113 rvec_sub(corner1, corner0, size);
115 if (n > grid->na_sc)
117 assert(atom_density > 0);
119 /* target cell length */
120 if (grid->bSimple)
122 /* To minimize the number of zero interactions, we should make
123 * the largest of the i/j cells cubic.
125 na_c = std::max(grid->na_c, grid->na_cj);
127 /* Approximately cubic cells */
128 tlen = std::cbrt(na_c/atom_density);
129 tlen_x = tlen;
130 tlen_y = tlen;
132 else
134 /* Approximately cubic sub cells */
135 tlen = std::cbrt(grid->na_c/atom_density);
136 tlen_x = tlen*c_gpuNumClusterPerCellX;
137 tlen_y = tlen*c_gpuNumClusterPerCellY;
139 /* We round ncx and ncy down, because we get fewer cell pairs
140 * in the pair list when the fixed cell dimensions (x,y) are
141 * larger than the variable one (z) than when it is the other way around.
143 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
144 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
146 else
148 grid->ncx = 1;
149 grid->ncy = 1;
152 grid->sx = size[XX]/grid->ncx;
153 grid->sy = size[YY]/grid->ncy;
154 grid->inv_sx = 1/grid->sx;
155 grid->inv_sy = 1/grid->sy;
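/* Worked example (illustrative only, values assumed): for a simple grid with
 * cluster sizes na_c = na_cj = 4 and an atom density of 100 nm^-3, the target
 * cell length is tlen = cbrt(4/100) nm ~= 0.34 nm; since ncx and ncy are
 * rounded down, the actual column sizes sx and sy come out slightly larger
 * than tlen.
 */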
157 if (dd_zone > 0)
159 /* This is a non-home zone, add an extra row of cells
160 * for particles communicated for bonded interactions.
161 * These can be beyond the cut-off. It doesn't matter where
162 * they end up on the grid, but for performance it's better
163 * if they don't end up in cells that can be within cut-off range.
165 grid->ncx++;
166 grid->ncy++;
169 /* We need one additional cell entry for particles moved by DD */
170 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
172 grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
173 srenew(grid->cxy_na, grid->cxy_nalloc);
174 srenew(grid->cxy_ind, grid->cxy_nalloc+1);
176 for (int t = 0; t < nbs->nthread_max; t++)
178 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
180 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
181 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
185 /* Worst case scenario of 1 atom in each last cell */
186 if (grid->na_cj <= grid->na_c)
188 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
190 else
192 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
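/* In other words: the atoms themselves need about n/na_sc cells, and in the
 * worst case each of the ncx*ncy columns adds one almost-empty cell at its
 * top, hence the extra ncx*ncy term (scaled by na_cj/na_c when j-clusters
 * are larger than i-clusters).
 */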
195 if (nc_max > grid->nc_nalloc)
197 grid->nc_nalloc = over_alloc_large(nc_max);
198 srenew(grid->nsubc, grid->nc_nalloc);
199 srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
201 sfree_aligned(grid->bb);
202 /* This snew also zeros the contents, which avoids possible
203 * floating-point exceptions in SIMD with the unused bb elements.
205 if (grid->bSimple)
207 snew_aligned(grid->bb, grid->nc_nalloc, 16);
209 else
211 #if NBNXN_BBXXXX
212 int pbb_nalloc;
214 pbb_nalloc = grid->nc_nalloc*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX;
215 snew_aligned(grid->pbb, pbb_nalloc, 16);
216 #else
217 snew_aligned(grid->bb, grid->nc_nalloc*c_gpuNumClusterPerCell, 16);
218 #endif
221 if (grid->bSimple)
223 if (grid->na_cj == grid->na_c)
225 grid->bbj = grid->bb;
227 else
229 sfree_aligned(grid->bbj);
230 snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
234 srenew(grid->flags, grid->nc_nalloc);
235 if (nbs->bFEP)
237 srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
241 copy_rvec(corner0, grid->c0);
242 copy_rvec(corner1, grid->c1);
243 copy_rvec(size, grid->size);
245 return nc_max;
248 /* We need to sort particles in grid columns on the z-coordinate.
249 * As particles are very often distributed homogeneously, we use a sorting
250 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
251 * by a factor, cast to an int and try to store in that hole. If the hole
252 * is full, we move this or another particle. A second pass is needed to make
253 * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
254 * 4 is the optimal value for a homogeneous particle distribution and allows
255 * for an O(#particles) sort up to distributions where all particles are
256 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
257 * as it can be expensive to detect inhomogeneous particle distributions.
258 * SGSF is the maximum ratio of holes used; in the worst case all particles
259 * end up in the last hole and we need #particles extra holes at the end.
261 #define SORT_GRID_OVERSIZE 4
262 #define SGSF (SORT_GRID_OVERSIZE + 1)
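/* A minimal sketch (illustrative only, not part of the build) of the hole
 * index computation used in sort_atoms() below; z is a particle coordinate,
 * h0 the lower edge of the sorting range and invh its inverse height:
 *
 *   invh  *= n_per_h*SORT_GRID_OVERSIZE;                // inverse hole height
 *   int zi = static_cast<int>((z - h0)*invh);           // target hole
 *   zi     = std::min(zi, n_per_h*SORT_GRID_OVERSIZE);  // clamp far-away atoms
 *
 * With SORT_GRID_OVERSIZE = 4 there are about 4 holes per particle, so for a
 * homogeneous distribution collisions are rare and the sort stays O(#particles).
 */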
264 /* Sort the particle indices a on the coordinates x along dim.
265 * Backwards tells if we want decreasing instead of increasing coordinates.
266 * h0 is the minimum of the coordinate range.
267 * invh is the 1/length of the sorting range.
268 * n_per_h (>=n) is the expected average number of particles per 1/invh
269 * sort is the sorting work array.
270 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
271 * or easier, allocate at least n*SGSF elements.
273 static void sort_atoms(int dim, gmx_bool Backwards,
274 int gmx_unused dd_zone,
275 int *a, int n, rvec *x,
276 real h0, real invh, int n_per_h,
277 int *sort)
279 if (n <= 1)
281 /* Nothing to do */
282 return;
285 #ifndef NDEBUG
286 if (n > n_per_h)
288 gmx_incons("n > n_per_h");
290 #endif
292 /* Transform the inverse range height into the inverse hole height */
293 invh *= n_per_h*SORT_GRID_OVERSIZE;
295 /* Set nsort to the maximum possible number of holes used.
296 * In the worst case all n elements end up in the last bin.
298 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
300 /* Determine the index range used, so we can limit it for the second pass */
301 int zi_min = INT_MAX;
302 int zi_max = -1;
304 /* Sort the particles using a simple index sort */
305 for (int i = 0; i < n; i++)
307 /* The cast takes care of floating-point rounding effects below zero.
308 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
309 * times the box height out of the box.
311 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
313 #ifndef NDEBUG
314 /* As we can have rounding effects, we use > instead of >= here */
315 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
317 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
318 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
319 n_per_h, SORT_GRID_OVERSIZE);
321 #endif
323 /* In a non-local domain, particles communicated for bonded interactions
324 * can be far beyond the grid size, which is set by the non-bonded
325 * cut-off distance. We sort such particles into the last cell.
327 if (zi > n_per_h*SORT_GRID_OVERSIZE)
329 zi = n_per_h*SORT_GRID_OVERSIZE;
332 /* Ideally this particle should go in sort cell zi,
333 * but that slot might already be in use;
334 * in that case find the first empty cell higher up.
336 if (sort[zi] < 0)
338 sort[zi] = a[i];
339 zi_min = std::min(zi_min, zi);
340 zi_max = std::max(zi_max, zi);
342 else
344 /* We have multiple atoms in the same sorting slot.
345 * Sort on real z for minimal bounding box size.
346 * There is an extra check for identical z to give a
347 * well-defined output order, independent of input order,
348 * which ensures binary reproducibility after restarts.
350 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
351 (x[a[i]][dim] == x[sort[zi]][dim] &&
352 a[i] > sort[zi])))
354 zi++;
357 if (sort[zi] >= 0)
359 /* Shift all elements by one slot until we find an empty slot */
360 int cp = sort[zi];
361 int zim = zi + 1;
362 while (sort[zim] >= 0)
364 int tmp = sort[zim];
365 sort[zim] = cp;
366 cp = tmp;
367 zim++;
369 sort[zim] = cp;
370 zi_max = std::max(zi_max, zim);
372 sort[zi] = a[i];
373 zi_max = std::max(zi_max, zi);
377 int c = 0;
378 if (!Backwards)
380 for (int zi = 0; zi < nsort; zi++)
382 if (sort[zi] >= 0)
384 a[c++] = sort[zi];
385 sort[zi] = -1;
389 else
391 for (int zi = zi_max; zi >= zi_min; zi--)
393 if (sort[zi] >= 0)
395 a[c++] = sort[zi];
396 sort[zi] = -1;
400 if (c < n)
402 gmx_incons("Lost particles while sorting");
406 #if GMX_DOUBLE
407 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
408 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
409 #else
410 #define R2F_D(x) (x)
411 #define R2F_U(x) (x)
412 #endif
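/* Descriptive note: the R2F macros above convert a double-precision bound to
 * float such that the float bounding box always contains the double one.
 * Lower bounds are nudged towards -infinity and upper bounds towards
 * +infinity by one float epsilon before the cast, e.g. a positive lower
 * bound xl becomes (1 - GMX_FLOAT_EPS)*xl, so rounding can never shrink the box.
 */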
414 /* Coordinate order x,y,z, bb order xyz0 */
415 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
417 int i;
418 real xl, xh, yl, yh, zl, zh;
420 i = 0;
421 xl = x[i+XX];
422 xh = x[i+XX];
423 yl = x[i+YY];
424 yh = x[i+YY];
425 zl = x[i+ZZ];
426 zh = x[i+ZZ];
427 i += stride;
428 for (int j = 1; j < na; j++)
430 xl = std::min(xl, x[i+XX]);
431 xh = std::max(xh, x[i+XX]);
432 yl = std::min(yl, x[i+YY]);
433 yh = std::max(yh, x[i+YY]);
434 zl = std::min(zl, x[i+ZZ]);
435 zh = std::max(zh, x[i+ZZ]);
436 i += stride;
438 /* Note: possible double to float conversion here */
439 bb->lower[BB_X] = R2F_D(xl);
440 bb->lower[BB_Y] = R2F_D(yl);
441 bb->lower[BB_Z] = R2F_D(zl);
442 bb->upper[BB_X] = R2F_U(xh);
443 bb->upper[BB_Y] = R2F_U(yh);
444 bb->upper[BB_Z] = R2F_U(zh);
447 /* Packed coordinates, bb order xyz0 */
448 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
450 real xl, xh, yl, yh, zl, zh;
452 xl = x[XX*c_packX4];
453 xh = x[XX*c_packX4];
454 yl = x[YY*c_packX4];
455 yh = x[YY*c_packX4];
456 zl = x[ZZ*c_packX4];
457 zh = x[ZZ*c_packX4];
458 for (int j = 1; j < na; j++)
460 xl = std::min(xl, x[j+XX*c_packX4]);
461 xh = std::max(xh, x[j+XX*c_packX4]);
462 yl = std::min(yl, x[j+YY*c_packX4]);
463 yh = std::max(yh, x[j+YY*c_packX4]);
464 zl = std::min(zl, x[j+ZZ*c_packX4]);
465 zh = std::max(zh, x[j+ZZ*c_packX4]);
467 /* Note: possible double to float conversion here */
468 bb->lower[BB_X] = R2F_D(xl);
469 bb->lower[BB_Y] = R2F_D(yl);
470 bb->lower[BB_Z] = R2F_D(zl);
471 bb->upper[BB_X] = R2F_U(xh);
472 bb->upper[BB_Y] = R2F_U(yh);
473 bb->upper[BB_Z] = R2F_U(zh);
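/* Descriptive note on the packed layouts used above and below: with nbatX4
 * the coordinates of each group of c_packX4 atoms are stored as
 * x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3, so the x coordinate of atom j within
 * a pack is x[j + XX*c_packX4]; calc_bounding_box_x_x8 uses the same scheme
 * with c_packX8 for the nbatX8 format.
 */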
476 /* Packed coordinates, bb order xyz0 */
477 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
479 real xl, xh, yl, yh, zl, zh;
481 xl = x[XX*c_packX8];
482 xh = x[XX*c_packX8];
483 yl = x[YY*c_packX8];
484 yh = x[YY*c_packX8];
485 zl = x[ZZ*c_packX8];
486 zh = x[ZZ*c_packX8];
487 for (int j = 1; j < na; j++)
489 xl = std::min(xl, x[j+XX*c_packX8]);
490 xh = std::max(xh, x[j+XX*c_packX8]);
491 yl = std::min(yl, x[j+YY*c_packX8]);
492 yh = std::max(yh, x[j+YY*c_packX8]);
493 zl = std::min(zl, x[j+ZZ*c_packX8]);
494 zh = std::max(zh, x[j+ZZ*c_packX8]);
496 /* Note: possible double to float conversion here */
497 bb->lower[BB_X] = R2F_D(xl);
498 bb->lower[BB_Y] = R2F_D(yl);
499 bb->lower[BB_Z] = R2F_D(zl);
500 bb->upper[BB_X] = R2F_U(xh);
501 bb->upper[BB_Y] = R2F_U(yh);
502 bb->upper[BB_Z] = R2F_U(zh);
505 /* Packed coordinates, bb order xyz0 */
506 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
507 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
509 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
510 using namespace gmx;
512 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
514 if (na > 2)
516 calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
518 else
520 /* Set the "empty" bounding box to the same as the first one,
521 * so we don't need to treat special cases in the rest of the code.
523 #if NBNXN_SEARCH_BB_SIMD4
524 store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
525 store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
526 #else
527 bbj[1] = bbj[0];
528 #endif
531 #if NBNXN_SEARCH_BB_SIMD4
532 store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
533 store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
534 #else
536 int i;
538 for (i = 0; i < NNBSBB_C; i++)
540 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
541 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
544 #endif
547 #if NBNXN_SEARCH_BB_SIMD4
549 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
550 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
552 int i;
553 real xl, xh, yl, yh, zl, zh;
555 i = 0;
556 xl = x[i+XX];
557 xh = x[i+XX];
558 yl = x[i+YY];
559 yh = x[i+YY];
560 zl = x[i+ZZ];
561 zh = x[i+ZZ];
562 i += stride;
563 for (int j = 1; j < na; j++)
565 xl = std::min(xl, x[i+XX]);
566 xh = std::max(xh, x[i+XX]);
567 yl = std::min(yl, x[i+YY]);
568 yh = std::max(yh, x[i+YY]);
569 zl = std::min(zl, x[i+ZZ]);
570 zh = std::max(zh, x[i+ZZ]);
571 i += stride;
573 /* Note: possible double to float conversion here */
574 bb[0*STRIDE_PBB] = R2F_D(xl);
575 bb[1*STRIDE_PBB] = R2F_D(yl);
576 bb[2*STRIDE_PBB] = R2F_D(zl);
577 bb[3*STRIDE_PBB] = R2F_U(xh);
578 bb[4*STRIDE_PBB] = R2F_U(yh);
579 bb[5*STRIDE_PBB] = R2F_U(zh);
582 #endif /* NBNXN_SEARCH_BB_SIMD4 */
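/* Descriptive note: in the NBNXN_BBXXXX (xxxxyyyyzzzz) layout the bounds of
 * STRIDE_PBB clusters are interleaved so that a SIMD4 load fetches the same
 * component of consecutive clusters: component d (0-2 lower, 3-5 upper
 * corner) of cluster i within a pack lives at bb[d*STRIDE_PBB + i], as used
 * above and in print_bbsizes_supersub below.
 */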
584 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
586 /* Coordinate order xyz?, bb order xyz0 */
587 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
589 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
590 using namespace gmx;
592 Simd4Float bb_0_S, bb_1_S;
593 Simd4Float x_S;
595 bb_0_S = load4(x);
596 bb_1_S = bb_0_S;
598 for (int i = 1; i < na; i++)
600 x_S = load4(x+i*NNBSBB_C);
601 bb_0_S = min(bb_0_S, x_S);
602 bb_1_S = max(bb_1_S, x_S);
605 store4(&bb->lower[0], bb_0_S);
606 store4(&bb->upper[0], bb_1_S);
609 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
610 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
611 nbnxn_bb_t *bb_work_aligned,
612 real *bb)
614 calc_bounding_box_simd4(na, x, bb_work_aligned);
616 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
617 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
618 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
619 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
620 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
621 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
624 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
627 /* Combines pairs of consecutive bounding boxes */
628 static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
630 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
631 using namespace gmx;
633 for (int i = 0; i < grid->ncx*grid->ncy; i++)
635 /* Starting bb in a column is expected to be 2-aligned */
636 int sc2 = grid->cxy_ind[i]>>1;
637 /* For odd numbers skip the last bb here */
638 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
639 int c2;
640 for (c2 = sc2; c2 < sc2+nc2; c2++)
642 #if NBNXN_SEARCH_BB_SIMD4
643 Simd4Float min_S, max_S;
645 min_S = min(load4(&bb[c2*2+0].lower[0]),
646 load4(&bb[c2*2+1].lower[0]));
647 max_S = max(load4(&bb[c2*2+0].upper[0]),
648 load4(&bb[c2*2+1].upper[0]));
649 store4(&grid->bbj[c2].lower[0], min_S);
650 store4(&grid->bbj[c2].upper[0], max_S);
651 #else
652 for (int j = 0; j < NNBSBB_C; j++)
654 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
655 bb[c2*2+1].lower[j]);
656 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
657 bb[c2*2+1].upper[j]);
659 #endif
661 if (((grid->cxy_na[i]+3)>>2) & 1)
663 /* The bb count in this column is odd: duplicate the last bb */
664 for (int j = 0; j < NNBSBB_C; j++)
666 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
667 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
674 /* Prints the average bb size, used for debug output */
675 static void print_bbsizes_simple(FILE *fp,
676 const nbnxn_grid_t *grid)
678 dvec ba;
680 clear_dvec(ba);
681 for (int c = 0; c < grid->nc; c++)
683 for (int d = 0; d < DIM; d++)
685 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
688 dsvmul(1.0/grid->nc, ba, ba);
690 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
691 grid->sx,
692 grid->sy,
693 grid->atom_density > 0 ?
694 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
695 ba[XX], ba[YY], ba[ZZ],
696 ba[XX]/grid->sx,
697 ba[YY]/grid->sy,
698 grid->atom_density > 0 ?
699 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
702 /* Prints the average bb size, used for debug output */
703 static void print_bbsizes_supersub(FILE *fp,
704 const nbnxn_grid_t *grid)
706 int ns;
707 dvec ba;
709 clear_dvec(ba);
710 ns = 0;
711 for (int c = 0; c < grid->nc; c++)
713 #if NBNXN_BBXXXX
714 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
716 int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
717 for (int i = 0; i < STRIDE_PBB; i++)
719 for (int d = 0; d < DIM; d++)
721 ba[d] +=
722 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
723 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
727 #else
728 for (int s = 0; s < grid->nsubc[c]; s++)
730 int cs = c*c_gpuNumClusterPerCell + s;
731 for (int d = 0; d < DIM; d++)
733 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
736 #endif
737 ns += grid->nsubc[c];
739 dsvmul(1.0/ns, ba, ba);
741 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
742 grid->sx/c_gpuNumClusterPerCellX,
743 grid->sy/c_gpuNumClusterPerCellY,
744 grid->atom_density > 0 ?
745 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
746 ba[XX], ba[YY], ba[ZZ],
747 ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
748 ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
749 grid->atom_density > 0 ?
750 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
753 /* Set non-bonded interaction flags for the current cluster.
754 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
756 static void sort_cluster_on_flag(int natoms_cluster,
757 int a0, int a1, const int *atinfo,
758 int *order,
759 int *flags)
761 const int natoms_cluster_max = 8;
762 int sort1[natoms_cluster_max];
763 int sort2[natoms_cluster_max];
765 assert(natoms_cluster <= natoms_cluster_max);
767 *flags = 0;
769 int subc = 0;
770 for (int s = a0; s < a1; s += natoms_cluster)
772 /* Make lists for this (sub-)cell on atoms with and without LJ */
773 int n1 = 0;
774 int n2 = 0;
775 gmx_bool haveQ = FALSE;
776 int a_lj_max = -1;
777 for (int a = s; a < std::min(s + natoms_cluster, a1); a++)
779 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
781 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
783 sort1[n1++] = order[a];
784 a_lj_max = a;
786 else
788 sort2[n2++] = order[a];
792 /* If we don't have atoms with LJ, there's nothing to sort */
793 if (n1 > 0)
795 *flags |= NBNXN_CI_DO_LJ(subc);
797 if (2*n1 <= natoms_cluster)
799 /* Only sort when strictly necessary. Ordering particles
800 * can lead to less accurate summation
801 * due to rounding, both for LJ and Coulomb interactions.
803 if (2*(a_lj_max - s) >= natoms_cluster)
805 for (int i = 0; i < n1; i++)
807 order[a0 + i] = sort1[i];
809 for (int j = 0; j < n2; j++)
811 order[a0 + n1 + j] = sort2[j];
815 *flags |= NBNXN_CI_HALF_LJ(subc);
818 if (haveQ)
820 *flags |= NBNXN_CI_DO_COUL(subc);
822 subc++;
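/* A hedged usage sketch (illustrative only): a kernel consuming these flags
 * can test the per-cluster bits set above, for example
 *
 *   if (flags & NBNXN_CI_HALF_LJ(subc))
 *   {
 *       // only the first half of the cluster has LJ parameters
 *   }
 *
 * which is why atoms with LJ are moved to the front of the cluster whenever
 * at most half of the atoms have LJ.
 */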
826 /* Fill a pair search cell with atoms.
827 * Potentially sorts atoms and sets the interaction flags.
829 static void fill_cell(const nbnxn_search_t nbs,
830 nbnxn_grid_t *grid,
831 nbnxn_atomdata_t *nbat,
832 int a0, int a1,
833 const int *atinfo,
834 const rvec *x,
835 nbnxn_bb_t gmx_unused *bb_work_aligned)
837 int na, a;
838 size_t offset;
839 nbnxn_bb_t *bb_ptr;
840 #if NBNXN_BBXXXX
841 float *pbb_ptr;
842 #endif
844 na = a1 - a0;
846 if (grid->bSimple)
848 /* Note that non-local grids are already sorted.
849 * Then sort_cluster_on_flag will only set the flags and the sorting
850 * will not affect the atom order.
852 sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
853 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
856 if (nbs->bFEP)
858 /* Set the fep flag for perturbed atoms in this (sub-)cell */
859 int c;
861 /* The grid-local cluster/(sub-)cell index */
862 c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
863 grid->fep[c] = 0;
864 for (int at = a0; at < a1; at++)
866 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
868 grid->fep[c] |= (1 << (at - a0));
873 /* Now that we have sorted the atoms, set the cell indices */
874 for (a = a0; a < a1; a++)
876 nbs->cell[nbs->a[a]] = a;
879 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
880 nbat->XFormat, nbat->x, a0);
882 if (nbat->XFormat == nbatX4)
884 /* Store the bounding boxes as xyz.xyz. */
885 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
886 bb_ptr = grid->bb + offset;
888 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
889 if (2*grid->na_cj == grid->na_c)
891 calc_bounding_box_x_x4_halves(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr,
892 grid->bbj+offset*2);
894 else
895 #endif
897 calc_bounding_box_x_x4(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr);
900 else if (nbat->XFormat == nbatX8)
902 /* Store the bounding boxes as xyz.xyz. */
903 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
904 bb_ptr = grid->bb + offset;
906 calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(a0), bb_ptr);
908 #if NBNXN_BBXXXX
909 else if (!grid->bSimple)
911 /* Store the bounding boxes in a format convenient
912 * for SIMD4 calculations: xxxxyyyyzzzz...
914 pbb_ptr =
915 grid->pbb +
916 ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
917 (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
919 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
920 if (nbat->XFormat == nbatXYZQ)
922 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
923 bb_work_aligned, pbb_ptr);
925 else
926 #endif
928 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
929 pbb_ptr);
931 if (gmx_debug_at)
933 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
934 a0 >> grid->na_c_2log,
935 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
936 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
937 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
940 #endif
941 else
943 /* Store the bounding boxes as xyz.xyz. */
944 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
946 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
947 bb_ptr);
949 if (gmx_debug_at)
951 int bbo;
952 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
953 fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
954 a0 >> grid->na_c_2log,
955 grid->bb[bbo].lower[BB_X],
956 grid->bb[bbo].lower[BB_Y],
957 grid->bb[bbo].lower[BB_Z],
958 grid->bb[bbo].upper[BB_X],
959 grid->bb[bbo].upper[BB_Y],
960 grid->bb[bbo].upper[BB_Z]);
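/* Descriptive note: the (... >> grid->na_c_2log) expressions above divide an
 * atom offset by the cluster size na_c, which is a power of two; e.g. with
 * na_c = 8 we have na_c_2log = 3, so an atom offset of 24 maps to grid-local
 * cluster index 3.
 */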
965 /* Spatially sort the atoms within one grid column */
966 static void sort_columns_simple(const nbnxn_search_t nbs,
967 int dd_zone,
968 nbnxn_grid_t *grid,
969 int a0, int a1,
970 const int *atinfo,
971 rvec *x,
972 nbnxn_atomdata_t *nbat,
973 int cxy_start, int cxy_end,
974 int *sort_work)
976 int cfilled, c;
978 if (debug)
980 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
981 grid->cell0, cxy_start, cxy_end, a0, a1);
984 /* Sort the atoms within each x,y column in 3 dimensions */
985 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
987 int na = grid->cxy_na[cxy];
988 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
989 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
991 /* Sort the atoms within each x,y column on z coordinate */
992 sort_atoms(ZZ, FALSE, dd_zone,
993 nbs->a+ash, na, x,
994 grid->c0[ZZ],
995 1.0/grid->size[ZZ], ncz*grid->na_sc,
996 sort_work);
998 /* Fill the ncz cells in this column */
999 cfilled = grid->cxy_ind[cxy];
1000 for (int cz = 0; cz < ncz; cz++)
1002 c = grid->cxy_ind[cxy] + cz;
1004 int ash_c = ash + cz*grid->na_sc;
1005 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
1007 fill_cell(nbs, grid, nbat,
1008 ash_c, ash_c+na_c, atinfo, x,
1009 nullptr);
1011 /* This copy to bbcz is not really necessary.
1012 * But it allows us to use the same grid search code
1013 * for the simple and supersub cell setups.
1015 if (na_c > 0)
1017 cfilled = c;
1019 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
1020 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1023 /* Set the unused atom indices to -1 */
1024 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1026 nbs->a[ash+ind] = -1;
1031 /* Spatially sort the atoms within one grid column */
1032 static void sort_columns_supersub(const nbnxn_search_t nbs,
1033 int dd_zone,
1034 nbnxn_grid_t *grid,
1035 int a0, int a1,
1036 const int *atinfo,
1037 rvec *x,
1038 nbnxn_atomdata_t *nbat,
1039 int cxy_start, int cxy_end,
1040 int *sort_work)
1042 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1044 bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));
1046 if (debug)
1048 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1049 grid->cell0, cxy_start, cxy_end, a0, a1);
1052 int subdiv_x = grid->na_c;
1053 int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1054 int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
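/* Descriptive note: subdiv_x is the number of atoms per cluster, subdiv_y the
 * number of atoms in one row of clusters along x, and subdiv_z the number of
 * atoms in one x,y layer of clusters; ash_z, ash_y and ash_x below step
 * through the z-sorted column one layer, row and cluster at a time.
 */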
1056 /* Sort the atoms within each x,y column in 3 dimensions */
1057 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1059 int cx = cxy/grid->ncy;
1060 int cy = cxy - cx*grid->ncy;
1062 int na = grid->cxy_na[cxy];
1063 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1064 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1066 /* Sort the atoms within each x,y column on z coordinate */
1067 sort_atoms(ZZ, FALSE, dd_zone,
1068 nbs->a + ash, na, x,
1069 grid->c0[ZZ],
1070 1.0/grid->size[ZZ], ncz*grid->na_sc,
1071 sort_work);
1073 /* This loop goes over the supercells and subcells along z at once */
1074 for (int sub_z = 0; sub_z < ncz*c_gpuNumClusterPerCellZ; sub_z++)
1076 int ash_z = ash + sub_z*subdiv_z;
1077 int na_z = std::min(subdiv_z, na - (ash_z - ash));
1078 int cz = -1;
1079 /* We have already sorted on z */
1081 if (sub_z % c_gpuNumClusterPerCellZ == 0)
1083 cz = sub_z/c_gpuNumClusterPerCellZ;
1084 int c = grid->cxy_ind[cxy] + cz;
1086 /* The number of atoms in this supercell */
1087 int na_c = std::min(grid->na_sc, na - (ash_z - ash));
1089 grid->nsubc[c] = std::min(c_gpuNumClusterPerCell, (na_c + grid->na_c - 1)/grid->na_c);
1091 /* Store the z-boundaries of the super cell */
1092 grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1093 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z + na_c - 1]][ZZ];
1096 if (c_gpuNumClusterPerCellY > 1)
1098 /* Sort the atoms along y */
1099 sort_atoms(YY, (sub_z & 1), dd_zone,
1100 nbs->a+ash_z, na_z, x,
1101 grid->c0[YY] + cy*grid->sy,
1102 grid->inv_sy, subdiv_z,
1103 sort_work);
1106 for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1108 int ash_y = ash_z + sub_y*subdiv_y;
1109 int na_y = std::min(subdiv_y, na - (ash_y - ash));
1111 if (c_gpuNumClusterPerCellX > 1)
1113 /* Sort the atoms along x */
1114 sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1115 nbs->a + ash_y, na_y, x,
1116 grid->c0[XX] + cx*grid->sx,
1117 grid->inv_sx, subdiv_y,
1118 sort_work);
1121 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1123 int ash_x = ash_y + sub_x*subdiv_x;
1124 int na_x = std::min(subdiv_x, na - (ash_x - ash));
1126 fill_cell(nbs, grid, nbat,
1127 ash_x, ash_x + na_x, atinfo, x,
1128 bb_work_aligned);
1133 /* Set the unused atom indices to -1 */
1134 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1136 nbs->a[ash+ind] = -1;
1141 /* Determine in which grid column atoms should go */
1142 static void calc_column_indices(nbnxn_grid_t *grid,
1143 int a0, int a1,
1144 rvec *x,
1145 int dd_zone, const int *move,
1146 int thread, int nthread,
1147 int *cell,
1148 int *cxy_na)
1150 /* We add one extra cell for particles which moved during DD */
1151 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1153 cxy_na[i] = 0;
1156 int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
1157 int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;
1158 if (dd_zone == 0)
1160 /* Home zone */
1161 for (int i = n0; i < n1; i++)
1163 if (move == nullptr || move[i] >= 0)
1165 /* We need to be careful with rounding,
1166 * particles might be a few bits outside the local zone.
1167 * The int cast takes care of the lower bound,
1168 * we will explicitly take care of the upper bound.
1170 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1171 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1173 #ifndef NDEBUG
1174 if (cx < 0 || cx > grid->ncx ||
1175 cy < 0 || cy > grid->ncy)
1177 gmx_fatal(FARGS,
1178 "grid cell cx %d cy %d out of range (max %d %d)\n"
1179 "atom %f %f %f, grid->c0 %f %f",
1180 cx, cy, grid->ncx, grid->ncy,
1181 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1183 #endif
1184 /* Take care of potential rounding issues */
1185 cx = std::min(cx, grid->ncx - 1);
1186 cy = std::min(cy, grid->ncy - 1);
1188 /* For the moment cell will contain only the grid-local
1189 * x and y indices, not z.
1191 cell[i] = cx*grid->ncy + cy;
1193 else
1195 /* Put this moved particle after the end of the grid,
1196 * so we can process it later without using conditionals.
1198 cell[i] = grid->ncx*grid->ncy;
1201 cxy_na[cell[i]]++;
1204 else
1206 /* Non-home zone */
1207 for (int i = n0; i < n1; i++)
1209 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1210 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1212 /* For non-home zones there could be particles outside
1213 * the non-bonded cut-off range, which have been communicated
1214 * for bonded interactions only. For the result it doesn't
1215 * matter where these end up on the grid. For performance
1216 * we put them in an extra row at the border.
1218 cx = std::max(cx, 0);
1219 cx = std::min(cx, grid->ncx - 1);
1220 cy = std::max(cy, 0);
1221 cy = std::min(cy, grid->ncy - 1);
1223 /* For the moment cell will contain only the grid-local
1224 * x and y indices, not z.
1226 cell[i] = cx*grid->ncy + cy;
1228 cxy_na[cell[i]]++;
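/* Small worked example (illustrative only): with grid->ncy = 3, an atom
 * falling in column cx = 2, cy = 1 gets cell[i] = 2*3 + 1 = 7; the z index is
 * only assigned later, when the columns are sorted and filled.
 */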
1233 /* Determine in which grid cells the atoms should go */
1234 static void calc_cell_indices(const nbnxn_search_t nbs,
1235 int dd_zone,
1236 nbnxn_grid_t *grid,
1237 int a0, int a1,
1238 const int *atinfo,
1239 rvec *x,
1240 const int *move,
1241 nbnxn_atomdata_t *nbat)
1243 int n0, n1;
1244 int cx, cy, cxy, ncz_max, ncz;
1245 int nthread;
1246 int cxy_na_i;
1248 nthread = gmx_omp_nthreads_get(emntPairsearch);
1250 #pragma omp parallel for num_threads(nthread) schedule(static)
1251 for (int thread = 0; thread < nthread; thread++)
1255 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1256 nbs->cell, nbs->work[thread].cxy_na);
1258 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1261 /* Make the cell index as a function of x and y */
1262 ncz_max = 0;
1263 ncz = 0;
1264 grid->cxy_ind[0] = 0;
1265 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1267 /* We set ncz_max at the beginning of the loop instead of at the end
1268 * to skip i=grid->ncx*grid->ncy, which holds the moved particles
1269 * that do not need to be ordered on the grid.
1271 if (ncz > ncz_max)
1273 ncz_max = ncz;
1275 cxy_na_i = nbs->work[0].cxy_na[i];
1276 for (int thread = 1; thread < nthread; thread++)
1278 cxy_na_i += nbs->work[thread].cxy_na[i];
1280 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1281 if (nbat->XFormat == nbatX8)
1283 /* Make the number of cells a multiple of 2 */
1284 ncz = (ncz + 1) & ~1;
1286 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1287 /* Clear cxy_na, so we can reuse the array below */
1288 grid->cxy_na[i] = 0;
1290 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1292 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1294 if (debug)
1296 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1297 grid->na_sc, grid->na_c, grid->nc,
1298 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1299 ncz_max);
1300 if (gmx_debug_at)
1302 int i = 0;
1303 for (cy = 0; cy < grid->ncy; cy++)
1305 for (cx = 0; cx < grid->ncx; cx++)
1307 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1308 i++;
1310 fprintf(debug, "\n");
1315 /* Make sure the work array for sorting is large enough */
1316 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1318 for (int thread = 0; thread < nbs->nthread_max; thread++)
1320 nbs->work[thread].sort_work_nalloc =
1321 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1322 srenew(nbs->work[thread].sort_work,
1323 nbs->work[thread].sort_work_nalloc);
1324 /* When not in use, all elements should be -1 */
1325 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1327 nbs->work[thread].sort_work[i] = -1;
1332 /* Now that we know the dimensions, we can fill the grid.
1333 * This is the first, unsorted fill. We sort the columns after this.
1335 for (int i = a0; i < a1; i++)
1337 /* At this point nbs->cell contains the local grid x,y indices */
1338 cxy = nbs->cell[i];
1339 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1342 if (dd_zone == 0)
1344 /* Set the cell indices for the moved particles */
1345 n0 = grid->nc*grid->na_sc;
1346 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1347 if (dd_zone == 0)
1349 for (int i = n0; i < n1; i++)
1351 nbs->cell[nbs->a[i]] = i;
1356 /* Sort the super-cell columns along z into the sub-cells. */
1357 #pragma omp parallel for num_threads(nthread) schedule(static)
1358 for (int thread = 0; thread < nthread; thread++)
1362 if (grid->bSimple)
1364 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1365 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1366 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1367 nbs->work[thread].sort_work);
1369 else
1371 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1372 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1373 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1374 nbs->work[thread].sort_work);
1377 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1380 if (grid->bSimple && nbat->XFormat == nbatX8)
1382 combine_bounding_box_pairs(grid, grid->bb);
1385 if (!grid->bSimple)
1387 grid->nsubc_tot = 0;
1388 for (int i = 0; i < grid->nc; i++)
1390 grid->nsubc_tot += grid->nsubc[i];
1394 if (debug)
1396 if (grid->bSimple)
1398 print_bbsizes_simple(debug, grid);
1400 else
1402 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1403 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1405 print_bbsizes_supersub(debug, grid);
1410 /* Sets up a grid and puts the atoms on the grid.
1411 * This function only operates on one domain of the domain decomposition.
1412 * Note that without domain decomposition there is only one domain.
1414 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1415 int ePBC, matrix box,
1416 int dd_zone,
1417 rvec corner0, rvec corner1,
1418 int a0, int a1,
1419 real atom_density,
1420 const int *atinfo,
1421 rvec *x,
1422 int nmoved, int *move,
1423 int nb_kernel_type,
1424 nbnxn_atomdata_t *nbat)
1426 nbnxn_grid_t *grid;
1427 int n;
1428 int nc_max_grid, nc_max;
1430 grid = &nbs->grid[dd_zone];
1432 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1434 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1436 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1437 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1438 grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1439 grid->na_c_2log = get_2log(grid->na_c);
1441 nbat->na_c = grid->na_c;
1443 if (dd_zone == 0)
1445 grid->cell0 = 0;
1447 else
1449 grid->cell0 =
1450 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1451 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1454 n = a1 - a0;
1456 if (dd_zone == 0)
1458 nbs->ePBC = ePBC;
1459 copy_mat(box, nbs->box);
1461 /* Avoid zero density */
1462 if (atom_density > 0)
1464 grid->atom_density = atom_density;
1466 else
1468 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1471 grid->cell0 = 0;
1473 nbs->natoms_local = a1 - nmoved;
1474 /* We assume that nbnxn_put_on_grid is called first
1475 * for the local atoms (dd_zone=0).
1477 nbs->natoms_nonlocal = a1 - nmoved;
1479 if (debug)
1481 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1482 nbs->natoms_local, grid->atom_density);
1485 else
1487 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
1490 /* We always use the home zone (grid[0]) for setting the cell size,
1491 * since determining densities for non-local zones is difficult.
1493 nc_max_grid = set_grid_size_xy(nbs, grid,
1494 dd_zone, n-nmoved, corner0, corner1,
1495 nbs->grid[0].atom_density);
1497 nc_max = grid->cell0 + nc_max_grid;
1499 if (a1 > nbs->cell_nalloc)
1501 nbs->cell_nalloc = over_alloc_large(a1);
1502 srenew(nbs->cell, nbs->cell_nalloc);
1505 /* To avoid conditionals we store the moved particles at the end of a;
1506 * make sure we have enough space.
1508 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1510 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1511 srenew(nbs->a, nbs->a_nalloc);
1514 /* We need padding up to a multiple of the buffer flag size: simply add */
1515 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1517 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1520 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1522 if (dd_zone == 0)
1524 nbat->natoms_local = nbat->natoms;
1527 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1530 /* Calls nbnxn_put_on_grid for all non-local domains */
1531 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1532 const struct gmx_domdec_zones_t *zones,
1533 const int *atinfo,
1534 rvec *x,
1535 int nb_kernel_type,
1536 nbnxn_atomdata_t *nbat)
1538 rvec c0, c1;
1540 for (int zone = 1; zone < zones->n; zone++)
1542 for (int d = 0; d < DIM; d++)
1544 c0[d] = zones->size[zone].bb_x0[d];
1545 c1[d] = zones->size[zone].bb_x1[d];
1548 nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1549 zone, c0, c1,
1550 zones->cg_range[zone],
1551 zones->cg_range[zone+1],
1553 atinfo,
1555 0, nullptr,
1556 nb_kernel_type,
1557 nbat);
1561 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1563 *ncx = nbs->grid[0].ncx;
1564 *ncy = nbs->grid[0].ncy;
1567 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1569 const nbnxn_grid_t *grid;
1571 grid = &nbs->grid[0];
1573 /* Return the atom order for the home cell (index 0) */
1574 *a = nbs->a;
1576 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1579 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1581 /* Set the atom order for the home cell (index 0) */
1582 nbnxn_grid_t *grid = &nbs->grid[0];
1584 int ao = 0;
1585 for (int cx = 0; cx < grid->ncx; cx++)
1587 for (int cy = 0; cy < grid->ncy; cy++)
1589 int cxy = cx*grid->ncy + cy;
1590 int j = grid->cxy_ind[cxy]*grid->na_sc;
1591 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1593 nbs->a[j] = ao;
1594 nbs->cell[ao] = j;
1595 ao++;
1596 j++;