/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_grid.h"

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"
struct gmx_domdec_zones_t;
static void nbnxn_grid_init(nbnxn_grid_t *grid)
{
    grid->cxy_na  = nullptr;
    grid->cxy_ind = nullptr;
}
void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
{
    nbs->ngrid = ngrid;

    snew(nbs->grid, nbs->ngrid);
    for (int g = 0; g < nbs->ngrid; g++)
    {
        nbnxn_grid_init(&nbs->grid[g]);
    }
}
static real grid_atom_density(int n, rvec corner0, rvec corner1)
{
    rvec size;

    if (n == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        n = 1;
    }

    rvec_sub(corner1, corner0, size);

    return n/(size[XX]*size[YY]*size[ZZ]);
}
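
/* Worked example for grid_atom_density above (illustrative numbers, not
 * taken from a simulation): for a cubic 3 nm box with n = 2700 atoms, the
 * box volume is size[XX]*size[YY]*size[ZZ] = 27 nm^3 and the returned
 * density is 2700/27 = 100 atoms/nm^3, roughly that of liquid water.
 */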
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t *grid,
                            int dd_zone,
                            int n, rvec corner0, rvec corner1,
                            real atom_density)
{
    rvec size;
    int  na_c;
    real tlen, tlen_x, tlen_y;

    rvec_sub(corner1, corner0, size);

    assert(atom_density > 0);

    /* target cell length */
    if (grid->bSimple)
    {
        /* To minimize the zero interactions, we should make
         * the largest of the i/j cell cubic.
         */
        na_c = std::max(grid->na_c, grid->na_cj);

        /* Approximately cubic cells */
        tlen   = std::cbrt(na_c/atom_density);
        tlen_x = tlen;
        tlen_y = tlen;
    }
    else
    {
        /* Approximately cubic sub cells */
        tlen   = std::cbrt(grid->na_c/atom_density);
        tlen_x = tlen*c_gpuNumClusterPerCellX;
        tlen_y = tlen*c_gpuNumClusterPerCellY;
    }
    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the pairlist when the fixed cell dimensions (x,y) are
     * larger than the variable one (z), rather than the other way around.
     */
    grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
    grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;
    if (dd_zone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        grid->ncx++;
        grid->ncy++;
    }

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy + 1 > grid->cxy_nalloc)
    {
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy + 1);
        srenew(grid->cxy_na, grid->cxy_nalloc);
        srenew(grid->cxy_ind, grid->cxy_nalloc + 1);
    }
    for (int t = 0; t < nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy + 1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy + 1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
        }
    }
    /* Worst case scenario of 1 atom in each last cell */
    int nc_max;
    if (grid->na_cj <= grid->na_c)
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }
    if (nc_max > grid->nc_nalloc)
    {
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc, grid->nc_nalloc);
        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);

        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating exceptions in SIMD with the unused bb elements.
         */
        if (grid->bSimple)
        {
            snew_aligned(grid->bb, grid->nc_nalloc, 16);
        }
        else
        {
#if NBNXN_BBXXXX
            int pbb_nalloc = grid->nc_nalloc*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX;
            snew_aligned(grid->pbb, pbb_nalloc, 16);
#else
            snew_aligned(grid->bb, grid->nc_nalloc*c_gpuNumClusterPerCell, 16);
#endif
        }
        if (grid->na_cj == grid->na_c)
        {
            grid->bbj = grid->bb;
        }
        else
        {
            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
        }
        srenew(grid->flags, grid->nc_nalloc);
        if (nbs->bFEP)
        {
            srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
        }
    }

    copy_rvec(corner0, grid->c0);
    copy_rvec(corner1, grid->c1);
    copy_rvec(size,    grid->size);

    return nc_max;
}
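
/* Worked example of the target cell length above (illustrative numbers):
 * with na_c = 4 atoms per cell and atom_density = 100 atoms/nm^3,
 * tlen = cbrt(4/100) is about 0.34 nm, so a 3 nm box edge gives
 * ncx = max(1, (int)(3/0.34)) = 8 columns of width sx = 3/8 = 0.375 nm.
 */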
/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used; in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
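
/* Worked example of the hole bookkeeping (illustrative numbers): for a
 * column with n = n_per_h = 16 particles there are
 * n_per_h*SORT_GRID_OVERSIZE = 64 primary holes plus n = 16 overflow
 * holes at the end, i.e. 80 = n*SGSF slots in total, which matches the
 * allocation rule given in the comment above sort_atoms() below.
 */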
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, rvec *x,
                       real h0, real invh, int n_per_h,
                       int *sort)
{
    if (n > n_per_h)
    {
        gmx_incons("n > n_per_h");
    }

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In the worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;
    int zi_max = -1;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);
        }
        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h*SORT_GRID_OVERSIZE)
        {
            zi = n_per_h*SORT_GRID_OVERSIZE;
        }

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in which case we find the first empty cell higher up.
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = std::min(zi_min, zi);
            zi_max   = std::max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order,
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
                                       a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                int cp  = sort[zi];
                int zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp   = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = std::max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = std::max(zi_max, zi);
        }
    }
    /* Second pass: make the sorted elements contiguous again */
    int c = 0;
    if (!Backwards)
    {
        for (int zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (int zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
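
/* R2F_D/R2F_U convert a real bound to float while always rounding towards
 * a larger bounding box: lower bounds are scaled by (1 - GMX_FLOAT_EPS),
 * upper bounds by (1 + GMX_FLOAT_EPS), and the factors swap for negative
 * values. This guarantees that the float box still contains all atoms of
 * the original double-precision box, at the cost of the box being at most
 * about one part in 1e-7 too large.
 */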
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX4];
    xh = x[XX*c_packX4];
    yl = x[YY*c_packX4];
    yh = x[YY*c_packX4];
    zl = x[ZZ*c_packX4];
    zh = x[ZZ*c_packX4];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX4]);
        xh = std::max(xh, x[j+XX*c_packX4]);
        yl = std::min(yl, x[j+YY*c_packX4]);
        yh = std::max(yh, x[j+YY*c_packX4]);
        zl = std::min(zl, x[j+ZZ*c_packX4]);
        zh = std::max(zh, x[j+ZZ*c_packX4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX8];
    xh = x[XX*c_packX8];
    yl = x[YY*c_packX8];
    yh = x[YY*c_packX8];
    zl = x[ZZ*c_packX8];
    zh = x[ZZ*c_packX8];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX8]);
        xh = std::max(xh, x[j+XX*c_packX8]);
        yl = std::min(yl, x[j+YY*c_packX8]);
        yh = std::max(yh, x[j+YY*c_packX8]);
        zl = std::min(zl, x[j+ZZ*c_packX8]);
        zh = std::max(zh, x[j+ZZ*c_packX8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
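
/* Memory layout assumed by the x4/x8 variants above: coordinates are packed
 * per c_packX4 (or c_packX8) atoms as
 *   x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3  x4 x5 ...
 * (sketch for c_packX4 == 4), so the X coordinate of atom j inside a packet
 * sits at x[j + XX*c_packX4] and one whole dimension of a packet can be
 * processed with a single aligned SIMD load.
 */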
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#if NBNXN_SEARCH_BB_SIMD4
        store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
        store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
#else
        bbj[1] = bbj[0];
#endif
    }
#if NBNXN_SEARCH_BB_SIMD4
    store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
    store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
#else
    for (int i = 0; i < NNBSBB_C; i++)
    {
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
    }
#endif
}
#if NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float bb_0_S, bb_1_S;
    Simd4Float x_S;

    bb_0_S = load4(x);
    bb_1_S = bb_0_S;

    for (int i = 1; i < na; i++)
    {
        x_S    = load4(x+i*NNBSBB_C);
        bb_0_S = min(bb_0_S, x_S);
        bb_1_S = max(bb_1_S, x_S);
    }

    store4(&bb->lower[0], bb_0_S);
    store4(&bb->upper[0], bb_1_S);
}
/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
                                         real *bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
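
/* Sketch of the packed "xxxx" bounding-box layout written above (assuming
 * STRIDE_PBB == 4): for each group of four clusters the six corner
 * components are stored as
 *   xlo0..xlo3  ylo0..ylo3  zlo0..zlo3  xhi0..xhi3  yhi0..yhi3  zhi0..zhi3
 * so bb[c*STRIDE_PBB] indexes component plane c, and a SIMD4 distance
 * kernel can load the same component of four cluster boxes with one load4().
 */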
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    for (int i = 0; i < grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);
        int c2;
        for (c2 = sc2; c2 < sc2+nc2; c2++)
        {
#if NBNXN_SEARCH_BB_SIMD4
            Simd4Float min_S, max_S;

            min_S = min(load4(&bb[c2*2+0].lower[0]),
                        load4(&bb[c2*2+1].lower[0]));
            max_S = max(load4(&bb[c2*2+0].upper[0]),
                        load4(&bb[c2*2+1].upper[0]));
            store4(&grid->bbj[c2].lower[0], min_S);
            store4(&grid->bbj[c2].upper[0], max_S);
#else
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);
            }
#endif
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
            }
        }
    }
}
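
/* Pairing sketch for the loop above: with na_cj == 2*na_c, i-cluster
 * bounding boxes 2k and 2k+1 of a column are merged into the j-cluster box
 * bbj[k] by a component-wise min of the lower and max of the upper corners;
 * an odd trailing box is simply duplicated so bbj is always fully defined.
 */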
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE                *fp,
                                 const nbnxn_grid_t  *grid)
{
    dvec ba;

    clear_dvec(ba);
    for (int c = 0; c < grid->nc; c++)
    {
        for (int d = 0; d < DIM; d++)
        {
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
        }
    }
    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx, grid->sy,
            grid->atom_density > 0 ?
            grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]/grid->sx, ba[YY]/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE                *fp,
                                   const nbnxn_grid_t  *grid)
{
    int  ns;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (int c = 0; c < grid->nc; c++)
    {
#if NBNXN_BBXXXX
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
        {
            int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
                    ba[d] +=
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d*STRIDE_PBB+i];
                }
            }
        }
#else
        for (int s = 0; s < grid->nsubc[c]; s++)
        {
            int cs = c*c_gpuNumClusterPerCell + s;
            for (int d = 0; d < DIM; d++)
            {
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx/c_gpuNumClusterPerCellX,
            grid->sy/c_gpuNumClusterPerCellY,
            grid->atom_density > 0 ?
            grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
            ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
}
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int natoms_cluster,
                                 int a0, int a1, const int *atinfo,
                                 int *order,
                                 int *flags)
{
    const int natoms_cluster_max = 8;
    int       sort1[natoms_cluster_max];
    int       sort2[natoms_cluster_max];

    assert(natoms_cluster <= natoms_cluster_max);

    *flags = 0;

    int subc = 0;
    for (int s = a0; s < a1; s += natoms_cluster)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        int      n1       = 0;
        int      n2       = 0;
        gmx_bool haveQ    = FALSE;
        int      a_lj_max = -1;
        for (int a = s; a < std::min(s + natoms_cluster, a1); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= natoms_cluster)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= natoms_cluster)
                {
                    for (int i = 0; i < n1; i++)
                    {
                        order[a0 + i] = sort1[i];
                    }
                    for (int j = 0; j < n2; j++)
                    {
                        order[a0 + n1 + j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
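
/* Worked example for the half-LJ optimization above (hypothetical cluster):
 * in an 8-atom cluster where only 3 atoms have LJ parameters, 2*n1 = 6 <= 8
 * holds, so the LJ atoms are moved to the front and NBNXN_CI_HALF_LJ is set,
 * letting the kernel skip LJ for the second half of the cluster. With 5 LJ
 * atoms the cluster is left unsorted and only NBNXN_CI_DO_LJ is set.
 */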
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(const nbnxn_search_t nbs,
                      nbnxn_grid_t *grid,
                      nbnxn_atomdata_t *nbat,
                      int a0, int a1,
                      const int *atinfo,
                      rvec *x,
                      nbnxn_bb_t gmx_unused *bb_work_aligned)
{
    size_t      offset;
    nbnxn_bb_t *bb_ptr;
    int         na = a1 - a0;

    if (grid->bSimple)
    {
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
                             grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
    }
    if (nbs->bFEP)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        int c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
        grid->fep[c] = 0;
        for (int at = a0; at < a1; at++)
        {
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
            {
                grid->fep[c] |= (1 << (at - a0));
            }
        }
    }
876 for (a
= a0
; a
< a1
; a
++)
878 nbs
->cell
[nbs
->a
[a
]] = a
;
881 copy_rvec_to_nbat_real(nbs
->a
+a0
, a1
-a0
, grid
->na_c
, x
,
882 nbat
->XFormat
, nbat
->x
, a0
);
    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

#if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr,
                                          grid->bbj + offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(na, nbat->x + atom_to_x_index<c_packX4>(a0), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(a0), bb_ptr);
    }
#if NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        float *pbb_ptr =
            grid->pbb +
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));

#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
                                   pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    a0 >> grid->na_c_2log,
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb + ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log);

        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    a0 >> grid->na_c_2log,
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                int dd_zone,
                                nbnxn_grid_t *grid,
                                int a0, int a1,
                                const int *atinfo,
                                rvec *x,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start, int cxy_end,
                                int *sort_work)
{
    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);
    }
    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, na, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], ncz*grid->na_sc,
                   sort_work);
        /* Fill the ncz cells in this column */
        int cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
        {
            int c = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      nullptr);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
        }

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  int dd_zone,
                                  nbnxn_grid_t *grid,
                                  int a0, int a1,
                                  const int *atinfo,
                                  rvec *x,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start, int cxy_end,
                                  int *sort_work)
{
    nbnxn_bb_t  bb_work_array[2];
    nbnxn_bb_t *bb_work_aligned;

    bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);
    }

    int subdiv_x = grid->na_c;
    int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
    int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
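
    /* Subdivision sketch (illustrative sizes): with na_c = 8 and
     * c_gpuNumClusterPerCell{X,Y,Z} all equal to 2, subdiv_x = 8,
     * subdiv_y = 16 and subdiv_z = 32, so one super-cell of 2x2x2 clusters
     * holds na_sc = 64 atoms; the z, y and x sorts below split each column
     * into progressively smaller slabs before fill_cell() packs the clusters.
     */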
    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int cx = cxy/grid->ncy;
        int cy = cxy - cx*grid->ncy;

        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, na, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], ncz*grid->na_sc,
                   sort_work);
        /* This loop goes over the supercells and subcells along z at once */
        int cz = -1;
        for (int sub_z = 0; sub_z < ncz*c_gpuNumClusterPerCellZ; sub_z++)
        {
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, na - (ash_z - ash));

            /* We have already sorted on z */

            if (sub_z % c_gpuNumClusterPerCellZ == 0)
            {
                cz = sub_z/c_gpuNumClusterPerCellZ;
                int c = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                int na_c = std::min(grid->na_sc, na - (ash_z - ash));

                grid->nsubc[c] = std::min(c_gpuNumClusterPerCell, (na_c + grid->na_c - 1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z + na_c - 1]][ZZ];
            }

            if (c_gpuNumClusterPerCellY > 1)
            {
                /* Sort the atoms along y */
                sort_atoms(YY, (sub_z & 1), dd_zone,
                           nbs->a + ash_z, na_z, x,
                           grid->c0[YY] + cy*grid->sy,
                           grid->inv_sy, subdiv_z,
                           sort_work);
            }

            for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
            {
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, na - (ash_y - ash));

                if (c_gpuNumClusterPerCellX > 1)
                {
                    /* Sort the atoms along x */
                    sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
                               nbs->a + ash_y, na_y, x,
                               grid->c0[XX] + cx*grid->sx,
                               grid->inv_sx, subdiv_y,
                               sort_work);
                }

                for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
                {
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, na - (ash_x - ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x + na_x, atinfo, x,
                              bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int a0, int a1,
                                rvec *x,
                                int dd_zone, const int *move,
                                int thread, int nthread,
                                int *cell,
                                int *cxy_na)
{
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
    int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;

    if (dd_zone == 0)
    {
        /* Home zone */
        for (int i = n0; i < n1; i++)
        {
            if (move == nullptr || move[i] >= 0)
            {
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
                }

                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->ncx - 1);
                cy = std::min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the grid-local
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;
            }

            cxy_na[cell[i]]++;
        }
    }
    else
    {
        /* Non-home zone */
        for (int i = n0; i < n1; i++)
        {
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->ncx - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

            cxy_na[cell[i]]++;
        }
    }
}
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              int dd_zone,
                              nbnxn_grid_t *grid,
                              int a0, int a1,
                              const int *atinfo,
                              rvec *x,
                              const int *move,
                              nbnxn_atomdata_t *nbat)
{
    int cx, cy, cxy, ncz_max, ncz;

    int nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
    /* Make the cell index as a function of x and y */
    ncz_max          = 0;
    ncz              = 0;
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        int cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            int i = 0;
            for (cy = 0; cy < grid->ncy; cy++)
            {
                for (cx = 0; cx < grid->ncx; cx++)
                {
                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for (int thread = 0; thread < nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
            {
                nbs->work[thread].sort_work[i] = -1;
            }
        }
    }
    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = a0; i < a1; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    if (dd_zone == 0)
    {
        /* Set the cell indices for the moved particles */
        int n0 = grid->nc*grid->na_sc;
        int n1 = grid->nc*grid->na_sc + grid->cxy_na[grid->ncx*grid->ncy];
        for (int i = n0; i < n1; i++)
        {
            nbs->cell[nbs->a[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            if (grid->bSimple)
            {
                sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                    ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                    ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                    nbs->work[thread].sort_work);
            }
            else
            {
                sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                      ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                      ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                      nbs->work[thread].sort_work);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb);
    }

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug, grid);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, grid);
        }
    }
}
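
/* Worked example of the cell-count arithmetic above (illustrative numbers):
 * a column holding cxy_na_i = 37 atoms with na_sc = 16 atoms per cell needs
 * ncz = (37 + 16 - 1)/16 = 3 cells; with nbat->XFormat == nbatX8 this is
 * rounded up to the even number 4, so bounding boxes can later be combined
 * in pairs by combine_bounding_box_pairs().
 */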
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
                              int                   natoms)
{
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    }
    for (int b = 0; b < flags->nflag; b++)
    {
        bitmask_clear(&(flags->flag[b]));
    }
}
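
/* Flag-count sketch for init_buffer_flags above (illustrative numbers):
 * assuming NBNXN_BUFFERFLAG_SIZE is 16, natoms = 100 gives
 * nflag = (100 + 16 - 1)/16 = 7 bitmask blocks; the round-up guarantees
 * that the trailing partial block of atoms also gets a flag.
 */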
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC, matrix box,
                       int dd_zone,
                       rvec corner0, rvec corner1,
                       int a0, int a1,
                       real atom_density,
                       const int *atinfo,
                       rvec *x,
                       int nmoved, int *move,
                       int nb_kernel_type,
                       nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid;
    int           n;
    int           nc_max_grid, nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;
    if (dd_zone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
    }

    n = a1 - a0;

    if (dd_zone == 0)
    {
        nbs->ePBC = ePBC;
        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atom_density > 0)
        {
            grid->atom_density = atom_density;
        }
        else
        {
            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
        }

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;

        if (debug)
        {
            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);
        }
    }
    else
    {
        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
    }

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    nc_max_grid = set_grid_size_xy(nbs, grid,
                                   dd_zone, n-nmoved, corner0, corner1,
                                   nbs->grid[0].atom_density);

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
    {
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell, nbs->cell_nalloc);
    }

    /* To avoid conditionals we store the moved particles at the end of a,
     * so make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
    {
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a, nbs->a_nalloc);
    }

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
    {
        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE);
    }

    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);

    if (dd_zone == 0)
    {
        nbat->natoms_local = nbat->natoms;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
                                const struct gmx_domdec_zones_t *zones,
                                const int                       *atinfo,
                                rvec                            *x,
                                int                              nb_kernel_type,
                                nbnxn_atomdata_t                *nbat)
{
    rvec c0, c1;

    for (int zone = 1; zone < zones->n; zone++)
    {
        for (int d = 0; d < DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
                          zone, c0, c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0, nullptr,
                          nb_kernel_type,
                          nbat);
    }
}
1579 void nbnxn_grid_add_simple(nbnxn_search_t nbs
,
1580 nbnxn_atomdata_t
*nbat
)
1587 grid
= &nbs
->grid
[0];
1591 gmx_incons("nbnxn_grid_simple called with a simple grid");
1594 ncd
= grid
->na_sc
/NBNXN_CPU_CLUSTER_I_SIZE
;
1596 if (grid
->nc
*ncd
> grid
->nc_nalloc_simple
)
1598 grid
->nc_nalloc_simple
= over_alloc_large(grid
->nc
*ncd
);
1599 srenew(grid
->bbcz_simple
, grid
->nc_nalloc_simple
*NNBSBB_D
);
1600 srenew(grid
->bb_simple
, grid
->nc_nalloc_simple
);
1601 srenew(grid
->flags_simple
, grid
->nc_nalloc_simple
);
1604 sfree_aligned(grid
->bbj
);
1605 snew_aligned(grid
->bbj
, grid
->nc_nalloc_simple
/2, 16);
1609 bbcz
= grid
->bbcz_simple
;
1610 bb
= grid
->bb_simple
;
1612 #if GMX_OPENMP && !(defined __clang_analyzer__)
1613 // cppcheck-suppress unreadVariable
1614 int nthreads
= gmx_omp_nthreads_get(emntPairsearch
);
1617 #pragma omp parallel for num_threads(nthreads) schedule(static)
1618 for (int sc
= 0; sc
< grid
->nc
; sc
++)
        try
        {
            for (int c = 0; c < ncd; c++)
            {
                int tx = sc*ncd + c;
                int na = NBNXN_CPU_CLUSTER_I_SIZE;
                while (na > 0 &&
                       nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
                {
                    na--;
                }

                if (na > 0)
                {
                    switch (nbat->XFormat)
                    {
                        case nbatX4:
                            /* c_packX4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                            calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
                                                   bb+tx);
                            break;
                        case nbatX8:
                            /* c_packX8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                            calc_bounding_box_x_x8(na, nbat->x + atom_to_x_index<c_packX8>(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                                                   bb+tx);
                            break;
                        default:
                            calc_bounding_box(na, nbat->xstride,
                                              nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
                                              bb+tx);
                            break;
                    }
                    bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
                    bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];

                    /* No interaction optimization yet here */
                    grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
                }
                else
                {
                    grid->flags_simple[tx] = 0;
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb_simple);
    }
}
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
{
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;
}
void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
{
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *a = nbs->a;

    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
}
)
1693 /* Set the atom order for the home cell (index 0) */
1694 nbnxn_grid_t
*grid
= &nbs
->grid
[0];
1697 for (int cx
= 0; cx
< grid
->ncx
; cx
++)
1699 for (int cy
= 0; cy
< grid
->ncy
; cy
++)
1701 int cxy
= cx
*grid
->ncy
+ cy
;
1702 int j
= grid
->cxy_ind
[cxy
]*grid
->na_sc
;
1703 for (int cz
= 0; cz
< grid
->cxy_na
[cxy
]; cz
++)