2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "nbnxn_grid.h"
46 #include "gromacs/domdec/domdec_struct.h"
47 #include "gromacs/math/utilities.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/mdlib/gmx_omp_nthreads.h"
50 #include "gromacs/mdlib/nb_verlet.h"
51 #include "gromacs/mdlib/nbnxn_atomdata.h"
52 #include "gromacs/mdlib/nbnxn_consts.h"
53 #include "gromacs/mdlib/nbnxn_internal.h"
54 #include "gromacs/mdlib/nbnxn_search.h"
55 #include "gromacs/mdlib/nbnxn_util.h"
56 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/simd/vector_operations.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/smalloc.h"
62 struct gmx_domdec_zones_t
;
64 static real
grid_atom_density(int numAtoms
,
65 const rvec lowerCorner
,
66 const rvec upperCorner
)
72 /* To avoid zero density we use a minimum of 1 atom */
76 rvec_sub(upperCorner
, lowerCorner
, size
);
78 return numAtoms
/(size
[XX
]*size
[YY
]*size
[ZZ
]);
81 static void set_grid_size_xy(const nbnxn_search_t nbs
,
85 const rvec lowerCorner
,
86 const rvec upperCorner
,
90 real tlen
, tlen_x
, tlen_y
;
92 rvec_sub(upperCorner
, lowerCorner
, size
);
94 if (numAtoms
> grid
->na_sc
)
96 GMX_ASSERT(atomDensity
> 0, "With one or more atoms, the density should be positive");
98 /* target cell length */
101 /* To minimize the zero interactions, we should make
102 * the largest of the i/j cell cubic.
104 int numAtomsInCell
= std::max(grid
->na_c
, grid
->na_cj
);
106 /* Approximately cubic cells */
107 tlen
= std::cbrt(numAtomsInCell
/atomDensity
);
113 /* Approximately cubic sub cells */
114 tlen
= std::cbrt(grid
->na_c
/atomDensity
);
115 tlen_x
= tlen
*c_gpuNumClusterPerCellX
;
116 tlen_y
= tlen
*c_gpuNumClusterPerCellY
;
118 /* We round ncx and ncy down, because we get less cell pairs
119 * in the nbsist when the fixed cell dimensions (x,y) are
120 * larger than the variable one (z) than the other way around.
122 grid
->ncx
= std::max(1, static_cast<int>(size
[XX
]/tlen_x
));
123 grid
->ncy
= std::max(1, static_cast<int>(size
[YY
]/tlen_y
));
131 grid
->sx
= size
[XX
]/grid
->ncx
;
132 grid
->sy
= size
[YY
]/grid
->ncy
;
133 grid
->inv_sx
= 1/grid
->sx
;
134 grid
->inv_sy
= 1/grid
->sy
;
138 /* This is a non-home zone, add an extra row of cells
139 * for particles communicated for bonded interactions.
140 * These can be beyond the cut-off. It doesn't matter where
141 * they end up on the grid, but for performance it's better
142 * if they don't end up in cells that can be within cut-off range.
148 /* We need one additional cell entry for particles moved by DD */
149 grid
->cxy_na
.resize(grid
->ncx
*grid
->ncy
+ 1);
150 grid
->cxy_ind
.resize(grid
->ncx
*grid
->ncy
+ 2);
152 for (int t
= 0; t
< nbs
->nthread_max
; t
++)
154 if (grid
->ncx
*grid
->ncy
+1 > nbs
->work
[t
].cxy_na_nalloc
)
156 nbs
->work
[t
].cxy_na_nalloc
= over_alloc_large(grid
->ncx
*grid
->ncy
+1);
157 srenew(nbs
->work
[t
].cxy_na
, nbs
->work
[t
].cxy_na_nalloc
);
161 /* Worst case scenario of 1 atom in each last cell */
163 if (grid
->na_cj
<= grid
->na_c
)
165 maxNumCells
= numAtoms
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
;
169 maxNumCells
= numAtoms
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
*grid
->na_cj
/grid
->na_c
;
172 grid
->nsubc
.resize(maxNumCells
);
173 grid
->bbcz
.resize(maxNumCells
*NNBSBB_D
);
175 /* This resize also zeros the contents, this avoid possible
176 * floating exceptions in SIMD with the unused bb elements.
180 grid
->bb
.resize(maxNumCells
);
185 grid
->pbb
.resize(maxNumCells
*c_gpuNumClusterPerCell
/STRIDE_PBB
*NNBSBB_XXXX
);
187 grid
->bb
.resize(maxNumCells
*c_gpuNumClusterPerCell
);
193 if (grid
->na_cj
== grid
->na_c
)
195 grid
->bbj
= grid
->bb
;
199 grid
->bbjStorage
.resize(maxNumCells
*grid
->na_c
/grid
->na_cj
);
200 grid
->bbj
= grid
->bbjStorage
;
204 grid
->flags
.resize(maxNumCells
);
207 grid
->fep
.resize(maxNumCells
*grid
->na_sc
/grid
->na_c
);
210 copy_rvec(lowerCorner
, grid
->c0
);
211 copy_rvec(upperCorner
, grid
->c1
);
212 copy_rvec(size
, grid
->size
);
215 /* We need to sort paricles in grid columns on z-coordinate.
216 * As particle are very often distributed homogeneously, we a sorting
217 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
218 * by a factor, cast to an int and try to store in that hole. If the hole
219 * is full, we move this or another particle. A second pass is needed to make
220 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
221 * 4 is the optimal value for homogeneous particle distribution and allows
222 * for an O(#particles) sort up till distributions were all particles are
223 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
224 * as it can be expensive to detect imhomogeneous particle distributions.
225 * SGSF is the maximum ratio of holes used, in the worst case all particles
226 * end up in the last hole and we need #particles extra holes at the end.
228 #define SORT_GRID_OVERSIZE 4
229 #define SGSF (SORT_GRID_OVERSIZE + 1)
231 /* Sort particle index a on coordinates x along dim.
232 * Backwards tells if we want decreasing iso increasing coordinates.
233 * h0 is the minimum of the coordinate range.
234 * invh is the 1/length of the sorting range.
235 * n_per_h (>=n) is the expected average number of particles per 1/invh
236 * sort is the sorting work array.
237 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
238 * or easier, allocate at least n*SGSF elements.
240 static void sort_atoms(int dim
, gmx_bool Backwards
,
241 int gmx_unused dd_zone
,
242 int *a
, int n
, const rvec
*x
,
243 real h0
, real invh
, int n_per_h
,
255 gmx_incons("n > n_per_h");
259 /* Transform the inverse range height into the inverse hole height */
260 invh
*= n_per_h
*SORT_GRID_OVERSIZE
;
262 /* Set nsort to the maximum possible number of holes used.
263 * In worst case all n elements end up in the last bin.
265 int nsort
= n_per_h
*SORT_GRID_OVERSIZE
+ n
;
267 /* Determine the index range used, so we can limit it for the second pass */
268 int zi_min
= INT_MAX
;
271 /* Sort the particles using a simple index sort */
272 for (int i
= 0; i
< n
; i
++)
274 /* The cast takes care of float-point rounding effects below zero.
275 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
276 * times the box height out of the box.
278 int zi
= static_cast<int>((x
[a
[i
]][dim
] - h0
)*invh
);
281 /* As we can have rounding effect, we use > iso >= here */
282 if (zi
< 0 || (dd_zone
== 0 && zi
> n_per_h
*SORT_GRID_OVERSIZE
))
284 gmx_fatal(FARGS
, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
285 a
[i
], 'x'+dim
, x
[a
[i
]][dim
], h0
, invh
, zi
,
286 n_per_h
, SORT_GRID_OVERSIZE
);
290 /* In a non-local domain, particles communcated for bonded interactions
291 * can be far beyond the grid size, which is set by the non-bonded
292 * cut-off distance. We sort such particles into the last cell.
294 if (zi
> n_per_h
*SORT_GRID_OVERSIZE
)
296 zi
= n_per_h
*SORT_GRID_OVERSIZE
;
299 /* Ideally this particle should go in sort cell zi,
300 * but that might already be in use,
301 * in that case find the first empty cell higher up
306 zi_min
= std::min(zi_min
, zi
);
307 zi_max
= std::max(zi_max
, zi
);
311 /* We have multiple atoms in the same sorting slot.
312 * Sort on real z for minimal bounding box size.
313 * There is an extra check for identical z to ensure
314 * well-defined output order, independent of input order
315 * to ensure binary reproducibility after restarts.
317 while (sort
[zi
] >= 0 && ( x
[a
[i
]][dim
] > x
[sort
[zi
]][dim
] ||
318 (x
[a
[i
]][dim
] == x
[sort
[zi
]][dim
] &&
326 /* Shift all elements by one slot until we find an empty slot */
329 while (sort
[zim
] >= 0)
337 zi_max
= std::max(zi_max
, zim
);
340 zi_max
= std::max(zi_max
, zi
);
347 for (int zi
= 0; zi
< nsort
; zi
++)
358 for (int zi
= zi_max
; zi
>= zi_min
; zi
--)
369 gmx_incons("Lost particles while sorting");
374 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
375 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
381 /* Coordinate order x,y,z, bb order xyz0 */
382 static void calc_bounding_box(int na
, int stride
, const real
*x
, nbnxn_bb_t
*bb
)
385 real xl
, xh
, yl
, yh
, zl
, zh
;
395 for (int j
= 1; j
< na
; j
++)
397 xl
= std::min(xl
, x
[i
+XX
]);
398 xh
= std::max(xh
, x
[i
+XX
]);
399 yl
= std::min(yl
, x
[i
+YY
]);
400 yh
= std::max(yh
, x
[i
+YY
]);
401 zl
= std::min(zl
, x
[i
+ZZ
]);
402 zh
= std::max(zh
, x
[i
+ZZ
]);
405 /* Note: possible double to float conversion here */
406 bb
->lower
[BB_X
] = R2F_D(xl
);
407 bb
->lower
[BB_Y
] = R2F_D(yl
);
408 bb
->lower
[BB_Z
] = R2F_D(zl
);
409 bb
->upper
[BB_X
] = R2F_U(xh
);
410 bb
->upper
[BB_Y
] = R2F_U(yh
);
411 bb
->upper
[BB_Z
] = R2F_U(zh
);
414 /* Packed coordinates, bb order xyz0 */
415 static void calc_bounding_box_x_x4(int na
, const real
*x
, nbnxn_bb_t
*bb
)
417 real xl
, xh
, yl
, yh
, zl
, zh
;
425 for (int j
= 1; j
< na
; j
++)
427 xl
= std::min(xl
, x
[j
+XX
*c_packX4
]);
428 xh
= std::max(xh
, x
[j
+XX
*c_packX4
]);
429 yl
= std::min(yl
, x
[j
+YY
*c_packX4
]);
430 yh
= std::max(yh
, x
[j
+YY
*c_packX4
]);
431 zl
= std::min(zl
, x
[j
+ZZ
*c_packX4
]);
432 zh
= std::max(zh
, x
[j
+ZZ
*c_packX4
]);
434 /* Note: possible double to float conversion here */
435 bb
->lower
[BB_X
] = R2F_D(xl
);
436 bb
->lower
[BB_Y
] = R2F_D(yl
);
437 bb
->lower
[BB_Z
] = R2F_D(zl
);
438 bb
->upper
[BB_X
] = R2F_U(xh
);
439 bb
->upper
[BB_Y
] = R2F_U(yh
);
440 bb
->upper
[BB_Z
] = R2F_U(zh
);
443 /* Packed coordinates, bb order xyz0 */
444 static void calc_bounding_box_x_x8(int na
, const real
*x
, nbnxn_bb_t
*bb
)
446 real xl
, xh
, yl
, yh
, zl
, zh
;
454 for (int j
= 1; j
< na
; j
++)
456 xl
= std::min(xl
, x
[j
+XX
*c_packX8
]);
457 xh
= std::max(xh
, x
[j
+XX
*c_packX8
]);
458 yl
= std::min(yl
, x
[j
+YY
*c_packX8
]);
459 yh
= std::max(yh
, x
[j
+YY
*c_packX8
]);
460 zl
= std::min(zl
, x
[j
+ZZ
*c_packX8
]);
461 zh
= std::max(zh
, x
[j
+ZZ
*c_packX8
]);
463 /* Note: possible double to float conversion here */
464 bb
->lower
[BB_X
] = R2F_D(xl
);
465 bb
->lower
[BB_Y
] = R2F_D(yl
);
466 bb
->lower
[BB_Z
] = R2F_D(zl
);
467 bb
->upper
[BB_X
] = R2F_U(xh
);
468 bb
->upper
[BB_Y
] = R2F_U(yh
);
469 bb
->upper
[BB_Z
] = R2F_U(zh
);
472 /* Packed coordinates, bb order xyz0 */
473 gmx_unused
static void calc_bounding_box_x_x4_halves(int na
, const real
*x
,
474 nbnxn_bb_t
*bb
, nbnxn_bb_t
*bbj
)
476 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
479 calc_bounding_box_x_x4(std::min(na
, 2), x
, bbj
);
483 calc_bounding_box_x_x4(std::min(na
-2, 2), x
+(c_packX4
>>1), bbj
+1);
487 /* Set the "empty" bounding box to the same as the first one,
488 * so we don't need to treat special cases in the rest of the code.
490 #if NBNXN_SEARCH_BB_SIMD4
491 store4(&bbj
[1].lower
[0], load4(&bbj
[0].lower
[0]));
492 store4(&bbj
[1].upper
[0], load4(&bbj
[0].upper
[0]));
498 #if NBNXN_SEARCH_BB_SIMD4
499 store4(&bb
->lower
[0], min(load4(&bbj
[0].lower
[0]), load4(&bbj
[1].lower
[0])));
500 store4(&bb
->upper
[0], max(load4(&bbj
[0].upper
[0]), load4(&bbj
[1].upper
[0])));
505 for (i
= 0; i
< NNBSBB_C
; i
++)
507 bb
->lower
[i
] = std::min(bbj
[0].lower
[i
], bbj
[1].lower
[i
]);
508 bb
->upper
[i
] = std::max(bbj
[0].upper
[i
], bbj
[1].upper
[i
]);
514 #if NBNXN_SEARCH_BB_SIMD4
516 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
517 static void calc_bounding_box_xxxx(int na
, int stride
, const real
*x
, float *bb
)
520 real xl
, xh
, yl
, yh
, zl
, zh
;
530 for (int j
= 1; j
< na
; j
++)
532 xl
= std::min(xl
, x
[i
+XX
]);
533 xh
= std::max(xh
, x
[i
+XX
]);
534 yl
= std::min(yl
, x
[i
+YY
]);
535 yh
= std::max(yh
, x
[i
+YY
]);
536 zl
= std::min(zl
, x
[i
+ZZ
]);
537 zh
= std::max(zh
, x
[i
+ZZ
]);
540 /* Note: possible double to float conversion here */
541 bb
[0*STRIDE_PBB
] = R2F_D(xl
);
542 bb
[1*STRIDE_PBB
] = R2F_D(yl
);
543 bb
[2*STRIDE_PBB
] = R2F_D(zl
);
544 bb
[3*STRIDE_PBB
] = R2F_U(xh
);
545 bb
[4*STRIDE_PBB
] = R2F_U(yh
);
546 bb
[5*STRIDE_PBB
] = R2F_U(zh
);
549 #endif /* NBNXN_SEARCH_BB_SIMD4 */
551 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
553 /* Coordinate order xyz?, bb order xyz0 */
554 static void calc_bounding_box_simd4(int na
, const float *x
, nbnxn_bb_t
*bb
)
556 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
559 Simd4Float bb_0_S
, bb_1_S
;
565 for (int i
= 1; i
< na
; i
++)
567 x_S
= load4(x
+i
*NNBSBB_C
);
568 bb_0_S
= min(bb_0_S
, x_S
);
569 bb_1_S
= max(bb_1_S
, x_S
);
572 store4(&bb
->lower
[0], bb_0_S
);
573 store4(&bb
->upper
[0], bb_1_S
);
576 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
577 static void calc_bounding_box_xxxx_simd4(int na
, const float *x
,
578 nbnxn_bb_t
*bb_work_aligned
,
581 calc_bounding_box_simd4(na
, x
, bb_work_aligned
);
583 bb
[0*STRIDE_PBB
] = bb_work_aligned
->lower
[BB_X
];
584 bb
[1*STRIDE_PBB
] = bb_work_aligned
->lower
[BB_Y
];
585 bb
[2*STRIDE_PBB
] = bb_work_aligned
->lower
[BB_Z
];
586 bb
[3*STRIDE_PBB
] = bb_work_aligned
->upper
[BB_X
];
587 bb
[4*STRIDE_PBB
] = bb_work_aligned
->upper
[BB_Y
];
588 bb
[5*STRIDE_PBB
] = bb_work_aligned
->upper
[BB_Z
];
591 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
594 /* Combines pairs of consecutive bounding boxes */
595 static void combine_bounding_box_pairs(nbnxn_grid_t
*grid
,
596 gmx::ArrayRef
<const nbnxn_bb_t
> bb
)
598 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
601 for (int i
= 0; i
< grid
->ncx
*grid
->ncy
; i
++)
603 /* Starting bb in a column is expected to be 2-aligned */
604 int sc2
= grid
->cxy_ind
[i
]>>1;
605 /* For odd numbers skip the last bb here */
606 int nc2
= (grid
->cxy_na
[i
]+3)>>(2+1);
608 for (c2
= sc2
; c2
< sc2
+nc2
; c2
++)
610 #if NBNXN_SEARCH_BB_SIMD4
611 Simd4Float min_S
, max_S
;
613 min_S
= min(load4(&bb
[c2
*2+0].lower
[0]),
614 load4(&bb
[c2
*2+1].lower
[0]));
615 max_S
= max(load4(&bb
[c2
*2+0].upper
[0]),
616 load4(&bb
[c2
*2+1].upper
[0]));
617 store4(&grid
->bbj
[c2
].lower
[0], min_S
);
618 store4(&grid
->bbj
[c2
].upper
[0], max_S
);
620 for (int j
= 0; j
< NNBSBB_C
; j
++)
622 grid
->bbj
[c2
].lower
[j
] = std::min(bb
[c2
*2+0].lower
[j
],
623 bb
[c2
*2+1].lower
[j
]);
624 grid
->bbj
[c2
].upper
[j
] = std::max(bb
[c2
*2+0].upper
[j
],
625 bb
[c2
*2+1].upper
[j
]);
629 if (((grid
->cxy_na
[i
]+3)>>2) & 1)
631 /* The bb count in this column is odd: duplicate the last bb */
632 for (int j
= 0; j
< NNBSBB_C
; j
++)
634 grid
->bbj
[c2
].lower
[j
] = bb
[c2
*2].lower
[j
];
635 grid
->bbj
[c2
].upper
[j
] = bb
[c2
*2].upper
[j
];
642 /* Prints the average bb size, used for debug output */
643 static void print_bbsizes_simple(FILE *fp
,
644 const nbnxn_grid_t
*grid
)
649 for (int c
= 0; c
< grid
->nc
; c
++)
651 for (int d
= 0; d
< DIM
; d
++)
653 ba
[d
] += grid
->bb
[c
].upper
[d
] - grid
->bb
[c
].lower
[d
];
656 dsvmul(1.0/grid
->nc
, ba
, ba
);
658 fprintf(fp
, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
661 grid
->atom_density
> 0 ?
662 grid
->na_c
/(grid
->atom_density
*grid
->sx
*grid
->sy
) : 0.0,
663 ba
[XX
], ba
[YY
], ba
[ZZ
],
666 grid
->atom_density
> 0 ?
667 ba
[ZZ
]/(grid
->na_c
/(grid
->atom_density
*grid
->sx
*grid
->sy
)) : 0.0);
670 /* Prints the average bb size, used for debug output */
671 static void print_bbsizes_supersub(FILE *fp
,
672 const nbnxn_grid_t
*grid
)
679 for (int c
= 0; c
< grid
->nc
; c
++)
682 for (int s
= 0; s
< grid
->nsubc
[c
]; s
+= STRIDE_PBB
)
684 int cs_w
= (c
*c_gpuNumClusterPerCell
+ s
)/STRIDE_PBB
;
685 for (int i
= 0; i
< STRIDE_PBB
; i
++)
687 for (int d
= 0; d
< DIM
; d
++)
690 grid
->pbb
[cs_w
*NNBSBB_XXXX
+(DIM
+d
)*STRIDE_PBB
+i
] -
691 grid
->pbb
[cs_w
*NNBSBB_XXXX
+ d
*STRIDE_PBB
+i
];
696 for (int s
= 0; s
< grid
->nsubc
[c
]; s
++)
698 int cs
= c
*c_gpuNumClusterPerCell
+ s
;
699 for (int d
= 0; d
< DIM
; d
++)
701 ba
[d
] += grid
->bb
[cs
].upper
[d
] - grid
->bb
[cs
].lower
[d
];
705 ns
+= grid
->nsubc
[c
];
707 dsvmul(1.0/ns
, ba
, ba
);
709 fprintf(fp
, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
710 grid
->sx
/c_gpuNumClusterPerCellX
,
711 grid
->sy
/c_gpuNumClusterPerCellY
,
712 grid
->atom_density
> 0 ?
713 grid
->na_sc
/(grid
->atom_density
*grid
->sx
*grid
->sy
*c_gpuNumClusterPerCellZ
) : 0.0,
714 ba
[XX
], ba
[YY
], ba
[ZZ
],
715 ba
[XX
]*c_gpuNumClusterPerCellX
/grid
->sx
,
716 ba
[YY
]*c_gpuNumClusterPerCellY
/grid
->sy
,
717 grid
->atom_density
> 0 ?
718 ba
[ZZ
]/(grid
->na_sc
/(grid
->atom_density
*grid
->sx
*grid
->sy
*c_gpuNumClusterPerCellZ
)) : 0.0);
721 /* Set non-bonded interaction flags for the current cluster.
722 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
724 static void sort_cluster_on_flag(int numAtomsInCluster
,
728 gmx::ArrayRef
<int> order
,
731 constexpr int c_maxNumAtomsInCluster
= 8;
732 int sort1
[c_maxNumAtomsInCluster
];
733 int sort2
[c_maxNumAtomsInCluster
];
735 GMX_ASSERT(numAtomsInCluster
<= c_maxNumAtomsInCluster
, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
740 for (int s
= atomStart
; s
< atomEnd
; s
+= numAtomsInCluster
)
742 /* Make lists for this (sub-)cell on atoms with and without LJ */
745 gmx_bool haveQ
= FALSE
;
747 for (int a
= s
; a
< std::min(s
+ numAtomsInCluster
, atomEnd
); a
++)
749 haveQ
= haveQ
|| GET_CGINFO_HAS_Q(atinfo
[order
[a
]]);
751 if (GET_CGINFO_HAS_VDW(atinfo
[order
[a
]]))
753 sort1
[n1
++] = order
[a
];
758 sort2
[n2
++] = order
[a
];
762 /* If we don't have atoms with LJ, there's nothing to sort */
765 *flags
|= NBNXN_CI_DO_LJ(subc
);
767 if (2*n1
<= numAtomsInCluster
)
769 /* Only sort when strictly necessary. Ordering particles
770 * Ordering particles can lead to less accurate summation
771 * due to rounding, both for LJ and Coulomb interactions.
773 if (2*(a_lj_max
- s
) >= numAtomsInCluster
)
775 for (int i
= 0; i
< n1
; i
++)
777 order
[atomStart
+ i
] = sort1
[i
];
779 for (int j
= 0; j
< n2
; j
++)
781 order
[atomStart
+ n1
+ j
] = sort2
[j
];
785 *flags
|= NBNXN_CI_HALF_LJ(subc
);
790 *flags
|= NBNXN_CI_DO_COUL(subc
);
796 /* Fill a pair search cell with atoms.
797 * Potentially sorts atoms and sets the interaction flags.
799 static void fill_cell(const nbnxn_search_t nbs
,
801 nbnxn_atomdata_t
*nbat
,
806 nbnxn_bb_t gmx_unused
*bb_work_aligned
)
808 const int numAtoms
= atomEnd
- atomStart
;
812 /* Note that non-local grids are already sorted.
813 * Then sort_cluster_on_flag will only set the flags and the sorting
814 * will not affect the atom order.
816 sort_cluster_on_flag(grid
->na_c
, atomStart
, atomEnd
, atinfo
, nbs
->a
,
817 grid
->flags
.data() + (atomStart
>> grid
->na_c_2log
) - grid
->cell0
);
822 /* Set the fep flag for perturbed atoms in this (sub-)cell */
824 /* The grid-local cluster/(sub-)cell index */
825 int cell
= (atomStart
>> grid
->na_c_2log
) - grid
->cell0
*(grid
->bSimple
? 1 : c_gpuNumClusterPerCell
);
827 for (int at
= atomStart
; at
< atomEnd
; at
++)
829 if (nbs
->a
[at
] >= 0 && GET_CGINFO_FEP(atinfo
[nbs
->a
[at
]]))
831 grid
->fep
[cell
] |= (1 << (at
- atomStart
));
836 /* Now we have sorted the atoms, set the cell indices */
837 for (int at
= atomStart
; at
< atomEnd
; at
++)
839 nbs
->cell
[nbs
->a
[at
]] = at
;
842 copy_rvec_to_nbat_real(nbs
->a
.data() + atomStart
, numAtoms
, grid
->na_c
, x
,
843 nbat
->XFormat
, nbat
->x
, atomStart
);
845 if (nbat
->XFormat
== nbatX4
)
847 /* Store the bounding boxes as xyz.xyz. */
848 size_t offset
= (atomStart
- grid
->cell0
*grid
->na_sc
) >> grid
->na_c_2log
;
849 nbnxn_bb_t
*bb_ptr
= grid
->bb
.data() + offset
;
851 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
852 if (2*grid
->na_cj
== grid
->na_c
)
854 calc_bounding_box_x_x4_halves(numAtoms
, nbat
->x
+ atom_to_x_index
<c_packX4
>(atomStart
), bb_ptr
,
855 grid
->bbj
.data() + offset
*2);
860 calc_bounding_box_x_x4(numAtoms
, nbat
->x
+ atom_to_x_index
<c_packX4
>(atomStart
), bb_ptr
);
863 else if (nbat
->XFormat
== nbatX8
)
865 /* Store the bounding boxes as xyz.xyz. */
866 size_t offset
= (atomStart
- grid
->cell0
*grid
->na_sc
) >> grid
->na_c_2log
;
867 nbnxn_bb_t
*bb_ptr
= grid
->bb
.data() + offset
;
869 calc_bounding_box_x_x8(numAtoms
, nbat
->x
+ atom_to_x_index
<c_packX8
>(atomStart
), bb_ptr
);
872 else if (!grid
->bSimple
)
874 /* Store the bounding boxes in a format convenient
875 * for SIMD4 calculations: xxxxyyyyzzzz...
879 ((atomStart
- grid
->cell0
*grid
->na_sc
) >> (grid
->na_c_2log
+ STRIDE_PBB_2LOG
))*NNBSBB_XXXX
+
880 (((atomStart
- grid
->cell0
*grid
->na_sc
) >> grid
->na_c_2log
) & (STRIDE_PBB
- 1));
882 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
883 if (nbat
->XFormat
== nbatXYZQ
)
885 calc_bounding_box_xxxx_simd4(numAtoms
, nbat
->x
+ atomStart
*nbat
->xstride
,
886 bb_work_aligned
, pbb_ptr
);
891 calc_bounding_box_xxxx(numAtoms
, nbat
->xstride
, nbat
->x
+ atomStart
*nbat
->xstride
,
896 fprintf(debug
, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
897 atomStart
>> grid
->na_c_2log
,
898 pbb_ptr
[0*STRIDE_PBB
], pbb_ptr
[3*STRIDE_PBB
],
899 pbb_ptr
[1*STRIDE_PBB
], pbb_ptr
[4*STRIDE_PBB
],
900 pbb_ptr
[2*STRIDE_PBB
], pbb_ptr
[5*STRIDE_PBB
]);
906 /* Store the bounding boxes as xyz.xyz. */
907 nbnxn_bb_t
*bb_ptr
= grid
->bb
.data() + ((atomStart
- grid
->cell0
*grid
->na_sc
) >> grid
->na_c_2log
);
909 calc_bounding_box(numAtoms
, nbat
->xstride
, nbat
->x
+ atomStart
*nbat
->xstride
,
914 int bbo
= (atomStart
- grid
->cell0
*grid
->na_sc
)/grid
->na_c
;
915 fprintf(debug
, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
916 atomStart
>> grid
->na_c_2log
,
917 grid
->bb
[bbo
].lower
[BB_X
],
918 grid
->bb
[bbo
].lower
[BB_Y
],
919 grid
->bb
[bbo
].lower
[BB_Z
],
920 grid
->bb
[bbo
].upper
[BB_X
],
921 grid
->bb
[bbo
].upper
[BB_Y
],
922 grid
->bb
[bbo
].upper
[BB_Z
]);
927 /* Spatially sort the atoms within one grid column */
928 static void sort_columns_simple(const nbnxn_search_t nbs
,
931 int atomStart
, int atomEnd
,
934 nbnxn_atomdata_t
*nbat
,
935 int cxy_start
, int cxy_end
,
940 fprintf(debug
, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
941 grid
->cell0
, cxy_start
, cxy_end
, atomStart
, atomEnd
);
944 /* Sort the atoms within each x,y column in 3 dimensions */
945 for (int cxy
= cxy_start
; cxy
< cxy_end
; cxy
++)
947 int na
= grid
->cxy_na
[cxy
];
948 int ncz
= grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
];
949 int ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
951 /* Sort the atoms within each x,y column on z coordinate */
952 sort_atoms(ZZ
, FALSE
, dd_zone
,
953 nbs
->a
.data() + ash
, na
, x
,
955 1.0/grid
->size
[ZZ
], ncz
*grid
->na_sc
,
958 /* Fill the ncz cells in this column */
959 int cfilled
= grid
->cxy_ind
[cxy
];
960 for (int cz
= 0; cz
< ncz
; cz
++)
962 int c
= grid
->cxy_ind
[cxy
] + cz
;
964 int ash_c
= ash
+ cz
*grid
->na_sc
;
965 int na_c
= std::min(grid
->na_sc
, na
-(ash_c
-ash
));
967 fill_cell(nbs
, grid
, nbat
,
968 ash_c
, ash_c
+na_c
, atinfo
, x
,
971 /* This copy to bbcz is not really necessary.
972 * But it allows to use the same grid search code
973 * for the simple and supersub cell setups.
979 grid
->bbcz
[c
*NNBSBB_D
] = grid
->bb
[cfilled
].lower
[BB_Z
];
980 grid
->bbcz
[c
*NNBSBB_D
+1] = grid
->bb
[cfilled
].upper
[BB_Z
];
983 /* Set the unused atom indices to -1 */
984 for (int ind
= na
; ind
< ncz
*grid
->na_sc
; ind
++)
986 nbs
->a
[ash
+ind
] = -1;
991 /* Spatially sort the atoms within one grid column */
992 static void sort_columns_supersub(const nbnxn_search_t nbs
,
995 int atomStart
, int atomEnd
,
998 nbnxn_atomdata_t
*nbat
,
999 int cxy_start
, int cxy_end
,
1002 nbnxn_bb_t bb_work_array
[2];
1003 nbnxn_bb_t
*bb_work_aligned
= reinterpret_cast<nbnxn_bb_t
*>((reinterpret_cast<std::size_t>(bb_work_array
+ 1)) & (~(static_cast<std::size_t>(15))));
1007 fprintf(debug
, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1008 grid
->cell0
, cxy_start
, cxy_end
, atomStart
, atomEnd
);
1011 int subdiv_x
= grid
->na_c
;
1012 int subdiv_y
= c_gpuNumClusterPerCellX
*subdiv_x
;
1013 int subdiv_z
= c_gpuNumClusterPerCellY
*subdiv_y
;
1015 /* Sort the atoms within each x,y column in 3 dimensions.
1016 * Loop over all columns on the x/y grid.
1018 for (int cxy
= cxy_start
; cxy
< cxy_end
; cxy
++)
1020 int gridX
= cxy
/grid
->ncy
;
1021 int gridY
= cxy
- gridX
*grid
->ncy
;
1023 int numAtomsInColumn
= grid
->cxy_na
[cxy
];
1024 int numCellsInColumn
= grid
->cxy_ind
[cxy
+ 1] - grid
->cxy_ind
[cxy
];
1025 int ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
1027 /* Sort the atoms within each x,y column on z coordinate */
1028 sort_atoms(ZZ
, FALSE
, dd_zone
,
1029 nbs
->a
.data() + ash
, numAtomsInColumn
, x
,
1031 1.0/grid
->size
[ZZ
], numCellsInColumn
*grid
->na_sc
,
1034 /* This loop goes over the cells and clusters along z at once */
1035 for (int sub_z
= 0; sub_z
< numCellsInColumn
*c_gpuNumClusterPerCellZ
; sub_z
++)
1037 int ash_z
= ash
+ sub_z
*subdiv_z
;
1038 int na_z
= std::min(subdiv_z
, numAtomsInColumn
- (ash_z
- ash
));
1040 /* We have already sorted on z */
1042 if (sub_z
% c_gpuNumClusterPerCellZ
== 0)
1044 cz
= sub_z
/c_gpuNumClusterPerCellZ
;
1045 int cell
= grid
->cxy_ind
[cxy
] + cz
;
1047 /* The number of atoms in this cell/super-cluster */
1048 int numAtoms
= std::min(grid
->na_sc
, numAtomsInColumn
- (ash_z
- ash
));
1050 grid
->nsubc
[cell
] = std::min(c_gpuNumClusterPerCell
,
1051 (numAtoms
+ grid
->na_c
- 1)/grid
->na_c
);
1053 /* Store the z-boundaries of the bounding box of the cell */
1054 grid
->bbcz
[cell
*NNBSBB_D
] = x
[nbs
->a
[ash_z
]][ZZ
];
1055 grid
->bbcz
[cell
*NNBSBB_D
+1] = x
[nbs
->a
[ash_z
+ numAtoms
- 1]][ZZ
];
1058 if (c_gpuNumClusterPerCellY
> 1)
1060 /* Sort the atoms along y */
1061 sort_atoms(YY
, (sub_z
& 1), dd_zone
,
1062 nbs
->a
.data() + ash_z
, na_z
, x
,
1063 grid
->c0
[YY
] + gridY
*grid
->sy
,
1064 grid
->inv_sy
, subdiv_z
,
1068 for (int sub_y
= 0; sub_y
< c_gpuNumClusterPerCellY
; sub_y
++)
1070 int ash_y
= ash_z
+ sub_y
*subdiv_y
;
1071 int na_y
= std::min(subdiv_y
, numAtomsInColumn
- (ash_y
- ash
));
1073 if (c_gpuNumClusterPerCellX
> 1)
1075 /* Sort the atoms along x */
1076 sort_atoms(XX
, ((cz
*c_gpuNumClusterPerCellY
+ sub_y
) & 1), dd_zone
,
1077 nbs
->a
.data() + ash_y
, na_y
, x
,
1078 grid
->c0
[XX
] + gridX
*grid
->sx
,
1079 grid
->inv_sx
, subdiv_y
,
1083 for (int sub_x
= 0; sub_x
< c_gpuNumClusterPerCellX
; sub_x
++)
1085 int ash_x
= ash_y
+ sub_x
*subdiv_x
;
1086 int na_x
= std::min(subdiv_x
, numAtomsInColumn
- (ash_x
- ash
));
1088 fill_cell(nbs
, grid
, nbat
,
1089 ash_x
, ash_x
+ na_x
, atinfo
, x
,
1095 /* Set the unused atom indices to -1 */
1096 for (int ind
= numAtomsInColumn
; ind
< numCellsInColumn
*grid
->na_sc
; ind
++)
1098 nbs
->a
[ash
+ ind
] = -1;
1103 /* Determine in which grid column atoms should go */
1104 static void calc_column_indices(nbnxn_grid_t
*grid
,
1105 int atomStart
, int atomEnd
,
1107 int dd_zone
, const int *move
,
1108 int thread
, int nthread
,
1109 gmx::ArrayRef
<int> cell
,
1112 /* We add one extra cell for particles which moved during DD */
1113 for (int i
= 0; i
< grid
->ncx
*grid
->ncy
+1; i
++)
1118 int taskAtomStart
= atomStart
+ static_cast<int>((thread
+ 0)*(atomEnd
- atomStart
))/nthread
;
1119 int taskAtomEnd
= atomStart
+ static_cast<int>((thread
+ 1)*(atomEnd
- atomStart
))/nthread
;
1123 for (int i
= taskAtomStart
; i
< taskAtomEnd
; i
++)
1125 if (move
== nullptr || move
[i
] >= 0)
1127 /* We need to be careful with rounding,
1128 * particles might be a few bits outside the local zone.
1129 * The int cast takes care of the lower bound,
1130 * we will explicitly take care of the upper bound.
1132 int cx
= static_cast<int>((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1133 int cy
= static_cast<int>((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1136 if (cx
< 0 || cx
> grid
->ncx
||
1137 cy
< 0 || cy
> grid
->ncy
)
1140 "grid cell cx %d cy %d out of range (max %d %d)\n"
1141 "atom %f %f %f, grid->c0 %f %f",
1142 cx
, cy
, grid
->ncx
, grid
->ncy
,
1143 x
[i
][XX
], x
[i
][YY
], x
[i
][ZZ
], grid
->c0
[XX
], grid
->c0
[YY
]);
1146 /* Take care of potential rouding issues */
1147 cx
= std::min(cx
, grid
->ncx
- 1);
1148 cy
= std::min(cy
, grid
->ncy
- 1);
1150 /* For the moment cell will contain only the, grid local,
1151 * x and y indices, not z.
1153 cell
[i
] = cx
*grid
->ncy
+ cy
;
1157 /* Put this moved particle after the end of the grid,
1158 * so we can process it later without using conditionals.
1160 cell
[i
] = grid
->ncx
*grid
->ncy
;
1169 for (int i
= taskAtomStart
; i
< taskAtomEnd
; i
++)
1171 int cx
= static_cast<int>((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1172 int cy
= static_cast<int>((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1174 /* For non-home zones there could be particles outside
1175 * the non-bonded cut-off range, which have been communicated
1176 * for bonded interactions only. For the result it doesn't
1177 * matter where these end up on the grid. For performance
1178 * we put them in an extra row at the border.
1180 cx
= std::max(cx
, 0);
1181 cx
= std::min(cx
, grid
->ncx
- 1);
1182 cy
= std::max(cy
, 0);
1183 cy
= std::min(cy
, grid
->ncy
- 1);
1185 /* For the moment cell will contain only the, grid local,
1186 * x and y indices, not z.
1188 cell
[i
] = cx
*grid
->ncy
+ cy
;
1195 /* Resizes grid and atom data which depend on the number of cells */
1196 static void resizeForNumberOfCells(const nbnxn_grid_t
&grid
,
1199 nbnxn_atomdata_t
*nbat
)
1201 int numNbnxnAtoms
= (grid
.cell0
+ grid
.nc
)*grid
.na_sc
;
1203 /* Note: nbs->cell was already resized before */
1205 /* To avoid conditionals we store the moved particles at the end of a,
1206 * make sure we have enough space.
1208 nbs
->a
.resize(numNbnxnAtoms
+ numAtomsMoved
);
1210 /* We need padding up to a multiple of the buffer flag size: simply add */
1211 if (numNbnxnAtoms
+ NBNXN_BUFFERFLAG_SIZE
> nbat
->nalloc
)
1213 nbnxn_atomdata_realloc(nbat
, numNbnxnAtoms
+ NBNXN_BUFFERFLAG_SIZE
);
1216 nbat
->natoms
= numNbnxnAtoms
;
1219 /* Determine in which grid cells the atoms should go */
1220 static void calc_cell_indices(const nbnxn_search_t nbs
,
1229 nbnxn_atomdata_t
*nbat
)
1231 /* First compute all grid/column indices and store them in nbs->cell */
1232 nbs
->cell
.resize(atomEnd
);
1234 const int nthread
= gmx_omp_nthreads_get(emntPairsearch
);
1236 #pragma omp parallel for num_threads(nthread) schedule(static)
1237 for (int thread
= 0; thread
< nthread
; thread
++)
1241 calc_column_indices(grid
, atomStart
, atomEnd
, x
, ddZone
, move
, thread
, nthread
,
1242 nbs
->cell
, nbs
->work
[thread
].cxy_na
);
1244 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1247 /* Make the cell index as a function of x and y */
1250 grid
->cxy_ind
[0] = 0;
1251 for (int i
= 0; i
< grid
->ncx
*grid
->ncy
+ 1; i
++)
1253 /* We set ncz_max at the beginning of the loop iso at the end
1254 * to skip i=grid->ncx*grid->ncy which are moved particles
1255 * that do not need to be ordered on the grid.
1261 int cxy_na_i
= nbs
->work
[0].cxy_na
[i
];
1262 for (int thread
= 1; thread
< nthread
; thread
++)
1264 cxy_na_i
+= nbs
->work
[thread
].cxy_na
[i
];
1266 ncz
= (cxy_na_i
+ grid
->na_sc
- 1)/grid
->na_sc
;
1267 if (nbat
->XFormat
== nbatX8
)
1269 /* Make the number of cell a multiple of 2 */
1270 ncz
= (ncz
+ 1) & ~1;
1272 grid
->cxy_ind
[i
+1] = grid
->cxy_ind
[i
] + ncz
;
1273 /* Clear cxy_na, so we can reuse the array below */
1274 grid
->cxy_na
[i
] = 0;
1276 grid
->nc
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
] - grid
->cxy_ind
[0];
1278 resizeForNumberOfCells(*grid
, numAtomsMoved
, nbs
, nbat
);
1282 fprintf(debug
, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1283 grid
->na_sc
, grid
->na_c
, grid
->nc
,
1284 grid
->ncx
, grid
->ncy
, grid
->nc
/((double)(grid
->ncx
*grid
->ncy
)),
1289 for (int cy
= 0; cy
< grid
->ncy
; cy
++)
1291 for (int cx
= 0; cx
< grid
->ncx
; cx
++)
1293 fprintf(debug
, " %2d", grid
->cxy_ind
[i
+1]-grid
->cxy_ind
[i
]);
1296 fprintf(debug
, "\n");
1301 /* Make sure the work array for sorting is large enough */
1302 if (ncz_max
*grid
->na_sc
*SGSF
> nbs
->work
[0].sort_work_nalloc
)
1304 for (int thread
= 0; thread
< nbs
->nthread_max
; thread
++)
1306 nbs
->work
[thread
].sort_work_nalloc
=
1307 over_alloc_large(ncz_max
*grid
->na_sc
*SGSF
);
1308 srenew(nbs
->work
[thread
].sort_work
,
1309 nbs
->work
[thread
].sort_work_nalloc
);
1310 /* When not in use, all elements should be -1 */
1311 for (int i
= 0; i
< nbs
->work
[thread
].sort_work_nalloc
; i
++)
1313 nbs
->work
[thread
].sort_work
[i
] = -1;
1318 /* Now we know the dimensions we can fill the grid.
1319 * This is the first, unsorted fill. We sort the columns after this.
1321 for (int i
= atomStart
; i
< atomEnd
; i
++)
1323 /* At this point nbs->cell contains the local grid x,y indices */
1324 int cxy
= nbs
->cell
[i
];
1325 nbs
->a
[(grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
+ grid
->cxy_na
[cxy
]++] = i
;
1330 /* Set the cell indices for the moved particles */
1331 int n0
= grid
->nc
*grid
->na_sc
;
1332 int n1
= grid
->nc
*grid
->na_sc
+grid
->cxy_na
[grid
->ncx
*grid
->ncy
];
1335 for (int i
= n0
; i
< n1
; i
++)
1337 nbs
->cell
[nbs
->a
[i
]] = i
;
1342 /* Sort the super-cell columns along z into the sub-cells. */
1343 #pragma omp parallel for num_threads(nthread) schedule(static)
1344 for (int thread
= 0; thread
< nthread
; thread
++)
1350 sort_columns_simple(nbs
, ddZone
, grid
, atomStart
, atomEnd
, atinfo
, x
, nbat
,
1351 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1352 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1353 nbs
->work
[thread
].sort_work
);
1357 sort_columns_supersub(nbs
, ddZone
, grid
, atomStart
, atomEnd
, atinfo
, x
, nbat
,
1358 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1359 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1360 nbs
->work
[thread
].sort_work
);
1363 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1366 if (grid
->bSimple
&& nbat
->XFormat
== nbatX8
)
1368 combine_bounding_box_pairs(grid
, grid
->bb
);
1373 grid
->nsubc_tot
= 0;
1374 for (int i
= 0; i
< grid
->nc
; i
++)
1376 grid
->nsubc_tot
+= grid
->nsubc
[i
];
1384 print_bbsizes_simple(debug
, grid
);
1388 fprintf(debug
, "ns non-zero sub-cells: %d average atoms %.2f\n",
1389 grid
->nsubc_tot
, (atomEnd
- atomStart
)/(double)grid
->nsubc_tot
);
1391 print_bbsizes_supersub(debug
, grid
);
1396 /* Sets up a grid and puts the atoms on the grid.
1397 * This function only operates on one domain of the domain decompostion.
1398 * Note that without domain decomposition there is only one domain.
1400 void nbnxn_put_on_grid(nbnxn_search_t nbs
,
1404 const rvec lowerCorner
,
1405 const rvec upperCorner
,
1414 nbnxn_atomdata_t
*nbat
)
1416 nbnxn_grid_t
*grid
= &nbs
->grid
[ddZone
];
1418 nbs_cycle_start(&nbs
->cc
[enbsCCgrid
]);
1420 grid
->bSimple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
1422 grid
->na_c
= nbnxn_kernel_to_cluster_i_size(nb_kernel_type
);
1423 grid
->na_cj
= nbnxn_kernel_to_cluster_j_size(nb_kernel_type
);
1424 grid
->na_sc
= (grid
->bSimple
? 1 : c_gpuNumClusterPerCell
)*grid
->na_c
;
1425 grid
->na_c_2log
= get_2log(grid
->na_c
);
1427 nbat
->na_c
= grid
->na_c
;
1436 (nbs
->grid
[ddZone
- 1].cell0
+ nbs
->grid
[ddZone
- 1].nc
)*
1437 nbs
->grid
[ddZone
- 1].na_sc
/grid
->na_sc
;
1440 const int n
= atomEnd
- atomStart
;
1445 copy_mat(box
, nbs
->box
);
1447 /* Avoid zero density */
1448 if (atomDensity
> 0)
1450 grid
->atom_density
= atomDensity
;
1454 grid
->atom_density
= grid_atom_density(n
- numAtomsMoved
, lowerCorner
, upperCorner
);
1459 nbs
->natoms_local
= atomEnd
- numAtomsMoved
;
1460 /* We assume that nbnxn_put_on_grid is called first
1461 * for the local atoms (ddZone=0).
1463 nbs
->natoms_nonlocal
= atomEnd
- numAtomsMoved
;
1467 fprintf(debug
, "natoms_local = %5d atom_density = %5.1f\n",
1468 nbs
->natoms_local
, grid
->atom_density
);
1473 nbs
->natoms_nonlocal
= std::max(nbs
->natoms_nonlocal
, atomEnd
);
1476 /* We always use the home zone (grid[0]) for setting the cell size,
1477 * since determining densities for non-local zones is difficult.
1479 set_grid_size_xy(nbs
, grid
,
1480 ddZone
, n
- numAtomsMoved
,
1481 lowerCorner
, upperCorner
,
1482 nbs
->grid
[0].atom_density
);
1484 calc_cell_indices(nbs
, ddZone
, grid
, atomStart
, atomEnd
, atinfo
, x
, numAtomsMoved
, move
, nbat
);
1488 nbat
->natoms_local
= nbat
->natoms
;
1491 nbs_cycle_stop(&nbs
->cc
[enbsCCgrid
]);
1494 /* Calls nbnxn_put_on_grid for all non-local domains */
1495 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs
,
1496 const struct gmx_domdec_zones_t
*zones
,
1500 nbnxn_atomdata_t
*nbat
)
1502 for (int zone
= 1; zone
< zones
->n
; zone
++)
1505 for (int d
= 0; d
< DIM
; d
++)
1507 c0
[d
] = zones
->size
[zone
].bb_x0
[d
];
1508 c1
[d
] = zones
->size
[zone
].bb_x1
[d
];
1511 nbnxn_put_on_grid(nbs
, nbs
->ePBC
, nullptr,
1513 zones
->cg_range
[zone
],
1514 zones
->cg_range
[zone
+1],
1524 void nbnxn_get_ncells(nbnxn_search_t nbs
, int *ncx
, int *ncy
)
1526 *ncx
= nbs
->grid
[0].ncx
;
1527 *ncy
= nbs
->grid
[0].ncy
;
1530 void nbnxn_get_atomorder(const nbnxn_search_t nbs
, const int **a
, int *n
)
1532 const nbnxn_grid_t
*grid
;
1534 grid
= &nbs
->grid
[0];
1536 /* Return the atom order for the home cell (index 0) */
1539 *n
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
]*grid
->na_sc
;
1542 void nbnxn_set_atomorder(nbnxn_search_t nbs
)
1544 /* Set the atom order for the home cell (index 0) */
1545 nbnxn_grid_t
*grid
= &nbs
->grid
[0];
1548 for (int cx
= 0; cx
< grid
->ncx
; cx
++)
1550 for (int cy
= 0; cy
< grid
->ncy
; cy
++)
1552 int cxy
= cx
*grid
->ncy
+ cy
;
1553 int j
= grid
->cxy_ind
[cxy
]*grid
->na_sc
;
1554 for (int cz
= 0; cz
< grid
->cxy_na
[cxy
]; cz
++)