/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_grid.h"

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"
struct gmx_domdec_zones_t;
static real grid_atom_density(int        numAtoms,
                              const rvec lowerCorner,
                              const rvec upperCorner)
{
    rvec size;

    if (numAtoms == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        numAtoms = 1;
    }

    rvec_sub(upperCorner, lowerCorner, size);

    return numAtoms/(size[XX]*size[YY]*size[ZZ]);
}
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t        *grid,
                            int                  dd_zone,
                            int                  numAtoms,
                            const rvec           lowerCorner,
                            const rvec           upperCorner,
                            real                 atomDensity)
{
    rvec size;
    real tlen, tlen_x, tlen_y;

    rvec_sub(upperCorner, lowerCorner, size);

    if (numAtoms > grid->na_sc)
    {
        GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");

        /* target cell length */
        if (grid->bSimple)
        {
            /* To minimize the zero interactions, we should make
             * the largest of the i/j cell cubic.
             */
            int numAtomsInCell = std::max(grid->na_c, grid->na_cj);

            /* Approximately cubic cells */
            tlen   = std::cbrt(numAtomsInCell/atomDensity);
            tlen_x = tlen;
            tlen_y = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            tlen   = std::cbrt(grid->na_c/atomDensity);
            tlen_x = tlen*c_gpuNumClusterPerCellX;
            tlen_y = tlen*c_gpuNumClusterPerCellY;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pair list when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
        grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
    }
    else
    {
        grid->ncx = 1;
        grid->ncy = 1;
    }

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    if (dd_zone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        grid->ncx++;
        grid->ncy++;
    }

    /* We need one additional cell entry for particles moved by DD */
    grid->cxy_na.resize(grid->ncx*grid->ncy + 1);
    grid->cxy_ind.resize(grid->ncx*grid->ncy + 2);

    for (int t = 0; t < nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy + 1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy + 1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
        }
    }

    /* Worst case scenario of 1 atom in each last cell */
    int maxNumCells;
    if (grid->na_cj <= grid->na_c)
    {
        maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        maxNumCells = numAtoms/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }

    grid->nsubc.resize(maxNumCells);
    grid->bbcz.resize(maxNumCells*NNBSBB_D);

    /* This resize also zeros the contents, which avoids possible
     * floating point exceptions in SIMD with the unused bb elements.
     */
    if (grid->bSimple)
    {
        grid->bb.resize(maxNumCells);
    }
    else
    {
#if NBNXN_BBXXXX
        grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
#else
        grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
#endif
    }

    if (grid->bSimple)
    {
        if (grid->na_cj == grid->na_c)
        {
            grid->bbj = grid->bb;
        }
        else
        {
            grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
            grid->bbj = grid->bbjStorage;
        }
    }

    grid->flags.resize(maxNumCells);
    if (nbs->bFEP)
    {
        grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
    }

    copy_rvec(lowerCorner, grid->c0);
    copy_rvec(upperCorner, grid->c1);
    copy_rvec(size,        grid->size);

    return maxNumCells;
}
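/* Worked example for set_grid_size_xy() (illustrative numbers only):
 * assuming an atom density of roughly 100 atoms/nm^3 and a cluster size
 * na_c = 8, the target sub-cell length is tlen = cbrt(8/100) ~= 0.43 nm.
 * With c_gpuNumClusterPerCellX = c_gpuNumClusterPerCellY = 2, the target
 * x,y cell size is then about 0.86 nm; the actual sx and sy follow from
 * rounding the cell counts ncx and ncy down to whole cells.
 */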
/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
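/* Illustrative sketch (not used by the search code itself): the slot
 * computation of the pigeonhole-style sort described above, for a single
 * coordinate. The name pigeonholeSlot and its parameters are hypothetical;
 * the real implementation is sort_atoms() below, which additionally
 * resolves collisions and performs the compaction pass.
 */
gmx_unused static inline int pigeonholeSlot(real z, real h0, real invh, int numSlots)
{
    /* invh is assumed to be pre-scaled by n_per_h*SORT_GRID_OVERSIZE,
     * mirroring what sort_atoms() does before binning.
     */
    int slot = static_cast<int>((z - h0)*invh);
    if (slot < 0)
    {
        slot = 0;
    }
    if (slot > numSlots - 1)
    {
        slot = numSlots - 1;
    }
    return slot;
}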
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, const rvec *x,
                       real h0, real invh, int n_per_h,
                       int *sort)
{
    if (n > n_per_h)
    {
        gmx_incons("n > n_per_h");
    }

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;
    int zi_max = -1;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);
        }

        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h*SORT_GRID_OVERSIZE)
        {
            zi = n_per_h*SORT_GRID_OVERSIZE;
        }

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = std::min(zi_min, zi);
            zi_max   = std::max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
                                       a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                int cp  = sort[zi];
                int zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp   = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = std::max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = std::max(zi_max, zi);
        }
    }

    int c = 0;
    if (!Backwards)
    {
        for (int zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (int zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
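/* Worked example (illustrative): R2F_D and R2F_U round a bound outwards
 * when a (possibly double precision) real is converted to float, so the
 * float bounding box always contains the original coordinates. For a lower
 * bound x = 2.0, R2F_D(x) = (1 - GMX_FLOAT_EPS)*2.0, slightly below 2.0;
 * for an upper bound, R2F_U(x) = (1 + GMX_FLOAT_EPS)*2.0, slightly above.
 * For negative values the two factors are swapped.
 */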
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
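/* Layout note (summarizing the comments on the functions below): the plain
 * and packed-coordinate routines store one nbnxn_bb_t per cluster in xyz0
 * order (lower then upper corner, each padded to 4 floats), whereas the
 * xxxx routines pack STRIDE_PBB clusters together as xxxx yyyy zzzz for the
 * lower corners followed by xxxx yyyy zzzz for the upper corners, which is
 * convenient for SIMD4 loads of one component of several clusters at once.
 */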
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX4];
    xh = x[XX*c_packX4];
    yl = x[YY*c_packX4];
    yh = x[YY*c_packX4];
    zl = x[ZZ*c_packX4];
    zh = x[ZZ*c_packX4];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX4]);
        xh = std::max(xh, x[j+XX*c_packX4]);
        yl = std::min(yl, x[j+YY*c_packX4]);
        yh = std::max(yh, x[j+YY*c_packX4]);
        zl = std::min(zl, x[j+ZZ*c_packX4]);
        zh = std::max(zh, x[j+ZZ*c_packX4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX8];
    xh = x[XX*c_packX8];
    yl = x[YY*c_packX8];
    yh = x[YY*c_packX8];
    zl = x[ZZ*c_packX8];
    zh = x[ZZ*c_packX8];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX8]);
        xh = std::max(xh, x[j+XX*c_packX8]);
        yl = std::min(yl, x[j+YY*c_packX8]);
        yh = std::max(yh, x[j+YY*c_packX8]);
        zl = std::min(zl, x[j+ZZ*c_packX8]);
        zh = std::max(zh, x[j+ZZ*c_packX8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                                     nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#if NBNXN_SEARCH_BB_SIMD4
        store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
        store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
#else
        bbj[1] = bbj[0];
#endif
    }

#if NBNXN_SEARCH_BB_SIMD4
    store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
    store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
#else
    for (int i = 0; i < NNBSBB_C; i++)
    {
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
    }
#endif
}
#if NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float bb_0_S, bb_1_S;
    Simd4Float x_S;

    bb_0_S = load4(x);
    bb_1_S = bb_0_S;

    for (int i = 1; i < na; i++)
    {
        x_S    = load4(x + i*NNBSBB_C);
        bb_0_S = min(bb_0_S, x_S);
        bb_1_S = max(bb_1_S, x_S);
    }

    store4(&bb->lower[0], bb_0_S);
    store4(&bb->upper[0], bb_1_S);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
                                         float *bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
                                       gmx::ArrayRef<const nbnxn_bb_t>  bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    for (int i = 0; i < grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);
        int c2;
        for (c2 = sc2; c2 < sc2+nc2; c2++)
        {
#if NBNXN_SEARCH_BB_SIMD4
            Simd4Float min_S, max_S;

            min_S = min(load4(&bb[c2*2+0].lower[0]),
                        load4(&bb[c2*2+1].lower[0]));
            max_S = max(load4(&bb[c2*2+0].upper[0]),
                        load4(&bb[c2*2+1].upper[0]));
            store4(&grid->bbj[c2].lower[0], min_S);
            store4(&grid->bbj[c2].upper[0], max_S);
#else
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);
            }
#endif
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
            }
        }
    }
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE               *fp,
                                 const nbnxn_grid_t *grid)
{
    dvec ba;

    clear_dvec(ba);
    for (int c = 0; c < grid->nc; c++)
    {
        for (int d = 0; d < DIM; d++)
        {
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
        }
    }
    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx,
            grid->sy,
            grid->atom_density > 0 ?
            grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]/grid->sx,
            ba[YY]/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE               *fp,
                                   const nbnxn_grid_t *grid)
{
    int  ns;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (int c = 0; c < grid->nc; c++)
    {
#if NBNXN_BBXXXX
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
        {
            int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
                    ba[d] +=
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d*STRIDE_PBB+i];
                }
            }
        }
#else
        for (int s = 0; s < grid->nsubc[c]; s++)
        {
            int cs = c*c_gpuNumClusterPerCell + s;
            for (int d = 0; d < DIM; d++)
            {
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx/c_gpuNumClusterPerCellX,
            grid->sy/c_gpuNumClusterPerCellY,
            grid->atom_density > 0 ?
            grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*c_gpuNumClusterPerCellX/grid->sx,
            ba[YY]*c_gpuNumClusterPerCellY/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*c_gpuNumClusterPerCellZ)) : 0.0);
}
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int        numAtomsInCluster,
                                 int        atomStart,
                                 int        atomEnd,
                                 const int *atinfo,
                                 int       *order,
                                 int       *flags)
{
    constexpr int c_maxNumAtomsInCluster = 8;
    int           sort1[c_maxNumAtomsInCluster];
    int           sort2[c_maxNumAtomsInCluster];

    GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");

    *flags = 0;

    int subc = 0;
    for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        int      n1       = 0;
        int      n2       = 0;
        gmx_bool haveQ    = FALSE;
        int      a_lj_max = -1;
        for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= numAtomsInCluster)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= numAtomsInCluster)
                {
                    for (int i = 0; i < n1; i++)
                    {
                        order[atomStart + i] = sort1[i];
                    }
                    for (int j = 0; j < n2; j++)
                    {
                        order[atomStart + n1 + j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
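/* Illustrative example for sort_cluster_on_flag(): with a cluster of
 * numAtomsInCluster = 8 atoms of which n1 = 3 have LJ parameters,
 * 2*n1 <= 8 holds, so the LJ atoms can be moved to the front of the
 * cluster and NBNXN_CI_HALF_LJ is set in addition to NBNXN_CI_DO_LJ,
 * allowing kernels to skip LJ work for the second half of the cluster.
 * With n1 = 5 the cluster order is left untouched and only NBNXN_CI_DO_LJ
 * (plus NBNXN_CI_DO_COUL if any atom has charge) is set.
 */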
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(const nbnxn_search_t   nbs,
                      nbnxn_grid_t          *grid,
                      nbnxn_atomdata_t      *nbat,
                      int                    atomStart,
                      int                    atomEnd,
                      const int             *atinfo,
                      const rvec            *x,
                      nbnxn_bb_t gmx_unused *bb_work_aligned)
{
    const int numAtoms = atomEnd - atomStart;

    if (grid->bSimple)
    {
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
                             grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
    }

    if (nbs->bFEP)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
        grid->fep[cell] = 0;
        for (int at = atomStart; at < atomEnd; at++)
        {
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
            {
                grid->fep[cell] |= (1 << (at - atomStart));
            }
        }
    }

    /* Now we have sorted the atoms, set the cell indices */
    for (int at = atomStart; at < atomEnd; at++)
    {
        nbs->cell[nbs->a[at]] = at;
    }

    copy_rvec_to_nbat_real(nbs->a + atomStart, numAtoms, grid->na_c, x,
                           nbat->XFormat, nbat->x, atomStart);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

#if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
                                          grid->bbj.data() + offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

        calc_bounding_box_x_x8(numAtoms, nbat->x + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
    }
#if NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        float *pbb_ptr =
            grid->pbb.data() +
            ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));

#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
                                   pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);

        calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t  nbs,
                                int                   dd_zone,
                                nbnxn_grid_t         *grid,
                                int atomStart, int atomEnd,
                                const int            *atinfo,
                                const rvec           *x,
                                nbnxn_atomdata_t     *nbat,
                                int cxy_start, int cxy_end,
                                int                  *sort_work)
{
    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, na, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], ncz*grid->na_sc,
                   sort_work);

        /* Fill the ncz cells in this column */
        int cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
        {
            int c     = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na - (ash_c - ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c + na_c, atinfo, x,
                      nullptr);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
        }

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash + ind] = -1;
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t  nbs,
                                  int                   dd_zone,
                                  nbnxn_grid_t         *grid,
                                  int atomStart, int atomEnd,
                                  const int            *atinfo,
                                  const rvec           *x,
                                  nbnxn_atomdata_t     *nbat,
                                  int cxy_start, int cxy_end,
                                  int                  *sort_work)
{
    nbnxn_bb_t  bb_work_array[2];
    nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    int subdiv_x = grid->na_c;
    int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
    int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions.
     * Loop over all columns on the x/y grid.
     */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int gridX            = cxy/grid->ncy;
        int gridY            = cxy - gridX*grid->ncy;

        int numAtomsInColumn = grid->cxy_na[cxy];
        int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
        int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a + ash, numAtomsInColumn, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
                   sort_work);

        /* This loop goes over the cells and clusters along z at once */
        for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
        {
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
            int cz    = -1;
            /* We have already sorted on z */

            if (sub_z % c_gpuNumClusterPerCellZ == 0)
            {
                cz = sub_z/c_gpuNumClusterPerCellZ;
                int cell = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this cell/super-cluster */
                int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));

                grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
                                             (numAtoms + grid->na_c - 1)/grid->na_c);

                /* Store the z-boundaries of the bounding box of the cell */
                grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
            }

            if (c_gpuNumClusterPerCellY > 1)
            {
                /* Sort the atoms along y */
                sort_atoms(YY, (sub_z & 1), dd_zone,
                           nbs->a + ash_z, na_z, x,
                           grid->c0[YY] + gridY*grid->sy,
                           grid->inv_sy, subdiv_z,
                           sort_work);
            }

            for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
            {
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));

                if (c_gpuNumClusterPerCellX > 1)
                {
                    /* Sort the atoms along x */
                    sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
                               nbs->a + ash_y, na_y, x,
                               grid->c0[XX] + gridX*grid->sx,
                               grid->inv_sx, subdiv_y,
                               sort_work);
                }

                for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
                {
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x + na_x, atinfo, x,
                              bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
        {
            nbs->a[ash + ind] = -1;
        }
    }
}
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int atomStart, int atomEnd,
                                const rvec *x,
                                int dd_zone, const int *move,
                                int thread, int nthread,
                                int *cell,
                                int *cxy_na)
{
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
    int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
    if (dd_zone == 0)
    {
        /* Home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            if (move == nullptr || move[i] >= 0)
            {
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
                }
                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->ncx - 1);
                cy = std::min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the grid-local
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;
            }

            cxy_na[cell[i]]++;
        }
    }
    else
    {
        /* Non-home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->ncx - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the grid-local
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

            cxy_na[cell[i]]++;
        }
    }
}
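/* Worked example for calc_column_indices(): on a grid with ncy = 5 columns
 * along y, an atom that falls in column cx = 2, cy = 3 gets the flattened
 * column index cell[i] = cx*ncy + cy = 13. The ordering along z within the
 * column is only established later by sort_columns_simple/supersub().
 */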
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t  nbs,
                              int                   ddZone,
                              nbnxn_grid_t         *grid,
                              int                   atomStart,
                              int                   atomEnd,
                              const int            *atinfo,
                              const rvec           *x,
                              const int            *move,
                              nbnxn_atomdata_t     *nbat)
{
    const int nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Make the cell index as a function of x and y */
    int ncz_max      = 0;
    int ncz          = 0;
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->ncx*grid->ncy + 1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy which are moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        int cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            int i = 0;
            for (int cy = 0; cy < grid->ncy; cy++)
            {
                for (int cx = 0; cx < grid->ncx; cx++)
                {
                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for (int thread = 0; thread < nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
            {
                nbs->work[thread].sort_work[i] = -1;
            }
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = atomStart; i < atomEnd; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        int cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    if (ddZone == 0)
    {
        /* Set the cell indices for the moved particles */
        int n0 = grid->nc*grid->na_sc;
        int n1 = grid->nc*grid->na_sc + grid->cxy_na[grid->ncx*grid->ncy];
        for (int i = n0; i < n1; i++)
        {
            nbs->cell[nbs->a[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            if (grid->bSimple)
            {
                sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                    ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                    ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                    nbs->work[thread].sort_work);
            }
            else
            {
                sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                      ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                      ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                      nbs->work[thread].sort_work);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb);
    }

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug, grid);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (atomEnd - atomStart)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, grid);
        }
    }
}
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t    nbs,
                       int ePBC, const matrix box,
                       int               ddZone,
                       const rvec        lowerCorner,
                       const rvec        upperCorner,
                       int atomStart, int atomEnd,
                       real              atomDensity,
                       const int        *atinfo,
                       const rvec       *x,
                       int               numAtomsMoved,
                       const int        *move,
                       int               nb_kernel_type,
                       nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid = &nbs->grid[ddZone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

    if (ddZone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
            nbs->grid[ddZone - 1].na_sc/grid->na_sc;
    }

    const int n = atomEnd - atomStart;

    if (ddZone == 0)
    {
        nbs->ePBC = ePBC;
        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atomDensity > 0)
        {
            grid->atom_density = atomDensity;
        }
        else
        {
            grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
        }

        nbs->natoms_local    = atomEnd - numAtomsMoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (ddZone=0).
         */
        nbs->natoms_nonlocal = atomEnd - numAtomsMoved;

        if (debug)
        {
            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);
        }
    }
    else
    {
        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
    }

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    int maxNumCellsOnThisGrid =
        set_grid_size_xy(nbs, grid,
                         ddZone, n - numAtomsMoved,
                         lowerCorner, upperCorner,
                         nbs->grid[0].atom_density);

    int maxNumCells = grid->cell0 + maxNumCellsOnThisGrid;

    if (atomEnd > nbs->cell_nalloc)
    {
        nbs->cell_nalloc = over_alloc_large(atomEnd);
        srenew(nbs->cell, nbs->cell_nalloc);
    }

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (maxNumCells*grid->na_sc + numAtomsMoved > nbs->a_nalloc)
    {
        nbs->a_nalloc = over_alloc_large(maxNumCells*grid->na_sc + numAtomsMoved);
        srenew(nbs->a, nbs->a_nalloc);
    }

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
    {
        nbnxn_atomdata_realloc(nbat,
                               maxNumCells*grid->na_sc + NBNXN_BUFFERFLAG_SIZE);
    }

    calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, move, nbat);

    if (ddZone == 0)
    {
        nbat->natoms_local = nbat->natoms;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
                                const struct gmx_domdec_zones_t *zones,
                                const int                       *atinfo,
                                const rvec                      *x,
                                int                              nb_kernel_type,
                                nbnxn_atomdata_t                *nbat)
{
    for (int zone = 1; zone < zones->n; zone++)
    {
        rvec c0, c1;

        for (int d = 0; d < DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
                          zone, c0, c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0, nullptr,
                          nb_kernel_type,
                          nbat);
    }
}
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
{
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;
}
void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
{
    const nbnxn_grid_t *grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *a = nbs->a;

    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
}
void nbnxn_set_atomorder(nbnxn_search_t nbs)
{
    /* Set the atom order for the home cell (index 0) */
    nbnxn_grid_t *grid = &nbs->grid[0];

    for (int cx = 0; cx < grid->ncx; cx++)
    {
        for (int cy = 0; cy < grid->ncy; cy++)
        {
            int cxy = cx*grid->ncy + cy;
            int j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)