2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
48 #include "gromacs/domdec/domdec_struct.h"
49 #include "gromacs/gmxlib/nrnb.h"
50 #include "gromacs/math/functions.h"
51 #include "gromacs/math/utilities.h"
52 #include "gromacs/math/vec.h"
53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
54 #include "gromacs/mdtypes/group.h"
55 #include "gromacs/mdtypes/md_enums.h"
56 #include "gromacs/nbnxm/atomdata.h"
57 #include "gromacs/nbnxm/gpu_data_mgmt.h"
58 #include "gromacs/nbnxm/nbnxm_geometry.h"
59 #include "gromacs/nbnxm/nbnxm_simd.h"
60 #include "gromacs/pbcutil/ishift.h"
61 #include "gromacs/pbcutil/pbc.h"
62 #include "gromacs/simd/simd.h"
63 #include "gromacs/simd/vector_operations.h"
64 #include "gromacs/topology/block.h"
65 #include "gromacs/utility/exceptions.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/gmxomp.h"
68 #include "gromacs/utility/smalloc.h"
70 #include "boundingboxes.h"
71 #include "clusterdistancekerneltype.h"
73 #include "pairlistset.h"
74 #include "pairlistsets.h"
75 #include "pairlistwork.h"
76 #include "pairsearch.h"
78 using namespace gmx
; // TODO: Remove when this file is moved into gmx namespace
80 using BoundingBox
= Nbnxm::BoundingBox
; // TODO: Remove when refactoring this file
81 using BoundingBox1D
= Nbnxm::BoundingBox1D
; // TODO: Remove when refactoring this file
83 using Grid
= Nbnxm::Grid
; // TODO: Remove when refactoring this file
85 // Convience alias for partial Nbnxn namespace usage
86 using InteractionLocality
= Nbnxm::InteractionLocality
;
88 /* We shift the i-particles backward for PBC.
89 * This leads to more conditionals than shifting forward.
90 * We do this to get more balanced pair lists.
92 constexpr bool c_pbcShiftBackward
= true;
94 /* Layout for the nonbonded NxN pair lists */
95 enum class NbnxnLayout
97 NoSimd4x4
, // i-cluster size 4, j-cluster size 4
98 Simd4xN
, // i-cluster size 4, j-cluster size SIMD width
99 Simd2xNN
, // i-cluster size 4, j-cluster size half SIMD width
100 Gpu8x8x8
// i-cluster size 8, j-cluster size 8 + super-clustering
104 /* Returns the j-cluster size */
105 template <NbnxnLayout layout
>
106 static constexpr int jClusterSize()
108 static_assert(layout
== NbnxnLayout::NoSimd4x4
|| layout
== NbnxnLayout::Simd4xN
|| layout
== NbnxnLayout::Simd2xNN
, "Currently jClusterSize only supports CPU layouts");
110 return layout
== NbnxnLayout::Simd4xN
? GMX_SIMD_REAL_WIDTH
: (layout
== NbnxnLayout::Simd2xNN
? GMX_SIMD_REAL_WIDTH
/2 : c_nbnxnCpuIClusterSize
);
113 /*! \brief Returns the j-cluster index given the i-cluster index.
115 * \tparam jClusterSize The number of atoms in a j-cluster
116 * \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
117 * \param[in] ci The i-cluster index
119 template <int jClusterSize
, int jSubClusterIndex
>
120 static inline int cjFromCi(int ci
)
122 static_assert(jClusterSize
== c_nbnxnCpuIClusterSize
/2 || jClusterSize
== c_nbnxnCpuIClusterSize
|| jClusterSize
== c_nbnxnCpuIClusterSize
*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
124 static_assert(jSubClusterIndex
== 0 || jSubClusterIndex
== 1,
125 "Only sub-cluster indices 0 and 1 are supported");
127 if (jClusterSize
== c_nbnxnCpuIClusterSize
/2)
129 if (jSubClusterIndex
== 0)
135 return ((ci
+ 1) << 1) - 1;
138 else if (jClusterSize
== c_nbnxnCpuIClusterSize
)
148 /*! \brief Returns the j-cluster index given the i-cluster index.
150 * \tparam layout The pair-list layout
151 * \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
152 * \param[in] ci The i-cluster index
154 template <NbnxnLayout layout
, int jSubClusterIndex
>
155 static inline int cjFromCi(int ci
)
157 constexpr int clusterSize
= jClusterSize
<layout
>();
159 return cjFromCi
<clusterSize
, jSubClusterIndex
>(ci
);
162 /* Returns the nbnxn coordinate data index given the i-cluster index */
163 template <NbnxnLayout layout
>
164 static inline int xIndexFromCi(int ci
)
166 constexpr int clusterSize
= jClusterSize
<layout
>();
168 static_assert(clusterSize
== c_nbnxnCpuIClusterSize
/2 || clusterSize
== c_nbnxnCpuIClusterSize
|| clusterSize
== c_nbnxnCpuIClusterSize
*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
170 if (clusterSize
<= c_nbnxnCpuIClusterSize
)
172 /* Coordinates are stored packed in groups of 4 */
177 /* Coordinates packed in 8, i-cluster size is half the packing width */
178 return (ci
>> 1)*STRIDE_P8
+ (ci
& 1)*(c_packX8
>> 1);
182 /* Returns the nbnxn coordinate data index given the j-cluster index */
183 template <NbnxnLayout layout
>
184 static inline int xIndexFromCj(int cj
)
186 constexpr int clusterSize
= jClusterSize
<layout
>();
188 static_assert(clusterSize
== c_nbnxnCpuIClusterSize
/2 || clusterSize
== c_nbnxnCpuIClusterSize
|| clusterSize
== c_nbnxnCpuIClusterSize
*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
190 if (clusterSize
== c_nbnxnCpuIClusterSize
/2)
192 /* Coordinates are stored packed in groups of 4 */
193 return (cj
>> 1)*STRIDE_P4
+ (cj
& 1)*(c_packX4
>> 1);
195 else if (clusterSize
== c_nbnxnCpuIClusterSize
)
197 /* Coordinates are stored packed in groups of 4 */
202 /* Coordinates are stored packed in groups of 8 */
209 void nbnxn_init_pairlist_fep(t_nblist
*nl
)
211 nl
->type
= GMX_NBLIST_INTERACTION_FREE_ENERGY
;
212 nl
->igeometry
= GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE
;
213 /* The interaction functions are set in the free energy kernel fuction */
226 nl
->jindex
= nullptr;
228 nl
->excl_fep
= nullptr;
232 static void init_buffer_flags(nbnxn_buffer_flags_t
*flags
,
235 flags
->nflag
= (natoms
+ NBNXN_BUFFERFLAG_SIZE
- 1)/NBNXN_BUFFERFLAG_SIZE
;
236 if (flags
->nflag
> flags
->flag_nalloc
)
238 flags
->flag_nalloc
= over_alloc_large(flags
->nflag
);
239 srenew(flags
->flag
, flags
->flag_nalloc
);
241 for (int b
= 0; b
< flags
->nflag
; b
++)
243 bitmask_clear(&(flags
->flag
[b
]));
247 /* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
249 * Given a cutoff distance between atoms, this functions returns the cutoff
250 * distance2 between a bounding box of a group of atoms and a grid cell.
251 * Since atoms can be geometrically outside of the cell they have been
252 * assigned to (when atom groups instead of individual atoms are assigned
253 * to cells), this distance returned can be larger than the input.
256 listRangeForBoundingBoxToGridCell(real rlist
,
257 const Grid::Dimensions
&gridDims
)
259 return rlist
+ gridDims
.maxAtomGroupRadius
;
262 /* Returns the pair-list cutoff between a grid cells given an atom-to-atom pair-list cutoff
264 * Given a cutoff distance between atoms, this functions returns the cutoff
265 * distance2 between two grid cells.
266 * Since atoms can be geometrically outside of the cell they have been
267 * assigned to (when atom groups instead of individual atoms are assigned
268 * to cells), this distance returned can be larger than the input.
271 listRangeForGridCellToGridCell(real rlist
,
272 const Grid::Dimensions
&iGridDims
,
273 const Grid::Dimensions
&jGridDims
)
275 return rlist
+ iGridDims
.maxAtomGroupRadius
+ jGridDims
.maxAtomGroupRadius
;
278 /* Determines the cell range along one dimension that
279 * the bounding box b0 - b1 sees.
282 static void get_cell_range(real b0
, real b1
,
283 const Grid::Dimensions
&jGridDims
,
284 real d2
, real rlist
, int *cf
, int *cl
)
286 real listRangeBBToCell2
= gmx::square(listRangeForBoundingBoxToGridCell(rlist
, jGridDims
));
287 real distanceInCells
= (b0
- jGridDims
.lowerCorner
[dim
])*jGridDims
.invCellSize
[dim
];
288 *cf
= std::max(static_cast<int>(distanceInCells
), 0);
291 d2
+ gmx::square((b0
- jGridDims
.lowerCorner
[dim
]) - (*cf
- 1 + 1)*jGridDims
.cellSize
[dim
]) < listRangeBBToCell2
)
296 *cl
= std::min(static_cast<int>((b1
- jGridDims
.lowerCorner
[dim
])*jGridDims
.invCellSize
[dim
]), jGridDims
.numCells
[dim
] - 1);
297 while (*cl
< jGridDims
.numCells
[dim
] - 1 &&
298 d2
+ gmx::square((*cl
+ 1)*jGridDims
.cellSize
[dim
] - (b1
- jGridDims
.lowerCorner
[dim
])) < listRangeBBToCell2
)
304 /* Reference code calculating the distance^2 between two bounding boxes */
306 static float box_dist2(float bx0, float bx1, float by0,
307 float by1, float bz0, float bz1,
308 const BoundingBox *bb)
311 float dl, dh, dm, dm0;
315 dl = bx0 - bb->upper.x;
316 dh = bb->lower.x - bx1;
317 dm = std::max(dl, dh);
318 dm0 = std::max(dm, 0.0f);
321 dl = by0 - bb->upper.y;
322 dh = bb->lower.y - by1;
323 dm = std::max(dl, dh);
324 dm0 = std::max(dm, 0.0f);
327 dl = bz0 - bb->upper.z;
328 dh = bb->lower.z - bz1;
329 dm = std::max(dl, dh);
330 dm0 = std::max(dm, 0.0f);
337 #if !NBNXN_SEARCH_BB_SIMD4
339 /*! \brief Plain C code calculating the distance^2 between two bounding boxes in xyz0 format
341 * \param[in] bb_i First bounding box
342 * \param[in] bb_j Second bounding box
344 static float clusterBoundingBoxDistance2(const BoundingBox
&bb_i
,
345 const BoundingBox
&bb_j
)
347 float dl
= bb_i
.lower
.x
- bb_j
.upper
.x
;
348 float dh
= bb_j
.lower
.x
- bb_i
.upper
.x
;
349 float dm
= std::max(dl
, dh
);
350 float dm0
= std::max(dm
, 0.0F
);
353 dl
= bb_i
.lower
.y
- bb_j
.upper
.y
;
354 dh
= bb_j
.lower
.y
- bb_i
.upper
.y
;
355 dm
= std::max(dl
, dh
);
356 dm0
= std::max(dm
, 0.0F
);
359 dl
= bb_i
.lower
.z
- bb_j
.upper
.z
;
360 dh
= bb_j
.lower
.z
- bb_i
.upper
.z
;
361 dm
= std::max(dl
, dh
);
362 dm0
= std::max(dm
, 0.0F
);
368 #else /* NBNXN_SEARCH_BB_SIMD4 */
370 /*! \brief 4-wide SIMD code calculating the distance^2 between two bounding boxes in xyz0 format
372 * \param[in] bb_i First bounding box, should be aligned for 4-wide SIMD
373 * \param[in] bb_j Second bounding box, should be aligned for 4-wide SIMD
375 static float clusterBoundingBoxDistance2(const BoundingBox
&bb_i
,
376 const BoundingBox
&bb_j
)
378 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
381 const Simd4Float bb_i_S0
= load4(bb_i
.lower
.ptr());
382 const Simd4Float bb_i_S1
= load4(bb_i
.upper
.ptr());
383 const Simd4Float bb_j_S0
= load4(bb_j
.lower
.ptr());
384 const Simd4Float bb_j_S1
= load4(bb_j
.upper
.ptr());
386 const Simd4Float dl_S
= bb_i_S0
- bb_j_S1
;
387 const Simd4Float dh_S
= bb_j_S0
- bb_i_S1
;
389 const Simd4Float dm_S
= max(dl_S
, dh_S
);
390 const Simd4Float dm0_S
= max(dm_S
, simd4SetZeroF());
392 return dotProduct(dm0_S
, dm0_S
);
395 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
396 template <int boundingBoxStart
>
397 static inline void gmx_simdcall
398 clusterBoundingBoxDistance2_xxxx_simd4_inner(const float *bb_i
,
400 const Simd4Float xj_l
,
401 const Simd4Float yj_l
,
402 const Simd4Float zj_l
,
403 const Simd4Float xj_h
,
404 const Simd4Float yj_h
,
405 const Simd4Float zj_h
)
407 constexpr int stride
= c_packedBoundingBoxesDimSize
;
409 const int shi
= boundingBoxStart
*Nbnxm::c_numBoundingBoxBounds1D
*DIM
;
411 const Simd4Float zero
= setZero();
413 const Simd4Float xi_l
= load4(bb_i
+ shi
+ 0*stride
);
414 const Simd4Float yi_l
= load4(bb_i
+ shi
+ 1*stride
);
415 const Simd4Float zi_l
= load4(bb_i
+ shi
+ 2*stride
);
416 const Simd4Float xi_h
= load4(bb_i
+ shi
+ 3*stride
);
417 const Simd4Float yi_h
= load4(bb_i
+ shi
+ 4*stride
);
418 const Simd4Float zi_h
= load4(bb_i
+ shi
+ 5*stride
);
420 const Simd4Float dx_0
= xi_l
- xj_h
;
421 const Simd4Float dy_0
= yi_l
- yj_h
;
422 const Simd4Float dz_0
= zi_l
- zj_h
;
424 const Simd4Float dx_1
= xj_l
- xi_h
;
425 const Simd4Float dy_1
= yj_l
- yi_h
;
426 const Simd4Float dz_1
= zj_l
- zi_h
;
428 const Simd4Float mx
= max(dx_0
, dx_1
);
429 const Simd4Float my
= max(dy_0
, dy_1
);
430 const Simd4Float mz
= max(dz_0
, dz_1
);
432 const Simd4Float m0x
= max(mx
, zero
);
433 const Simd4Float m0y
= max(my
, zero
);
434 const Simd4Float m0z
= max(mz
, zero
);
436 const Simd4Float d2x
= m0x
* m0x
;
437 const Simd4Float d2y
= m0y
* m0y
;
438 const Simd4Float d2z
= m0z
* m0z
;
440 const Simd4Float d2s
= d2x
+ d2y
;
441 const Simd4Float d2t
= d2s
+ d2z
;
443 store4(d2
+ boundingBoxStart
, d2t
);
446 /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
448 clusterBoundingBoxDistance2_xxxx_simd4(const float *bb_j
,
453 constexpr int stride
= c_packedBoundingBoxesDimSize
;
455 // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
458 const Simd4Float xj_l
= Simd4Float(bb_j
[0*stride
]);
459 const Simd4Float yj_l
= Simd4Float(bb_j
[1*stride
]);
460 const Simd4Float zj_l
= Simd4Float(bb_j
[2*stride
]);
461 const Simd4Float xj_h
= Simd4Float(bb_j
[3*stride
]);
462 const Simd4Float yj_h
= Simd4Float(bb_j
[4*stride
]);
463 const Simd4Float zj_h
= Simd4Float(bb_j
[5*stride
]);
465 /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
466 * But as we know the number of iterations is 1 or 2, we unroll manually.
468 clusterBoundingBoxDistance2_xxxx_simd4_inner
<0>(bb_i
, d2
,
473 clusterBoundingBoxDistance2_xxxx_simd4_inner
<stride
>(bb_i
, d2
,
479 #endif /* NBNXN_SEARCH_BB_SIMD4 */
482 /* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
483 static inline gmx_bool
484 clusterpair_in_range(const NbnxnPairlistGpuWork
&work
,
486 int csj
, int stride
, const real
*x_j
,
489 #if !GMX_SIMD4_HAVE_REAL
492 * All coordinates are stored as xyzxyz...
495 const real
*x_i
= work
.iSuperClusterData
.x
.data();
497 for (int i
= 0; i
< c_nbnxnGpuClusterSize
; i
++)
499 int i0
= (si
*c_nbnxnGpuClusterSize
+ i
)*DIM
;
500 for (int j
= 0; j
< c_nbnxnGpuClusterSize
; j
++)
502 int j0
= (csj
*c_nbnxnGpuClusterSize
+ j
)*stride
;
504 real d2
= gmx::square(x_i
[i0
] - x_j
[j0
]) + gmx::square(x_i
[i0
+1] - x_j
[j0
+1]) + gmx::square(x_i
[i0
+2] - x_j
[j0
+2]);
515 #else /* !GMX_SIMD4_HAVE_REAL */
517 /* 4-wide SIMD version.
518 * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
519 * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
521 static_assert(c_nbnxnGpuClusterSize
== 8 || c_nbnxnGpuClusterSize
== 4,
522 "A cluster is hard-coded to 4/8 atoms.");
524 Simd4Real rc2_S
= Simd4Real(rlist2
);
526 const real
*x_i
= work
.iSuperClusterData
.xSimd
.data();
528 int dim_stride
= c_nbnxnGpuClusterSize
*DIM
;
529 Simd4Real ix_S0
= load4(x_i
+ si
*dim_stride
+ 0*GMX_SIMD4_WIDTH
);
530 Simd4Real iy_S0
= load4(x_i
+ si
*dim_stride
+ 1*GMX_SIMD4_WIDTH
);
531 Simd4Real iz_S0
= load4(x_i
+ si
*dim_stride
+ 2*GMX_SIMD4_WIDTH
);
533 Simd4Real ix_S1
, iy_S1
, iz_S1
;
534 if (c_nbnxnGpuClusterSize
== 8)
536 ix_S1
= load4(x_i
+ si
*dim_stride
+ 3*GMX_SIMD4_WIDTH
);
537 iy_S1
= load4(x_i
+ si
*dim_stride
+ 4*GMX_SIMD4_WIDTH
);
538 iz_S1
= load4(x_i
+ si
*dim_stride
+ 5*GMX_SIMD4_WIDTH
);
540 /* We loop from the outer to the inner particles to maximize
541 * the chance that we find a pair in range quickly and return.
543 int j0
= csj
*c_nbnxnGpuClusterSize
;
544 int j1
= j0
+ c_nbnxnGpuClusterSize
- 1;
547 Simd4Real jx0_S
, jy0_S
, jz0_S
;
548 Simd4Real jx1_S
, jy1_S
, jz1_S
;
550 Simd4Real dx_S0
, dy_S0
, dz_S0
;
551 Simd4Real dx_S1
, dy_S1
, dz_S1
;
552 Simd4Real dx_S2
, dy_S2
, dz_S2
;
553 Simd4Real dx_S3
, dy_S3
, dz_S3
;
564 Simd4Bool wco_any_S01
, wco_any_S23
, wco_any_S
;
566 jx0_S
= Simd4Real(x_j
[j0
*stride
+0]);
567 jy0_S
= Simd4Real(x_j
[j0
*stride
+1]);
568 jz0_S
= Simd4Real(x_j
[j0
*stride
+2]);
570 jx1_S
= Simd4Real(x_j
[j1
*stride
+0]);
571 jy1_S
= Simd4Real(x_j
[j1
*stride
+1]);
572 jz1_S
= Simd4Real(x_j
[j1
*stride
+2]);
574 /* Calculate distance */
575 dx_S0
= ix_S0
- jx0_S
;
576 dy_S0
= iy_S0
- jy0_S
;
577 dz_S0
= iz_S0
- jz0_S
;
578 dx_S2
= ix_S0
- jx1_S
;
579 dy_S2
= iy_S0
- jy1_S
;
580 dz_S2
= iz_S0
- jz1_S
;
581 if (c_nbnxnGpuClusterSize
== 8)
583 dx_S1
= ix_S1
- jx0_S
;
584 dy_S1
= iy_S1
- jy0_S
;
585 dz_S1
= iz_S1
- jz0_S
;
586 dx_S3
= ix_S1
- jx1_S
;
587 dy_S3
= iy_S1
- jy1_S
;
588 dz_S3
= iz_S1
- jz1_S
;
591 /* rsq = dx*dx+dy*dy+dz*dz */
592 rsq_S0
= norm2(dx_S0
, dy_S0
, dz_S0
);
593 rsq_S2
= norm2(dx_S2
, dy_S2
, dz_S2
);
594 if (c_nbnxnGpuClusterSize
== 8)
596 rsq_S1
= norm2(dx_S1
, dy_S1
, dz_S1
);
597 rsq_S3
= norm2(dx_S3
, dy_S3
, dz_S3
);
600 wco_S0
= (rsq_S0
< rc2_S
);
601 wco_S2
= (rsq_S2
< rc2_S
);
602 if (c_nbnxnGpuClusterSize
== 8)
604 wco_S1
= (rsq_S1
< rc2_S
);
605 wco_S3
= (rsq_S3
< rc2_S
);
607 if (c_nbnxnGpuClusterSize
== 8)
609 wco_any_S01
= wco_S0
|| wco_S1
;
610 wco_any_S23
= wco_S2
|| wco_S3
;
611 wco_any_S
= wco_any_S01
|| wco_any_S23
;
615 wco_any_S
= wco_S0
|| wco_S2
;
618 if (anyTrue(wco_any_S
))
629 #endif /* !GMX_SIMD4_HAVE_REAL */
632 /* Returns the j-cluster index for index cjIndex in a cj list */
633 static inline int nblCj(gmx::ArrayRef
<const nbnxn_cj_t
> cjList
,
636 return cjList
[cjIndex
].cj
;
639 /* Returns the j-cluster index for index cjIndex in a cj4 list */
640 static inline int nblCj(gmx::ArrayRef
<const nbnxn_cj4_t
> cj4List
,
643 return cj4List
[cjIndex
/c_nbnxnGpuJgroupSize
].cj
[cjIndex
& (c_nbnxnGpuJgroupSize
- 1)];
646 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
647 static unsigned int nbl_imask0(const NbnxnPairlistGpu
*nbl
, int cj_ind
)
649 return nbl
->cj4
[cj_ind
/c_nbnxnGpuJgroupSize
].imei
[0].imask
;
652 NbnxnPairlistCpu::NbnxnPairlistCpu() :
653 na_ci(c_nbnxnCpuIClusterSize
),
658 work(std::make_unique
<NbnxnPairlistCpuWork
>())
662 NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy
) :
663 na_ci(c_nbnxnGpuClusterSize
),
664 na_cj(c_nbnxnGpuClusterSize
),
665 na_sc(c_gpuNumClusterPerCell
*c_nbnxnGpuClusterSize
),
667 sci({}, {pinningPolicy
}),
668 cj4({}, {pinningPolicy
}),
669 excl({}, {pinningPolicy
}),
671 work(std::make_unique
<NbnxnPairlistGpuWork
>())
673 static_assert(c_nbnxnGpuNumClusterPerSupercluster
== c_gpuNumClusterPerCell
,
674 "The search code assumes that the a super-cluster matches a search grid cell");
676 static_assert(sizeof(cj4
[0].imei
[0].imask
)*8 >= c_nbnxnGpuJgroupSize
*c_gpuNumClusterPerCell
,
677 "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
679 static_assert(sizeof(excl
[0])*8 >= c_nbnxnGpuJgroupSize
*c_gpuNumClusterPerCell
, "The GPU exclusion mask does not contain a sufficient number of bits");
681 // We always want a first entry without any exclusions
685 // TODO: Move to pairlistset.cpp
686 PairlistSet::PairlistSet(const Nbnxm::InteractionLocality locality
,
687 const PairlistParams
&pairlistParams
) :
689 params_(pairlistParams
)
692 (params_
.pairlistType
== PairlistType::Simple4x2
||
693 params_
.pairlistType
== PairlistType::Simple4x4
||
694 params_
.pairlistType
== PairlistType::Simple4x8
);
695 // Currently GPU lists are always combined
696 combineLists_
= !isCpuType_
;
698 const int numLists
= gmx_omp_nthreads_get(emntNonbonded
);
700 if (!combineLists_
&&
701 numLists
> NBNXN_BUFFERFLAG_MAX_THREADS
)
703 gmx_fatal(FARGS
, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
704 numLists
, NBNXN_BUFFERFLAG_MAX_THREADS
, NBNXN_BUFFERFLAG_MAX_THREADS
);
709 cpuLists_
.resize(numLists
);
712 cpuListsWork_
.resize(numLists
);
717 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
718 gpuLists_
.emplace_back(gmx::PinningPolicy::PinnedIfSupported
);
719 /* Lists 0 to numLists are use for constructing lists in parallel
720 * on the CPU using numLists threads (and then merged into list 0).
722 for (int i
= 1; i
< numLists
; i
++)
724 gpuLists_
.emplace_back(gmx::PinningPolicy::CannotBePinned
);
729 fepLists_
.resize(numLists
);
731 /* Execute in order to avoid memory interleaving between threads */
732 #pragma omp parallel for num_threads(numLists) schedule(static)
733 for (int i
= 0; i
< numLists
; i
++)
737 /* We used to allocate all normal lists locally on each thread
738 * as well. The question is if allocating the object on the
739 * master thread (but all contained list memory thread local)
740 * impacts performance.
742 fepLists_
[i
] = std::make_unique
<t_nblist
>();
743 nbnxn_init_pairlist_fep(fepLists_
[i
].get());
745 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
750 /* Print statistics of a pair list, used for debug output */
751 static void print_nblist_statistics(FILE *fp
,
752 const NbnxnPairlistCpu
&nbl
,
753 const Nbnxm::GridSet
&gridSet
,
756 const Grid
&grid
= gridSet
.grids()[0];
757 const Grid::Dimensions
&dims
= grid
.dimensions();
759 fprintf(fp
, "nbl nci %zu ncj %d\n",
760 nbl
.ci
.size(), nbl
.ncjInUse
);
761 const int numAtomsJCluster
= grid
.geometry().numAtomsJCluster
;
762 const double numAtomsPerCell
= nbl
.ncjInUse
/static_cast<double>(grid
.numCells())*numAtomsJCluster
;
763 fprintf(fp
, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
764 nbl
.na_cj
, rl
, nbl
.ncjInUse
, nbl
.ncjInUse
/static_cast<double>(grid
.numCells()),
766 numAtomsPerCell
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
.numCells()*numAtomsJCluster
/(dims
.gridSize
[XX
]*dims
.gridSize
[YY
]*dims
.gridSize
[ZZ
])));
768 fprintf(fp
, "nbl average j cell list length %.1f\n",
769 0.25*nbl
.ncjInUse
/std::max(static_cast<double>(nbl
.ci
.size()), 1.0));
771 int cs
[SHIFTS
] = { 0 };
773 for (const nbnxn_ci_t
&ciEntry
: nbl
.ci
)
775 cs
[ciEntry
.shift
& NBNXN_CI_SHIFT
] +=
776 ciEntry
.cj_ind_end
- ciEntry
.cj_ind_start
;
778 int j
= ciEntry
.cj_ind_start
;
779 while (j
< ciEntry
.cj_ind_end
&&
780 nbl
.cj
[j
].excl
!= NBNXN_INTERACTION_MASK_ALL
)
786 fprintf(fp
, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
787 nbl
.cj
.size(), npexcl
, 100*npexcl
/std::max(static_cast<double>(nbl
.cj
.size()), 1.0));
788 for (int s
= 0; s
< SHIFTS
; s
++)
792 fprintf(fp
, "nbl shift %2d ncj %3d\n", s
, cs
[s
]);
797 /* Print statistics of a pair lists, used for debug output */
798 static void print_nblist_statistics(FILE *fp
,
799 const NbnxnPairlistGpu
&nbl
,
800 const Nbnxm::GridSet
&gridSet
,
803 const Grid
&grid
= gridSet
.grids()[0];
804 const Grid::Dimensions
&dims
= grid
.dimensions();
806 fprintf(fp
, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
807 nbl
.sci
.size(), nbl
.cj4
.size(), nbl
.nci_tot
, nbl
.excl
.size());
808 const int numAtomsCluster
= grid
.geometry().numAtomsICluster
;
809 const double numAtomsPerCell
= nbl
.nci_tot
/static_cast<double>(grid
.numClusters())*numAtomsCluster
;
810 fprintf(fp
, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
811 nbl
.na_ci
, rl
, nbl
.nci_tot
, nbl
.nci_tot
/static_cast<double>(grid
.numClusters()),
813 numAtomsPerCell
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
.numClusters()*numAtomsCluster
/(dims
.gridSize
[XX
]*dims
.gridSize
[YY
]*dims
.gridSize
[ZZ
])));
818 int c
[c_gpuNumClusterPerCell
+ 1] = { 0 };
819 for (const nbnxn_sci_t
&sci
: nbl
.sci
)
822 for (int j4
= sci
.cj4_ind_start
; j4
< sci
.cj4_ind_end
; j4
++)
824 for (int j
= 0; j
< c_nbnxnGpuJgroupSize
; j
++)
827 for (int si
= 0; si
< c_gpuNumClusterPerCell
; si
++)
829 if (nbl
.cj4
[j4
].imei
[0].imask
& (1U << (j
*c_gpuNumClusterPerCell
+ si
)))
840 nsp_max
= std::max(nsp_max
, nsp
);
842 if (!nbl
.sci
.empty())
844 sum_nsp
/= nbl
.sci
.size();
845 sum_nsp2
/= nbl
.sci
.size();
847 fprintf(fp
, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
848 sum_nsp
, std::sqrt(sum_nsp2
- sum_nsp
*sum_nsp
), nsp_max
);
850 if (!nbl
.cj4
.empty())
852 for (int b
= 0; b
<= c_gpuNumClusterPerCell
; b
++)
854 fprintf(fp
, "nbl j-list #i-subcell %d %7d %4.1f\n",
855 b
, c
[b
], 100.0*c
[b
]/size_t {nbl
.cj4
.size()*c_nbnxnGpuJgroupSize
});
860 /* Returns a reference to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
861 * Generates a new exclusion entry when the j-cluster-group uses
862 * the default all-interaction mask at call time, so the returned mask
863 * can be modified when needed.
865 static nbnxn_excl_t
&get_exclusion_mask(NbnxnPairlistGpu
*nbl
,
869 if (nbl
->cj4
[cj4
].imei
[warp
].excl_ind
== 0)
871 /* No exclusions set, make a new list entry */
872 const size_t oldSize
= nbl
->excl
.size();
873 GMX_ASSERT(oldSize
>= 1, "We should always have entry [0]");
874 /* Add entry with default values: no exclusions */
875 nbl
->excl
.resize(oldSize
+ 1);
876 nbl
->cj4
[cj4
].imei
[warp
].excl_ind
= oldSize
;
879 return nbl
->excl
[nbl
->cj4
[cj4
].imei
[warp
].excl_ind
];
882 /* Sets self exclusions and excludes half of the double pairs in the self cluster-pair \p nbl->cj4[cj4Index].cj[jOffsetInGroup]
884 * \param[in,out] nbl The cluster pair list
885 * \param[in] cj4Index The j-cluster group index into \p nbl->cj4
886 * \param[in] jOffsetInGroup The j-entry offset in \p nbl->cj4[cj4Index]
887 * \param[in] iClusterInCell The i-cluster index in the cell
890 setSelfAndNewtonExclusionsGpu(NbnxnPairlistGpu
*nbl
,
892 const int jOffsetInGroup
,
893 const int iClusterInCell
)
895 constexpr int numJatomsPerPart
= c_nbnxnGpuClusterSize
/c_nbnxnGpuClusterpairSplit
;
897 /* The exclusions are stored separately for each part of the split */
898 for (int part
= 0; part
< c_nbnxnGpuClusterpairSplit
; part
++)
900 const int jOffset
= part
*numJatomsPerPart
;
901 /* Make a new exclusion mask entry for each part, if we don't already have one yet */
902 nbnxn_excl_t
&excl
= get_exclusion_mask(nbl
, cj4Index
, part
);
904 /* Set all bits with j-index <= i-index */
905 for (int jIndexInPart
= 0; jIndexInPart
< numJatomsPerPart
; jIndexInPart
++)
907 for (int i
= jOffset
+ jIndexInPart
; i
< c_nbnxnGpuClusterSize
; i
++)
909 excl
.pair
[jIndexInPart
*c_nbnxnGpuClusterSize
+ i
] &=
910 ~(1U << (jOffsetInGroup
*c_gpuNumClusterPerCell
+ iClusterInCell
));
916 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
917 static unsigned int get_imask(gmx_bool rdiag
, int ci
, int cj
)
919 return (rdiag
&& ci
== cj
? NBNXN_INTERACTION_MASK_DIAG
: NBNXN_INTERACTION_MASK_ALL
);
922 /* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
923 gmx_unused
static unsigned int get_imask_simd_j2(gmx_bool rdiag
, int ci
, int cj
)
925 return (rdiag
&& ci
*2 == cj
? NBNXN_INTERACTION_MASK_DIAG_J2_0
:
926 (rdiag
&& ci
*2+1 == cj
? NBNXN_INTERACTION_MASK_DIAG_J2_1
:
927 NBNXN_INTERACTION_MASK_ALL
));
930 /* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
931 gmx_unused
static unsigned int get_imask_simd_j4(gmx_bool rdiag
, int ci
, int cj
)
933 return (rdiag
&& ci
== cj
? NBNXN_INTERACTION_MASK_DIAG
: NBNXN_INTERACTION_MASK_ALL
);
936 /* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
937 gmx_unused
static unsigned int get_imask_simd_j8(gmx_bool rdiag
, int ci
, int cj
)
939 return (rdiag
&& ci
== cj
*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
:
940 (rdiag
&& ci
== cj
*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
:
941 NBNXN_INTERACTION_MASK_ALL
));
945 #if GMX_SIMD_REAL_WIDTH == 2
946 #define get_imask_simd_4xn get_imask_simd_j2
948 #if GMX_SIMD_REAL_WIDTH == 4
949 #define get_imask_simd_4xn get_imask_simd_j4
951 #if GMX_SIMD_REAL_WIDTH == 8
952 #define get_imask_simd_4xn get_imask_simd_j8
953 #define get_imask_simd_2xnn get_imask_simd_j4
955 #if GMX_SIMD_REAL_WIDTH == 16
956 #define get_imask_simd_2xnn get_imask_simd_j8
960 /* Plain C code for checking and adding cluster-pairs to the list.
962 * \param[in] gridj The j-grid
963 * \param[in,out] nbl The pair-list to store the cluster pairs in
964 * \param[in] icluster The index of the i-cluster
965 * \param[in] jclusterFirst The first cluster in the j-range
966 * \param[in] jclusterLast The last cluster in the j-range
967 * \param[in] excludeSubDiagonal Exclude atom pairs with i-index > j-index
968 * \param[in] x_j Coordinates for the j-atom, in xyz format
969 * \param[in] rlist2 The squared list cut-off
970 * \param[in] rbb2 The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
971 * \param[in,out] numDistanceChecks The number of distance checks performed
974 makeClusterListSimple(const Grid
&jGrid
,
975 NbnxnPairlistCpu
* nbl
,
979 bool excludeSubDiagonal
,
980 const real
* gmx_restrict x_j
,
983 int * gmx_restrict numDistanceChecks
)
985 const BoundingBox
* gmx_restrict bb_ci
= nbl
->work
->iClusterData
.bb
.data();
986 const real
* gmx_restrict x_ci
= nbl
->work
->iClusterData
.x
.data();
991 while (!InRange
&& jclusterFirst
<= jclusterLast
)
993 real d2
= clusterBoundingBoxDistance2(bb_ci
[0], jGrid
.jBoundingBoxes()[jclusterFirst
]);
994 *numDistanceChecks
+= 2;
996 /* Check if the distance is within the distance where
997 * we use only the bounding box distance rbb,
998 * or within the cut-off and there is at least one atom pair
999 * within the cut-off.
1005 else if (d2
< rlist2
)
1007 int cjf_gl
= jGrid
.cellOffset() + jclusterFirst
;
1008 for (int i
= 0; i
< c_nbnxnCpuIClusterSize
&& !InRange
; i
++)
1010 for (int j
= 0; j
< c_nbnxnCpuIClusterSize
; j
++)
1012 InRange
= InRange
||
1013 (gmx::square(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjf_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+XX
]) +
1014 gmx::square(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjf_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+YY
]) +
1015 gmx::square(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjf_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+ZZ
]) < rlist2
);
1018 *numDistanceChecks
+= c_nbnxnCpuIClusterSize
*c_nbnxnCpuIClusterSize
;
1031 while (!InRange
&& jclusterLast
> jclusterFirst
)
1033 real d2
= clusterBoundingBoxDistance2(bb_ci
[0], jGrid
.jBoundingBoxes()[jclusterLast
]);
1034 *numDistanceChecks
+= 2;
1036 /* Check if the distance is within the distance where
1037 * we use only the bounding box distance rbb,
1038 * or within the cut-off and there is at least one atom pair
1039 * within the cut-off.
1045 else if (d2
< rlist2
)
1047 int cjl_gl
= jGrid
.cellOffset() + jclusterLast
;
1048 for (int i
= 0; i
< c_nbnxnCpuIClusterSize
&& !InRange
; i
++)
1050 for (int j
= 0; j
< c_nbnxnCpuIClusterSize
; j
++)
1052 InRange
= InRange
||
1053 (gmx::square(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjl_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+XX
]) +
1054 gmx::square(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjl_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+YY
]) +
1055 gmx::square(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjl_gl
*c_nbnxnCpuIClusterSize
+j
)*STRIDE_XYZ
+ZZ
]) < rlist2
);
1058 *numDistanceChecks
+= c_nbnxnCpuIClusterSize
*c_nbnxnCpuIClusterSize
;
1066 if (jclusterFirst
<= jclusterLast
)
1068 for (int jcluster
= jclusterFirst
; jcluster
<= jclusterLast
; jcluster
++)
1070 /* Store cj and the interaction mask */
1072 cjEntry
.cj
= jGrid
.cellOffset() + jcluster
;
1073 cjEntry
.excl
= get_imask(excludeSubDiagonal
, icluster
, jcluster
);
1074 nbl
->cj
.push_back(cjEntry
);
1076 /* Increase the closing index in the i list */
1077 nbl
->ci
.back().cj_ind_end
= nbl
->cj
.size();
1081 #ifdef GMX_NBNXN_SIMD_4XN
1082 #include "gromacs/nbnxm/pairlist_simd_4xm.h"
1084 #ifdef GMX_NBNXN_SIMD_2XNN
1085 #include "gromacs/nbnxm/pairlist_simd_2xmm.h"
1088 /* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
1089 * Checks bounding box distances and possibly atom pair distances.
1091 static void make_cluster_list_supersub(const Grid
&iGrid
,
1093 NbnxnPairlistGpu
*nbl
,
1096 const bool excludeSubDiagonal
,
1101 int *numDistanceChecks
)
1103 NbnxnPairlistGpuWork
&work
= *nbl
->work
;
1106 const float *pbb_ci
= work
.iSuperClusterData
.bbPacked
.data();
1108 const BoundingBox
*bb_ci
= work
.iSuperClusterData
.bb
.data();
1111 assert(c_nbnxnGpuClusterSize
== iGrid
.geometry().numAtomsICluster
);
1112 assert(c_nbnxnGpuClusterSize
== jGrid
.geometry().numAtomsICluster
);
1114 /* We generate the pairlist mainly based on bounding-box distances
1115 * and do atom pair distance based pruning on the GPU.
1116 * Only if a j-group contains a single cluster-pair, we try to prune
1117 * that pair based on atom distances on the CPU to avoid empty j-groups.
1119 #define PRUNE_LIST_CPU_ONE 1
1120 #define PRUNE_LIST_CPU_ALL 0
1122 #if PRUNE_LIST_CPU_ONE
1126 float *d2l
= work
.distanceBuffer
.data();
1128 for (int subc
= 0; subc
< jGrid
.numClustersPerCell()[scj
]; subc
++)
1130 const int cj4_ind
= work
.cj_ind
/c_nbnxnGpuJgroupSize
;
1131 const int cj_offset
= work
.cj_ind
- cj4_ind
*c_nbnxnGpuJgroupSize
;
1132 const int cj
= scj
*c_gpuNumClusterPerCell
+ subc
;
1134 const int cj_gl
= jGrid
.cellOffset()*c_gpuNumClusterPerCell
+ cj
;
1137 if (excludeSubDiagonal
&& sci
== scj
)
1143 ci1
= iGrid
.numClustersPerCell()[sci
];
1147 /* Determine all ci1 bb distances in one call with SIMD4 */
1148 const int offset
= packedBoundingBoxesIndex(cj
) + (cj
& (c_packedBoundingBoxesDimSize
- 1));
1149 clusterBoundingBoxDistance2_xxxx_simd4(jGrid
.packedBoundingBoxes().data() + offset
,
1151 *numDistanceChecks
+= c_nbnxnGpuClusterSize
*2;
1155 unsigned int imask
= 0;
1156 /* We use a fixed upper-bound instead of ci1 to help optimization */
1157 for (int ci
= 0; ci
< c_gpuNumClusterPerCell
; ci
++)
1165 /* Determine the bb distance between ci and cj */
1166 d2l
[ci
] = clusterBoundingBoxDistance2(bb_ci
[ci
], jGrid
.jBoundingBoxes()[cj
]);
1167 *numDistanceChecks
+= 2;
1171 #if PRUNE_LIST_CPU_ALL
1172 /* Check if the distance is within the distance where
1173 * we use only the bounding box distance rbb,
1174 * or within the cut-off and there is at least one atom pair
1175 * within the cut-off. This check is very costly.
1177 *numDistanceChecks
+= c_nbnxnGpuClusterSize
*c_nbnxnGpuClusterSize
;
1180 clusterpair_in_range(work
, ci
, cj_gl
, stride
, x
, rlist2
)))
1182 /* Check if the distance between the two bounding boxes
1183 * in within the pair-list cut-off.
1188 /* Flag this i-subcell to be taken into account */
1189 imask
|= (1U << (cj_offset
*c_gpuNumClusterPerCell
+ ci
));
1191 #if PRUNE_LIST_CPU_ONE
1199 #if PRUNE_LIST_CPU_ONE
1200 /* If we only found 1 pair, check if any atoms are actually
1201 * within the cut-off, so we could get rid of it.
1203 if (npair
== 1 && d2l
[ci_last
] >= rbb2
&&
1204 !clusterpair_in_range(work
, ci_last
, cj_gl
, stride
, x
, rlist2
))
1206 imask
&= ~(1U << (cj_offset
*c_gpuNumClusterPerCell
+ ci_last
));
1213 /* We have at least one cluster pair: add a j-entry */
1214 if (static_cast<size_t>(cj4_ind
) == nbl
->cj4
.size())
1216 nbl
->cj4
.resize(nbl
->cj4
.size() + 1);
1218 nbnxn_cj4_t
*cj4
= &nbl
->cj4
[cj4_ind
];
1220 cj4
->cj
[cj_offset
] = cj_gl
;
1222 /* Set the exclusions for the ci==sj entry.
1223 * Here we don't bother to check if this entry is actually flagged,
1224 * as it will nearly always be in the list.
1226 if (excludeSubDiagonal
&& sci
== scj
)
1228 setSelfAndNewtonExclusionsGpu(nbl
, cj4_ind
, cj_offset
, subc
);
1231 /* Copy the cluster interaction mask to the list */
1232 for (int w
= 0; w
< c_nbnxnGpuClusterpairSplit
; w
++)
1234 cj4
->imei
[w
].imask
|= imask
;
1237 nbl
->work
->cj_ind
++;
1239 /* Keep the count */
1240 nbl
->nci_tot
+= npair
;
1242 /* Increase the closing index in i super-cell list */
1243 nbl
->sci
.back().cj4_ind_end
=
1244 (nbl
->work
->cj_ind
+ c_nbnxnGpuJgroupSize
- 1)/c_nbnxnGpuJgroupSize
;
1249 /* Returns how many contiguous j-clusters we have starting in the i-list */
1250 template <typename CjListType
>
1251 static int numContiguousJClusters(const int cjIndexStart
,
1252 const int cjIndexEnd
,
1253 gmx::ArrayRef
<const CjListType
> cjList
)
1255 const int firstJCluster
= nblCj(cjList
, cjIndexStart
);
1257 int numContiguous
= 0;
1259 while (cjIndexStart
+ numContiguous
< cjIndexEnd
&&
1260 nblCj(cjList
, cjIndexStart
+ numContiguous
) == firstJCluster
+ numContiguous
)
1265 return numContiguous
;
1269 * \brief Helper struct for efficient searching for excluded atoms in a j-list
1273 /*! \brief Constructs a j-list range from \p cjList with the given index range */
1274 template <typename CjListType
>
1275 JListRanges(int cjIndexStart
,
1277 gmx::ArrayRef
<const CjListType
> cjList
);
1279 int cjIndexStart
; //!< The start index in the j-list
1280 int cjIndexEnd
; //!< The end index in the j-list
1281 int cjFirst
; //!< The j-cluster with index cjIndexStart
1282 int cjLast
; //!< The j-cluster with index cjIndexEnd-1
1283 int numDirect
; //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
1287 template <typename CjListType
>
1288 JListRanges::JListRanges(int cjIndexStart
,
1290 gmx::ArrayRef
<const CjListType
> cjList
) :
1291 cjIndexStart(cjIndexStart
),
1292 cjIndexEnd(cjIndexEnd
)
1294 GMX_ASSERT(cjIndexEnd
> cjIndexStart
, "JListRanges should only be called with non-empty lists");
1296 cjFirst
= nblCj(cjList
, cjIndexStart
);
1297 cjLast
= nblCj(cjList
, cjIndexEnd
- 1);
1299 /* Determine how many contiguous j-cells we have starting
1300 * from the first i-cell. This number can be used to directly
1301 * calculate j-cell indices for excluded atoms.
1303 numDirect
= numContiguousJClusters(cjIndexStart
, cjIndexEnd
, cjList
);
1307 /* Return the index of \p jCluster in the given range or -1 when not present
1309 * Note: This code is executed very often and therefore performance is
1310 * important. It should be inlined and fully optimized.
1312 template <typename CjListType
>
1314 findJClusterInJList(int jCluster
,
1315 const JListRanges
&ranges
,
1316 gmx::ArrayRef
<const CjListType
> cjList
)
1320 if (jCluster
< ranges
.cjFirst
+ ranges
.numDirect
)
1322 /* We can calculate the index directly using the offset */
1323 index
= ranges
.cjIndexStart
+ jCluster
- ranges
.cjFirst
;
1327 /* Search for jCluster using bisection */
1329 int rangeStart
= ranges
.cjIndexStart
+ ranges
.numDirect
;
1330 int rangeEnd
= ranges
.cjIndexEnd
;
1332 while (index
== -1 && rangeStart
< rangeEnd
)
1334 rangeMiddle
= (rangeStart
+ rangeEnd
) >> 1;
1336 const int clusterMiddle
= nblCj(cjList
, rangeMiddle
);
1338 if (jCluster
== clusterMiddle
)
1340 index
= rangeMiddle
;
1342 else if (jCluster
< clusterMiddle
)
1344 rangeEnd
= rangeMiddle
;
1348 rangeStart
= rangeMiddle
+ 1;
1356 // TODO: Get rid of the two functions below by renaming sci to ci (or something better)
1358 /* Return the i-entry in the list we are currently operating on */
1359 static nbnxn_ci_t
*getOpenIEntry(NbnxnPairlistCpu
*nbl
)
1361 return &nbl
->ci
.back();
1364 /* Return the i-entry in the list we are currently operating on */
1365 static nbnxn_sci_t
*getOpenIEntry(NbnxnPairlistGpu
*nbl
)
1367 return &nbl
->sci
.back();
1370 /* Set all atom-pair exclusions for a simple type list i-entry
1372 * Set all atom-pair exclusions from the topology stored in exclusions
1373 * as masks in the pair-list for simple list entry iEntry.
1376 setExclusionsForIEntry(const Nbnxm::GridSet
&gridSet
,
1377 NbnxnPairlistCpu
*nbl
,
1378 gmx_bool diagRemoved
,
1380 const nbnxn_ci_t
&iEntry
,
1381 const t_blocka
&exclusions
)
1383 if (iEntry
.cj_ind_end
== iEntry
.cj_ind_start
)
1385 /* Empty list: no exclusions */
1389 const JListRanges
ranges(iEntry
.cj_ind_start
, iEntry
.cj_ind_end
, gmx::makeConstArrayRef(nbl
->cj
));
1391 const int iCluster
= iEntry
.ci
;
1393 gmx::ArrayRef
<const int> cell
= gridSet
.cells();
1394 gmx::ArrayRef
<const int> atomIndices
= gridSet
.atomIndices();
1396 /* Loop over the atoms in the i-cluster */
1397 for (int i
= 0; i
< nbl
->na_ci
; i
++)
1399 const int iIndex
= iCluster
*nbl
->na_ci
+ i
;
1400 const int iAtom
= atomIndices
[iIndex
];
1403 /* Loop over the topology-based exclusions for this i-atom */
1404 for (int exclIndex
= exclusions
.index
[iAtom
]; exclIndex
< exclusions
.index
[iAtom
+ 1]; exclIndex
++)
1406 const int jAtom
= exclusions
.a
[exclIndex
];
1410 /* The self exclusion are already set, save some time */
1414 /* Get the index of the j-atom in the nbnxn atom data */
1415 const int jIndex
= cell
[jAtom
];
1417 /* Without shifts we only calculate interactions j>i
1418 * for one-way pair-lists.
1420 if (diagRemoved
&& jIndex
<= iIndex
)
1425 const int jCluster
= (jIndex
>> na_cj_2log
);
1427 /* Could the cluster se be in our list? */
1428 if (jCluster
>= ranges
.cjFirst
&& jCluster
<= ranges
.cjLast
)
1431 findJClusterInJList(jCluster
, ranges
,
1432 gmx::makeConstArrayRef(nbl
->cj
));
1436 /* We found an exclusion, clear the corresponding
1439 const int innerJ
= jIndex
- (jCluster
<< na_cj_2log
);
1441 nbl
->cj
[index
].excl
&= ~(1U << ((i
<< na_cj_2log
) + innerJ
));
1449 /* Add a new i-entry to the FEP list and copy the i-properties */
1450 static inline void fep_list_new_nri_copy(t_nblist
*nlist
)
1452 /* Add a new i-entry */
1455 assert(nlist
->nri
< nlist
->maxnri
);
1457 /* Duplicate the last i-entry, except for jindex, which continues */
1458 nlist
->iinr
[nlist
->nri
] = nlist
->iinr
[nlist
->nri
-1];
1459 nlist
->shift
[nlist
->nri
] = nlist
->shift
[nlist
->nri
-1];
1460 nlist
->gid
[nlist
->nri
] = nlist
->gid
[nlist
->nri
-1];
1461 nlist
->jindex
[nlist
->nri
] = nlist
->nrj
;
1464 /* Rellocate FEP list for size nl->maxnri, TODO: replace by C++ */
1465 static void reallocate_nblist(t_nblist
*nl
)
1469 fprintf(debug
, "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
1470 nl
->ielec
, nl
->ivdw
, nl
->igeometry
, nl
->type
, nl
->maxnri
);
1472 srenew(nl
->iinr
, nl
->maxnri
);
1473 srenew(nl
->gid
, nl
->maxnri
);
1474 srenew(nl
->shift
, nl
->maxnri
);
1475 srenew(nl
->jindex
, nl
->maxnri
+1);
1478 /* For load balancing of the free-energy lists over threads, we set
1479 * the maximum nrj size of an i-entry to 40. This leads to good
1480 * load balancing in the worst case scenario of a single perturbed
1481 * particle on 16 threads, while not introducing significant overhead.
1482 * Note that half of the perturbed pairs will anyhow end up in very small lists,
1483 * since non perturbed i-particles will see few perturbed j-particles).
1485 const int max_nrj_fep
= 40;
1487 /* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
1488 * singularities for overlapping particles (0/0), since the charges and
1489 * LJ parameters have been zeroed in the nbnxn data structure.
1490 * Simultaneously make a group pair list for the perturbed pairs.
1492 static void make_fep_list(gmx::ArrayRef
<const int> atomIndices
,
1493 const nbnxn_atomdata_t
*nbat
,
1494 NbnxnPairlistCpu
*nbl
,
1495 gmx_bool bDiagRemoved
,
1497 real gmx_unused shx
,
1498 real gmx_unused shy
,
1499 real gmx_unused shz
,
1500 real gmx_unused rlist_fep2
,
1505 int ci
, cj_ind_start
, cj_ind_end
, cja
, cjr
;
1507 int gid_i
= 0, gid_j
, gid
;
1508 int egp_shift
, egp_mask
;
1510 int ind_i
, ind_j
, ai
, aj
;
1512 gmx_bool bFEP_i
, bFEP_i_all
;
1514 if (nbl_ci
->cj_ind_end
== nbl_ci
->cj_ind_start
)
1522 cj_ind_start
= nbl_ci
->cj_ind_start
;
1523 cj_ind_end
= nbl_ci
->cj_ind_end
;
1525 /* In worst case we have alternating energy groups
1526 * and create #atom-pair lists, which means we need the size
1527 * of a cluster pair (na_ci*na_cj) times the number of cj's.
1529 nri_max
= nbl
->na_ci
*nbl
->na_cj
*(cj_ind_end
- cj_ind_start
);
1530 if (nlist
->nri
+ nri_max
> nlist
->maxnri
)
1532 nlist
->maxnri
= over_alloc_large(nlist
->nri
+ nri_max
);
1533 reallocate_nblist(nlist
);
1536 const int numAtomsJCluster
= jGrid
.geometry().numAtomsJCluster
;
1538 const nbnxn_atomdata_t::Params
&nbatParams
= nbat
->params();
1540 const int ngid
= nbatParams
.nenergrp
;
1542 /* TODO: Consider adding a check in grompp and changing this to an assert */
1543 const int numBitsInEnergyGroupIdsForAtomsInJCluster
= sizeof(gid_cj
)*8;
1544 if (ngid
*numAtomsJCluster
> numBitsInEnergyGroupIdsForAtomsInJCluster
)
1546 gmx_fatal(FARGS
, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
1547 iGrid
.geometry().numAtomsICluster
, numAtomsJCluster
,
1548 (sizeof(gid_cj
)*8)/numAtomsJCluster
);
1551 egp_shift
= nbatParams
.neg_2log
;
1552 egp_mask
= (1 << egp_shift
) - 1;
1554 /* Loop over the atoms in the i sub-cell */
1556 for (int i
= 0; i
< nbl
->na_ci
; i
++)
1558 ind_i
= ci
*nbl
->na_ci
+ i
;
1559 ai
= atomIndices
[ind_i
];
1563 nlist
->jindex
[nri
+1] = nlist
->jindex
[nri
];
1564 nlist
->iinr
[nri
] = ai
;
1565 /* The actual energy group pair index is set later */
1566 nlist
->gid
[nri
] = 0;
1567 nlist
->shift
[nri
] = nbl_ci
->shift
& NBNXN_CI_SHIFT
;
1569 bFEP_i
= iGrid
.atomIsPerturbed(ci
- iGrid
.cellOffset(), i
);
1571 bFEP_i_all
= bFEP_i_all
&& bFEP_i
;
1573 if (nlist
->nrj
+ (cj_ind_end
- cj_ind_start
)*nbl
->na_cj
> nlist
->maxnrj
)
1575 nlist
->maxnrj
= over_alloc_small(nlist
->nrj
+ (cj_ind_end
- cj_ind_start
)*nbl
->na_cj
);
1576 srenew(nlist
->jjnr
, nlist
->maxnrj
);
1577 srenew(nlist
->excl_fep
, nlist
->maxnrj
);
1582 gid_i
= (nbatParams
.energrp
[ci
] >> (egp_shift
*i
)) & egp_mask
;
1585 for (int cj_ind
= cj_ind_start
; cj_ind
< cj_ind_end
; cj_ind
++)
1587 unsigned int fep_cj
;
1589 cja
= nbl
->cj
[cj_ind
].cj
;
1591 if (numAtomsJCluster
== jGrid
.geometry().numAtomsICluster
)
1593 cjr
= cja
- jGrid
.cellOffset();
1594 fep_cj
= jGrid
.fepBits(cjr
);
1597 gid_cj
= nbatParams
.energrp
[cja
];
1600 else if (2*numAtomsJCluster
== jGrid
.geometry().numAtomsICluster
)
1602 cjr
= cja
- jGrid
.cellOffset()*2;
1603 /* Extract half of the ci fep/energrp mask */
1604 fep_cj
= (jGrid
.fepBits(cjr
>> 1) >> ((cjr
& 1)*numAtomsJCluster
)) & ((1 << numAtomsJCluster
) - 1);
1607 gid_cj
= nbatParams
.energrp
[cja
>> 1] >> ((cja
& 1)*numAtomsJCluster
*egp_shift
) & ((1 << (numAtomsJCluster
*egp_shift
)) - 1);
1612 cjr
= cja
- (jGrid
.cellOffset() >> 1);
1613 /* Combine two ci fep masks/energrp */
1614 fep_cj
= jGrid
.fepBits(cjr
*2) + (jGrid
.fepBits(cjr
*2 + 1) << jGrid
.geometry().numAtomsICluster
);
1617 gid_cj
= nbatParams
.energrp
[cja
*2] + (nbatParams
.energrp
[cja
*2+1] << (jGrid
.geometry().numAtomsICluster
*egp_shift
));
1621 if (bFEP_i
|| fep_cj
!= 0)
1623 for (int j
= 0; j
< nbl
->na_cj
; j
++)
1625 /* Is this interaction perturbed and not excluded? */
1626 ind_j
= cja
*nbl
->na_cj
+ j
;
1627 aj
= atomIndices
[ind_j
];
1629 (bFEP_i
|| (fep_cj
& (1 << j
))) &&
1630 (!bDiagRemoved
|| ind_j
>= ind_i
))
1634 gid_j
= (gid_cj
>> (j
*egp_shift
)) & egp_mask
;
1635 gid
= GID(gid_i
, gid_j
, ngid
);
1637 if (nlist
->nrj
> nlist
->jindex
[nri
] &&
1638 nlist
->gid
[nri
] != gid
)
1640 /* Energy group pair changed: new list */
1641 fep_list_new_nri_copy(nlist
);
1644 nlist
->gid
[nri
] = gid
;
1647 if (nlist
->nrj
- nlist
->jindex
[nri
] >= max_nrj_fep
)
1649 fep_list_new_nri_copy(nlist
);
1653 /* Add it to the FEP list */
1654 nlist
->jjnr
[nlist
->nrj
] = aj
;
1655 nlist
->excl_fep
[nlist
->nrj
] = (nbl
->cj
[cj_ind
].excl
>> (i
*nbl
->na_cj
+ j
)) & 1;
1658 /* Exclude it from the normal list.
1659 * Note that the charge has been set to zero,
1660 * but we need to avoid 0/0, as perturbed atoms
1661 * can be on top of each other.
1663 nbl
->cj
[cj_ind
].excl
&= ~(1U << (i
*nbl
->na_cj
+ j
));
1669 if (nlist
->nrj
> nlist
->jindex
[nri
])
1671 /* Actually add this new, non-empty, list */
1673 nlist
->jindex
[nlist
->nri
] = nlist
->nrj
;
1680 /* All interactions are perturbed, we can skip this entry */
1681 nbl_ci
->cj_ind_end
= cj_ind_start
;
1682 nbl
->ncjInUse
-= cj_ind_end
- cj_ind_start
;
1686 /* Return the index of atom a within a cluster */
1687 static inline int cj_mod_cj4(int cj
)
1689 return cj
& (c_nbnxnGpuJgroupSize
- 1);
1692 /* Convert a j-cluster to a cj4 group */
1693 static inline int cj_to_cj4(int cj
)
1695 return cj
/c_nbnxnGpuJgroupSize
;
1698 /* Return the index of an j-atom within a warp */
1699 static inline int a_mod_wj(int a
)
1701 return a
& (c_nbnxnGpuClusterSize
/c_nbnxnGpuClusterpairSplit
- 1);
1704 /* As make_fep_list above, but for super/sub lists. */
1705 static void make_fep_list(gmx::ArrayRef
<const int> atomIndices
,
1706 const nbnxn_atomdata_t
*nbat
,
1707 NbnxnPairlistGpu
*nbl
,
1708 gmx_bool bDiagRemoved
,
1709 const nbnxn_sci_t
*nbl_sci
,
1720 int ind_i
, ind_j
, ai
, aj
;
1724 const nbnxn_cj4_t
*cj4
;
1726 const int numJClusterGroups
= nbl_sci
->numJClusterGroups();
1727 if (numJClusterGroups
== 0)
1733 const int sci
= nbl_sci
->sci
;
1735 const int cj4_ind_start
= nbl_sci
->cj4_ind_start
;
1736 const int cj4_ind_end
= nbl_sci
->cj4_ind_end
;
1738 /* Here we process one super-cell, max #atoms na_sc, versus a list
1739 * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
1740 * of size na_cj atoms.
1741 * On the GPU we don't support energy groups (yet).
1742 * So for each of the na_sc i-atoms, we need max one FEP list
1743 * for each max_nrj_fep j-atoms.
1745 nri_max
= nbl
->na_sc
*nbl
->na_cj
*(1 + (numJClusterGroups
*c_nbnxnGpuJgroupSize
)/max_nrj_fep
);
1746 if (nlist
->nri
+ nri_max
> nlist
->maxnri
)
1748 nlist
->maxnri
= over_alloc_large(nlist
->nri
+ nri_max
);
1749 reallocate_nblist(nlist
);
1752 /* Loop over the atoms in the i super-cluster */
1753 for (int c
= 0; c
< c_gpuNumClusterPerCell
; c
++)
1755 c_abs
= sci
*c_gpuNumClusterPerCell
+ c
;
1757 for (int i
= 0; i
< nbl
->na_ci
; i
++)
1759 ind_i
= c_abs
*nbl
->na_ci
+ i
;
1760 ai
= atomIndices
[ind_i
];
1764 nlist
->jindex
[nri
+1] = nlist
->jindex
[nri
];
1765 nlist
->iinr
[nri
] = ai
;
1766 /* With GPUs, energy groups are not supported */
1767 nlist
->gid
[nri
] = 0;
1768 nlist
->shift
[nri
] = nbl_sci
->shift
& NBNXN_CI_SHIFT
;
1770 bFEP_i
= iGrid
.atomIsPerturbed(c_abs
- iGrid
.cellOffset()*c_gpuNumClusterPerCell
, i
);
1772 xi
= nbat
->x()[ind_i
*nbat
->xstride
+XX
] + shx
;
1773 yi
= nbat
->x()[ind_i
*nbat
->xstride
+YY
] + shy
;
1774 zi
= nbat
->x()[ind_i
*nbat
->xstride
+ZZ
] + shz
;
1776 const int nrjMax
= nlist
->nrj
+ numJClusterGroups
*c_nbnxnGpuJgroupSize
*nbl
->na_cj
;
1777 if (nrjMax
> nlist
->maxnrj
)
1779 nlist
->maxnrj
= over_alloc_small(nrjMax
);
1780 srenew(nlist
->jjnr
, nlist
->maxnrj
);
1781 srenew(nlist
->excl_fep
, nlist
->maxnrj
);
1784 for (int cj4_ind
= cj4_ind_start
; cj4_ind
< cj4_ind_end
; cj4_ind
++)
1786 cj4
= &nbl
->cj4
[cj4_ind
];
1788 for (int gcj
= 0; gcj
< c_nbnxnGpuJgroupSize
; gcj
++)
1790 if ((cj4
->imei
[0].imask
& (1U << (gcj
*c_gpuNumClusterPerCell
+ c
))) == 0)
1792 /* Skip this ci for this cj */
1797 cj4
->cj
[gcj
] - jGrid
.cellOffset()*c_gpuNumClusterPerCell
;
1799 if (bFEP_i
|| jGrid
.clusterIsPerturbed(cjr
))
1801 for (int j
= 0; j
< nbl
->na_cj
; j
++)
1803 /* Is this interaction perturbed and not excluded? */
1804 ind_j
= (jGrid
.cellOffset()*c_gpuNumClusterPerCell
+ cjr
)*nbl
->na_cj
+ j
;
1805 aj
= atomIndices
[ind_j
];
1807 (bFEP_i
|| jGrid
.atomIsPerturbed(cjr
, j
)) &&
1808 (!bDiagRemoved
|| ind_j
>= ind_i
))
1811 unsigned int excl_bit
;
1814 const int jHalf
= j
/(c_nbnxnGpuClusterSize
/c_nbnxnGpuClusterpairSplit
);
1815 nbnxn_excl_t
&excl
=
1816 get_exclusion_mask(nbl
, cj4_ind
, jHalf
);
1818 excl_pair
= a_mod_wj(j
)*nbl
->na_ci
+ i
;
1819 excl_bit
= (1U << (gcj
*c_gpuNumClusterPerCell
+ c
));
1821 dx
= nbat
->x()[ind_j
*nbat
->xstride
+XX
] - xi
;
1822 dy
= nbat
->x()[ind_j
*nbat
->xstride
+YY
] - yi
;
1823 dz
= nbat
->x()[ind_j
*nbat
->xstride
+ZZ
] - zi
;
1825 /* The unpruned GPU list has more than 2/3
1826 * of the atom pairs beyond rlist. Using
1827 * this list will cause a lot of overhead
1828 * in the CPU FEP kernels, especially
1829 * relative to the fast GPU kernels.
1830 * So we prune the FEP list here.
1832 if (dx
*dx
+ dy
*dy
+ dz
*dz
< rlist_fep2
)
1834 if (nlist
->nrj
- nlist
->jindex
[nri
] >= max_nrj_fep
)
1836 fep_list_new_nri_copy(nlist
);
1840 /* Add it to the FEP list */
1841 nlist
->jjnr
[nlist
->nrj
] = aj
;
1842 nlist
->excl_fep
[nlist
->nrj
] = (excl
.pair
[excl_pair
] & excl_bit
) ? 1 : 0;
1846 /* Exclude it from the normal list.
1847 * Note that the charge and LJ parameters have
1848 * been set to zero, but we need to avoid 0/0,
1849 * as perturbed atoms can be on top of each other.
1851 excl
.pair
[excl_pair
] &= ~excl_bit
;
1855 /* Note that we could mask out this pair in imask
1856 * if all i- and/or all j-particles are perturbed.
1857 * But since the perturbed pairs on the CPU will
1858 * take an order of magnitude more time, the GPU
1859 * will finish before the CPU and there is no gain.
1865 if (nlist
->nrj
> nlist
->jindex
[nri
])
1867 /* Actually add this new, non-empty, list */
1869 nlist
->jindex
[nlist
->nri
] = nlist
->nrj
;
1876 /* Set all atom-pair exclusions for a GPU type list i-entry
1878 * Sets all atom-pair exclusions from the topology stored in exclusions
1879 * as masks in the pair-list for i-super-cluster list entry iEntry.
1882 setExclusionsForIEntry(const Nbnxm::GridSet
&gridSet
,
1883 NbnxnPairlistGpu
*nbl
,
1884 gmx_bool diagRemoved
,
1885 int gmx_unused na_cj_2log
,
1886 const nbnxn_sci_t
&iEntry
,
1887 const t_blocka
&exclusions
)
1889 if (iEntry
.numJClusterGroups() == 0)
1895 /* Set the search ranges using start and end j-cluster indices.
1896 * Note that here we can not use cj4_ind_end, since the last cj4
1897 * can be only partially filled, so we use cj_ind.
1899 const JListRanges
ranges(iEntry
.cj4_ind_start
*c_nbnxnGpuJgroupSize
,
1901 gmx::makeConstArrayRef(nbl
->cj4
));
1903 GMX_ASSERT(nbl
->na_ci
== c_nbnxnGpuClusterSize
, "na_ci should match the GPU cluster size");
1904 constexpr int c_clusterSize
= c_nbnxnGpuClusterSize
;
1905 constexpr int c_superClusterSize
= c_nbnxnGpuNumClusterPerSupercluster
*c_nbnxnGpuClusterSize
;
1907 const int iSuperCluster
= iEntry
.sci
;
1909 gmx::ArrayRef
<const int> atomIndices
= gridSet
.atomIndices();
1910 gmx::ArrayRef
<const int> cell
= gridSet
.cells();
1912 /* Loop over the atoms in the i super-cluster */
1913 for (int i
= 0; i
< c_superClusterSize
; i
++)
1915 const int iIndex
= iSuperCluster
*c_superClusterSize
+ i
;
1916 const int iAtom
= atomIndices
[iIndex
];
1919 const int iCluster
= i
/c_clusterSize
;
1921 /* Loop over the topology-based exclusions for this i-atom */
1922 for (int exclIndex
= exclusions
.index
[iAtom
]; exclIndex
< exclusions
.index
[iAtom
+ 1]; exclIndex
++)
1924 const int jAtom
= exclusions
.a
[exclIndex
];
1928 /* The self exclusions are already set, save some time */
1932 /* Get the index of the j-atom in the nbnxn atom data */
1933 const int jIndex
= cell
[jAtom
];
1935 /* Without shifts we only calculate interactions j>i
1936 * for one-way pair-lists.
1938 /* NOTE: We would like to use iIndex on the right hand side,
1939 * but that makes this routine 25% slower with gcc6/7.
1940 * Even using c_superClusterSize makes it slower.
1941 * Either of these changes triggers peeling of the exclIndex
1942 * loop, which apparently leads to far less efficient code.
1944 if (diagRemoved
&& jIndex
<= iSuperCluster
*nbl
->na_sc
+ i
)
1949 const int jCluster
= jIndex
/c_clusterSize
;
1951 /* Check whether the cluster is in our list? */
1952 if (jCluster
>= ranges
.cjFirst
&& jCluster
<= ranges
.cjLast
)
1955 findJClusterInJList(jCluster
, ranges
,
1956 gmx::makeConstArrayRef(nbl
->cj4
));
1960 /* We found an exclusion, clear the corresponding
1963 const unsigned int pairMask
= (1U << (cj_mod_cj4(index
)*c_gpuNumClusterPerCell
+ iCluster
));
1964 /* Check if the i-cluster interacts with the j-cluster */
1965 if (nbl_imask0(nbl
, index
) & pairMask
)
1967 const int innerI
= (i
& (c_clusterSize
- 1));
1968 const int innerJ
= (jIndex
& (c_clusterSize
- 1));
1970 /* Determine which j-half (CUDA warp) we are in */
1971 const int jHalf
= innerJ
/(c_clusterSize
/c_nbnxnGpuClusterpairSplit
);
1973 nbnxn_excl_t
&interactionMask
=
1974 get_exclusion_mask(nbl
, cj_to_cj4(index
), jHalf
);
1976 interactionMask
.pair
[a_mod_wj(innerJ
)*c_clusterSize
+ innerI
] &= ~pairMask
;
1985 /* Make a new ci entry at the back of nbl->ci */
1986 static void addNewIEntry(NbnxnPairlistCpu
*nbl
, int ci
, int shift
, int flags
)
1990 ciEntry
.shift
= shift
;
1991 /* Store the interaction flags along with the shift */
1992 ciEntry
.shift
|= flags
;
1993 ciEntry
.cj_ind_start
= nbl
->cj
.size();
1994 ciEntry
.cj_ind_end
= nbl
->cj
.size();
1995 nbl
->ci
.push_back(ciEntry
);
1998 /* Make a new sci entry at index nbl->nsci */
1999 static void addNewIEntry(NbnxnPairlistGpu
*nbl
, int sci
, int shift
, int gmx_unused flags
)
2001 nbnxn_sci_t sciEntry
;
2003 sciEntry
.shift
= shift
;
2004 sciEntry
.cj4_ind_start
= nbl
->cj4
.size();
2005 sciEntry
.cj4_ind_end
= nbl
->cj4
.size();
2007 nbl
->sci
.push_back(sciEntry
);
2010 /* Sort the simple j-list cj on exclusions.
2011 * Entries with exclusions will all be sorted to the beginning of the list.
2013 static void sort_cj_excl(nbnxn_cj_t
*cj
, int ncj
,
2014 NbnxnPairlistCpuWork
*work
)
2016 work
->cj
.resize(ncj
);
2018 /* Make a list of the j-cells involving exclusions */
2020 for (int j
= 0; j
< ncj
; j
++)
2022 if (cj
[j
].excl
!= NBNXN_INTERACTION_MASK_ALL
)
2024 work
->cj
[jnew
++] = cj
[j
];
2027 /* Check if there are exclusions at all or not just the first entry */
2028 if (!((jnew
== 0) ||
2029 (jnew
== 1 && cj
[0].excl
!= NBNXN_INTERACTION_MASK_ALL
)))
2031 for (int j
= 0; j
< ncj
; j
++)
2033 if (cj
[j
].excl
== NBNXN_INTERACTION_MASK_ALL
)
2035 work
->cj
[jnew
++] = cj
[j
];
2038 for (int j
= 0; j
< ncj
; j
++)
2040 cj
[j
] = work
->cj
[j
];
2045 /* Close this simple list i entry */
2046 static void closeIEntry(NbnxnPairlistCpu
*nbl
,
2047 int gmx_unused sp_max_av
,
2048 gmx_bool gmx_unused progBal
,
2049 float gmx_unused nsp_tot_est
,
2050 int gmx_unused thread
,
2051 int gmx_unused nthread
)
2053 nbnxn_ci_t
&ciEntry
= nbl
->ci
.back();
2055 /* All content of the new ci entry have already been filled correctly,
2056 * we only need to sort and increase counts or remove the entry when empty.
2058 const int jlen
= ciEntry
.cj_ind_end
- ciEntry
.cj_ind_start
;
2061 sort_cj_excl(nbl
->cj
.data() + ciEntry
.cj_ind_start
, jlen
, nbl
->work
.get());
2063 /* The counts below are used for non-bonded pair/flop counts
2064 * and should therefore match the available kernel setups.
2066 if (!(ciEntry
.shift
& NBNXN_CI_DO_COUL(0)))
2068 nbl
->work
->ncj_noq
+= jlen
;
2070 else if ((ciEntry
.shift
& NBNXN_CI_HALF_LJ(0)) ||
2071 !(ciEntry
.shift
& NBNXN_CI_DO_LJ(0)))
2073 nbl
->work
->ncj_hlj
+= jlen
;
2078 /* Entry is empty: remove it */
2083 /* Split sci entry for load balancing on the GPU.
2084 * Splitting ensures we have enough lists to fully utilize the whole GPU.
2085 * With progBal we generate progressively smaller lists, which improves
2086 * load balancing. As we only know the current count on our own thread,
2087 * we will need to estimate the current total amount of i-entries.
2088 * As the lists get concatenated later, this estimate depends
2089 * both on nthread and our own thread index.
2091 static void split_sci_entry(NbnxnPairlistGpu
*nbl
,
2093 gmx_bool progBal
, float nsp_tot_est
,
2094 int thread
, int nthread
)
2102 /* Estimate the total numbers of ci's of the nblist combined
2103 * over all threads using the target number of ci's.
2105 nsp_est
= (nsp_tot_est
*thread
)/nthread
+ nbl
->nci_tot
;
2107 /* The first ci blocks should be larger, to avoid overhead.
2108 * The last ci blocks should be smaller, to improve load balancing.
2109 * The factor 3/2 makes the first block 3/2 times the target average
2110 * and ensures that the total number of blocks end up equal to
2111 * that of equally sized blocks of size nsp_target_av.
2113 nsp_max
= static_cast<int>(nsp_target_av
*(nsp_tot_est
*1.5/(nsp_est
+ nsp_tot_est
)));
2117 nsp_max
= nsp_target_av
;
2120 const int cj4_start
= nbl
->sci
.back().cj4_ind_start
;
2121 const int cj4_end
= nbl
->sci
.back().cj4_ind_end
;
2122 const int j4len
= cj4_end
- cj4_start
;
2124 if (j4len
> 1 && j4len
*c_gpuNumClusterPerCell
*c_nbnxnGpuJgroupSize
> nsp_max
)
2126 /* Modify the last ci entry and process the cj4's again */
2132 for (int cj4
= cj4_start
; cj4
< cj4_end
; cj4
++)
2134 int nsp_cj4_p
= nsp_cj4
;
2135 /* Count the number of cluster pairs in this cj4 group */
2137 for (int p
= 0; p
< c_gpuNumClusterPerCell
*c_nbnxnGpuJgroupSize
; p
++)
2139 nsp_cj4
+= (nbl
->cj4
[cj4
].imei
[0].imask
>> p
) & 1;
2142 /* If adding the current cj4 with nsp_cj4 pairs get us further
2143 * away from our target nsp_max, split the list before this cj4.
2145 if (nsp
> 0 && nsp_max
- nsp
< nsp
+ nsp_cj4
- nsp_max
)
2147 /* Split the list at cj4 */
2148 nbl
->sci
.back().cj4_ind_end
= cj4
;
2149 /* Create a new sci entry */
2151 sciNew
.sci
= nbl
->sci
.back().sci
;
2152 sciNew
.shift
= nbl
->sci
.back().shift
;
2153 sciNew
.cj4_ind_start
= cj4
;
2154 nbl
->sci
.push_back(sciNew
);
2157 nsp_cj4_e
= nsp_cj4_p
;
2163 /* Put the remaining cj4's in the last sci entry */
2164 nbl
->sci
.back().cj4_ind_end
= cj4_end
;
2166 /* Possibly balance out the last two sci's
2167 * by moving the last cj4 of the second last sci.
2169 if (nsp_sci
- nsp_cj4_e
>= nsp
+ nsp_cj4_e
)
2171 GMX_ASSERT(nbl
->sci
.size() >= 2, "We expect at least two elements");
2172 nbl
->sci
[nbl
->sci
.size() - 2].cj4_ind_end
--;
2173 nbl
->sci
[nbl
->sci
.size() - 1].cj4_ind_start
--;
2178 /* Clost this super/sub list i entry */
2179 static void closeIEntry(NbnxnPairlistGpu
*nbl
,
2181 gmx_bool progBal
, float nsp_tot_est
,
2182 int thread
, int nthread
)
2184 nbnxn_sci_t
&sciEntry
= *getOpenIEntry(nbl
);
2186 /* All content of the new ci entry have already been filled correctly,
2187 * we only need to, potentially, split or remove the entry when empty.
2189 int j4len
= sciEntry
.numJClusterGroups();
2192 /* We can only have complete blocks of 4 j-entries in a list,
2193 * so round the count up before closing.
2195 int ncj4
= (nbl
->work
->cj_ind
+ c_nbnxnGpuJgroupSize
- 1)/c_nbnxnGpuJgroupSize
;
2196 nbl
->work
->cj_ind
= ncj4
*c_nbnxnGpuJgroupSize
;
2200 /* Measure the size of the new entry and potentially split it */
2201 split_sci_entry(nbl
, nsp_max_av
, progBal
, nsp_tot_est
,
2207 /* Entry is empty: remove it */
2208 nbl
->sci
.pop_back();
2212 /* Syncs the working array before adding another grid pair to the GPU list */
2213 static void sync_work(NbnxnPairlistCpu gmx_unused
*nbl
)
2217 /* Syncs the working array before adding another grid pair to the GPU list */
2218 static void sync_work(NbnxnPairlistGpu
*nbl
)
2220 nbl
->work
->cj_ind
= nbl
->cj4
.size()*c_nbnxnGpuJgroupSize
;
2223 /* Clears an NbnxnPairlistCpu data structure */
2224 static void clear_pairlist(NbnxnPairlistCpu
*nbl
)
2230 nbl
->ciOuter
.clear();
2231 nbl
->cjOuter
.clear();
2233 nbl
->work
->ncj_noq
= 0;
2234 nbl
->work
->ncj_hlj
= 0;
2237 /* Clears an NbnxnPairlistGpu data structure */
2238 static void clear_pairlist(NbnxnPairlistGpu
*nbl
)
2242 nbl
->excl
.resize(1);
2246 /* Clears a group scheme pair list */
2247 static void clear_pairlist_fep(t_nblist
*nl
)
2251 if (nl
->jindex
== nullptr)
2253 snew(nl
->jindex
, 1);
2258 /* Sets a simple list i-cell bounding box, including PBC shift */
2259 static inline void set_icell_bb_simple(gmx::ArrayRef
<const BoundingBox
> bb
,
2261 real shx
, real shy
, real shz
,
2264 bb_ci
->lower
.x
= bb
[ci
].lower
.x
+ shx
;
2265 bb_ci
->lower
.y
= bb
[ci
].lower
.y
+ shy
;
2266 bb_ci
->lower
.z
= bb
[ci
].lower
.z
+ shz
;
2267 bb_ci
->upper
.x
= bb
[ci
].upper
.x
+ shx
;
2268 bb_ci
->upper
.y
= bb
[ci
].upper
.y
+ shy
;
2269 bb_ci
->upper
.z
= bb
[ci
].upper
.z
+ shz
;
2272 /* Sets a simple list i-cell bounding box, including PBC shift */
2273 static inline void set_icell_bb(const Grid
&iGrid
,
2275 real shx
, real shy
, real shz
,
2276 NbnxnPairlistCpuWork
*work
)
2278 set_icell_bb_simple(iGrid
.iBoundingBoxes(), ci
, shx
, shy
, shz
,
2279 &work
->iClusterData
.bb
[0]);
2283 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2284 static void set_icell_bbxxxx_supersub(gmx::ArrayRef
<const float> bb
,
2286 real shx
, real shy
, real shz
,
2289 constexpr int cellBBStride
= packedBoundingBoxesIndex(c_gpuNumClusterPerCell
);
2290 constexpr int pbbStride
= c_packedBoundingBoxesDimSize
;
2291 const int ia
= ci
*cellBBStride
;
2292 for (int m
= 0; m
< cellBBStride
; m
+= c_packedBoundingBoxesSize
)
2294 for (int i
= 0; i
< pbbStride
; i
++)
2296 bb_ci
[m
+ 0*pbbStride
+ i
] = bb
[ia
+ m
+ 0*pbbStride
+ i
] + shx
;
2297 bb_ci
[m
+ 1*pbbStride
+ i
] = bb
[ia
+ m
+ 1*pbbStride
+ i
] + shy
;
2298 bb_ci
[m
+ 2*pbbStride
+ i
] = bb
[ia
+ m
+ 2*pbbStride
+ i
] + shz
;
2299 bb_ci
[m
+ 3*pbbStride
+ i
] = bb
[ia
+ m
+ 3*pbbStride
+ i
] + shx
;
2300 bb_ci
[m
+ 4*pbbStride
+ i
] = bb
[ia
+ m
+ 4*pbbStride
+ i
] + shy
;
2301 bb_ci
[m
+ 5*pbbStride
+ i
] = bb
[ia
+ m
+ 5*pbbStride
+ i
] + shz
;
2307 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2308 gmx_unused
static void set_icell_bb_supersub(gmx::ArrayRef
<const BoundingBox
> bb
,
2310 real shx
, real shy
, real shz
,
2313 for (int i
= 0; i
< c_gpuNumClusterPerCell
; i
++)
2315 set_icell_bb_simple(bb
, ci
*c_gpuNumClusterPerCell
+i
,
2321 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2322 gmx_unused
static void set_icell_bb(const Grid
&iGrid
,
2324 real shx
, real shy
, real shz
,
2325 NbnxnPairlistGpuWork
*work
)
2328 set_icell_bbxxxx_supersub(iGrid
.packedBoundingBoxes(), ci
, shx
, shy
, shz
,
2329 work
->iSuperClusterData
.bbPacked
.data());
2331 set_icell_bb_supersub(iGrid
.iBoundingBoxes(), ci
, shx
, shy
, shz
,
2332 work
->iSuperClusterData
.bb
.data());
2336 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
2337 static void icell_set_x_simple(int ci
,
2338 real shx
, real shy
, real shz
,
2339 int stride
, const real
*x
,
2340 NbnxnPairlistCpuWork::IClusterData
*iClusterData
)
2342 const int ia
= ci
*c_nbnxnCpuIClusterSize
;
2344 for (int i
= 0; i
< c_nbnxnCpuIClusterSize
; i
++)
2346 iClusterData
->x
[i
*STRIDE_XYZ
+XX
] = x
[(ia
+i
)*stride
+XX
] + shx
;
2347 iClusterData
->x
[i
*STRIDE_XYZ
+YY
] = x
[(ia
+i
)*stride
+YY
] + shy
;
2348 iClusterData
->x
[i
*STRIDE_XYZ
+ZZ
] = x
[(ia
+i
)*stride
+ZZ
] + shz
;
2352 static void icell_set_x(int ci
,
2353 real shx
, real shy
, real shz
,
2354 int stride
, const real
*x
,
2355 const ClusterDistanceKernelType kernelType
,
2356 NbnxnPairlistCpuWork
*work
)
2361 #ifdef GMX_NBNXN_SIMD_4XN
2362 case ClusterDistanceKernelType::CpuSimd_4xM
:
2363 icell_set_x_simd_4xn(ci
, shx
, shy
, shz
, stride
, x
, work
);
2366 #ifdef GMX_NBNXN_SIMD_2XNN
2367 case ClusterDistanceKernelType::CpuSimd_2xMM
:
2368 icell_set_x_simd_2xnn(ci
, shx
, shy
, shz
, stride
, x
, work
);
2372 case ClusterDistanceKernelType::CpuPlainC
:
2373 icell_set_x_simple(ci
, shx
, shy
, shz
, stride
, x
, &work
->iClusterData
);
2376 GMX_ASSERT(false, "Unhandled case");
2381 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
2382 static void icell_set_x(int ci
,
2383 real shx
, real shy
, real shz
,
2384 int stride
, const real
*x
,
2385 ClusterDistanceKernelType gmx_unused kernelType
,
2386 NbnxnPairlistGpuWork
*work
)
2388 #if !GMX_SIMD4_HAVE_REAL
2390 real
* x_ci
= work
->iSuperClusterData
.x
.data();
2392 int ia
= ci
*c_gpuNumClusterPerCell
*c_nbnxnGpuClusterSize
;
2393 for (int i
= 0; i
< c_gpuNumClusterPerCell
*c_nbnxnGpuClusterSize
; i
++)
2395 x_ci
[i
*DIM
+ XX
] = x
[(ia
+i
)*stride
+ XX
] + shx
;
2396 x_ci
[i
*DIM
+ YY
] = x
[(ia
+i
)*stride
+ YY
] + shy
;
2397 x_ci
[i
*DIM
+ ZZ
] = x
[(ia
+i
)*stride
+ ZZ
] + shz
;
2400 #else /* !GMX_SIMD4_HAVE_REAL */
2402 real
* x_ci
= work
->iSuperClusterData
.xSimd
.data();
2404 for (int si
= 0; si
< c_gpuNumClusterPerCell
; si
++)
2406 for (int i
= 0; i
< c_nbnxnGpuClusterSize
; i
+= GMX_SIMD4_WIDTH
)
2408 int io
= si
*c_nbnxnGpuClusterSize
+ i
;
2409 int ia
= ci
*c_gpuNumClusterPerCell
*c_nbnxnGpuClusterSize
+ io
;
2410 for (int j
= 0; j
< GMX_SIMD4_WIDTH
; j
++)
2412 x_ci
[io
*DIM
+ j
+ XX
*GMX_SIMD4_WIDTH
] = x
[(ia
+ j
)*stride
+ XX
] + shx
;
2413 x_ci
[io
*DIM
+ j
+ YY
*GMX_SIMD4_WIDTH
] = x
[(ia
+ j
)*stride
+ YY
] + shy
;
2414 x_ci
[io
*DIM
+ j
+ ZZ
*GMX_SIMD4_WIDTH
] = x
[(ia
+ j
)*stride
+ ZZ
] + shz
;
2419 #endif /* !GMX_SIMD4_HAVE_REAL */
2422 static real
minimum_subgrid_size_xy(const Grid
&grid
)
2424 const Grid::Dimensions
&dims
= grid
.dimensions();
2426 if (grid
.geometry().isSimple
)
2428 return std::min(dims
.cellSize
[XX
], dims
.cellSize
[YY
]);
2432 return std::min(dims
.cellSize
[XX
]/c_gpuNumClusterPerCellX
,
2433 dims
.cellSize
[YY
]/c_gpuNumClusterPerCellY
);
2437 static real
effective_buffer_1x1_vs_MxN(const Grid
&iGrid
,
2440 const real eff_1x1_buffer_fac_overest
= 0.1;
2442 /* Determine an atom-pair list cut-off buffer size for atom pairs,
2443 * to be added to rlist (including buffer) used for MxN.
2444 * This is for converting an MxN list to a 1x1 list. This means we can't
2445 * use the normal buffer estimate, as we have an MxN list in which
2446 * some atom pairs beyond rlist are missing. We want to capture
2447 * the beneficial effect of buffering by extra pairs just outside rlist,
2448 * while removing the useless pairs that are further away from rlist.
2449 * (Also the buffer could have been set manually not using the estimate.)
2450 * This buffer size is an overestimate.
2451 * We add 10% of the smallest grid sub-cell dimensions.
2452 * Note that the z-size differs per cell and we don't use this,
2453 * so we overestimate.
2454 * With PME, the 10% value gives a buffer that is somewhat larger
2455 * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
2456 * Smaller tolerances or using RF lead to a smaller effective buffer,
2457 * so 10% gives a safe overestimate.
2459 return eff_1x1_buffer_fac_overest
*(minimum_subgrid_size_xy(iGrid
) +
2460 minimum_subgrid_size_xy(jGrid
));
2463 /* Estimates the interaction volume^2 for non-local interactions */
2464 static real
nonlocal_vol2(const struct gmx_domdec_zones_t
*zones
, const rvec ls
, real r
)
2472 /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
2473 * not home interaction volume^2. As these volumes are not additive,
2474 * this is an overestimate, but it would only be significant in the limit
2475 * of small cells, where we anyhow need to split the lists into
2476 * as small parts as possible.
2479 for (int z
= 0; z
< zones
->n
; z
++)
2481 if (zones
->shift
[z
][XX
] + zones
->shift
[z
][YY
] + zones
->shift
[z
][ZZ
] == 1)
2486 for (int d
= 0; d
< DIM
; d
++)
2488 if (zones
->shift
[z
][d
] == 0)
2492 za
*= zones
->size
[z
].x1
[d
] - zones
->size
[z
].x0
[d
];
2496 /* 4 octants of a sphere */
2497 vold_est
= 0.25*M_PI
*r
*r
*r
*r
;
2498 /* 4 quarter pie slices on the edges */
2499 vold_est
+= 4*cl
*M_PI
/6.0*r
*r
*r
;
2500 /* One rectangular volume on a face */
2501 vold_est
+= ca
*0.5*r
*r
;
2503 vol2_est_tot
+= vold_est
*za
;
2507 return vol2_est_tot
;
2510 /* Estimates the average size of a full j-list for super/sub setup */
2511 static void get_nsubpair_target(const Nbnxm::GridSet
&gridSet
,
2512 const InteractionLocality iloc
,
2514 const int min_ci_balanced
,
2515 int *nsubpair_target
,
2516 float *nsubpair_tot_est
)
2518 /* The target value of 36 seems to be the optimum for Kepler.
2519 * Maxwell is less sensitive to the exact value.
2521 const int nsubpair_target_min
= 36;
2522 real r_eff_sup
, vol_est
, nsp_est
, nsp_est_nl
;
2524 const Grid
&grid
= gridSet
.grids()[0];
2526 /* We don't need to balance list sizes if:
2527 * - We didn't request balancing.
2528 * - The number of grid cells >= the number of lists requested,
2529 * since we will always generate at least #cells lists.
2530 * - We don't have any cells, since then there won't be any lists.
2532 if (min_ci_balanced
<= 0 || grid
.numCells() >= min_ci_balanced
|| grid
.numCells() == 0)
2534 /* nsubpair_target==0 signals no balancing */
2535 *nsubpair_target
= 0;
2536 *nsubpair_tot_est
= 0;
2542 const int numAtomsCluster
= grid
.geometry().numAtomsICluster
;
2543 const Grid::Dimensions
&dims
= grid
.dimensions();
2545 ls
[XX
] = dims
.cellSize
[XX
]/c_gpuNumClusterPerCellX
;
2546 ls
[YY
] = dims
.cellSize
[YY
]/c_gpuNumClusterPerCellY
;
2547 ls
[ZZ
] = numAtomsCluster
/(dims
.atomDensity
*ls
[XX
]*ls
[YY
]);
2549 /* The formulas below are a heuristic estimate of the average nsj per si*/
2550 r_eff_sup
= rlist
+ nbnxn_get_rlist_effective_inc(numAtomsCluster
, ls
);
2552 if (!gridSet
.domainSetup().haveMultipleDomains
||
2553 gridSet
.domainSetup().zones
->n
== 1)
2560 gmx::square(dims
.atomDensity
/numAtomsCluster
)*
2561 nonlocal_vol2(gridSet
.domainSetup().zones
, ls
, r_eff_sup
);
2564 if (iloc
== InteractionLocality::Local
)
2566 /* Sub-cell interacts with itself */
2567 vol_est
= ls
[XX
]*ls
[YY
]*ls
[ZZ
];
2568 /* 6/2 rectangular volume on the faces */
2569 vol_est
+= (ls
[XX
]*ls
[YY
] + ls
[XX
]*ls
[ZZ
] + ls
[YY
]*ls
[ZZ
])*r_eff_sup
;
2570 /* 12/2 quarter pie slices on the edges */
2571 vol_est
+= 2*(ls
[XX
] + ls
[YY
] + ls
[ZZ
])*0.25*M_PI
*gmx::square(r_eff_sup
);
2572 /* 4 octants of a sphere */
2573 vol_est
+= 0.5*4.0/3.0*M_PI
*gmx::power3(r_eff_sup
);
2575 /* Estimate the number of cluster pairs as the local number of
2576 * clusters times the volume they interact with times the density.
2578 nsp_est
= grid
.numClusters()*vol_est
*dims
.atomDensity
/numAtomsCluster
;
2580 /* Subtract the non-local pair count */
2581 nsp_est
-= nsp_est_nl
;
2583 /* For small cut-offs nsp_est will be an underesimate.
2584 * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
2585 * So to avoid too small or negative nsp_est we set a minimum of
2586 * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
2587 * This might be a slight overestimate for small non-periodic groups of
2588 * atoms as will occur for a local domain with DD, but for small
2589 * groups of atoms we'll anyhow be limited by nsubpair_target_min,
2590 * so this overestimation will not matter.
2592 nsp_est
= std::max(nsp_est
, grid
.numClusters()*14._real
);
2596 fprintf(debug
, "nsp_est local %5.1f non-local %5.1f\n",
2597 nsp_est
, nsp_est_nl
);
2602 nsp_est
= nsp_est_nl
;
2605 /* Thus the (average) maximum j-list size should be as follows.
2606 * Since there is overhead, we shouldn't make the lists too small
2607 * (and we can't chop up j-groups) so we use a minimum target size of 36.
2609 *nsubpair_target
= std::max(nsubpair_target_min
,
2610 roundToInt(nsp_est
/min_ci_balanced
));
2611 *nsubpair_tot_est
= nsp_est
;
2615 fprintf(debug
, "nbl nsp estimate %.1f, nsubpair_target %d\n",
2616 nsp_est
, *nsubpair_target
);
2620 /* Debug list print function */
2621 static void print_nblist_ci_cj(FILE *fp
,
2622 const NbnxnPairlistCpu
&nbl
)
2624 for (const nbnxn_ci_t
&ciEntry
: nbl
.ci
)
2626 fprintf(fp
, "ci %4d shift %2d ncj %3d\n",
2627 ciEntry
.ci
, ciEntry
.shift
,
2628 ciEntry
.cj_ind_end
- ciEntry
.cj_ind_start
);
2630 for (int j
= ciEntry
.cj_ind_start
; j
< ciEntry
.cj_ind_end
; j
++)
2632 fprintf(fp
, " cj %5d imask %x\n",
2639 /* Debug list print function */
2640 static void print_nblist_sci_cj(FILE *fp
,
2641 const NbnxnPairlistGpu
&nbl
)
2643 for (const nbnxn_sci_t
&sci
: nbl
.sci
)
2645 fprintf(fp
, "ci %4d shift %2d ncj4 %2d\n",
2647 sci
.numJClusterGroups());
2650 for (int j4
= sci
.cj4_ind_start
; j4
< sci
.cj4_ind_end
; j4
++)
2652 for (int j
= 0; j
< c_nbnxnGpuJgroupSize
; j
++)
2654 fprintf(fp
, " sj %5d imask %x\n",
2656 nbl
.cj4
[j4
].imei
[0].imask
);
2657 for (int si
= 0; si
< c_gpuNumClusterPerCell
; si
++)
2659 if (nbl
.cj4
[j4
].imei
[0].imask
& (1U << (j
*c_gpuNumClusterPerCell
+ si
)))
2666 fprintf(fp
, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
2668 sci
.numJClusterGroups(),
2673 /* Combine pair lists *nbl generated on multiple threads nblc */
2674 static void combine_nblists(gmx::ArrayRef
<const NbnxnPairlistGpu
> nbls
,
2675 NbnxnPairlistGpu
*nblc
)
2677 int nsci
= nblc
->sci
.size();
2678 int ncj4
= nblc
->cj4
.size();
2679 int nexcl
= nblc
->excl
.size();
2680 for (auto &nbl
: nbls
)
2682 nsci
+= nbl
.sci
.size();
2683 ncj4
+= nbl
.cj4
.size();
2684 nexcl
+= nbl
.excl
.size();
2687 /* Resize with the final, combined size, so we can fill in parallel */
2688 /* NOTE: For better performance we should use default initialization */
2689 nblc
->sci
.resize(nsci
);
2690 nblc
->cj4
.resize(ncj4
);
2691 nblc
->excl
.resize(nexcl
);
2693 /* Each thread should copy its own data to the combined arrays,
2694 * as otherwise data will go back and forth between different caches.
2696 const int gmx_unused nthreads
= gmx_omp_nthreads_get(emntPairsearch
);
2698 #pragma omp parallel for num_threads(nthreads) schedule(static)
2699 for (gmx::index n
= 0; n
< nbls
.ssize(); n
++)
2703 /* Determine the offset in the combined data for our thread.
2704 * Note that the original sizes in nblc are lost.
2706 int sci_offset
= nsci
;
2707 int cj4_offset
= ncj4
;
2708 int excl_offset
= nexcl
;
2710 for (gmx::index i
= n
; i
< nbls
.ssize(); i
++)
2712 sci_offset
-= nbls
[i
].sci
.size();
2713 cj4_offset
-= nbls
[i
].cj4
.size();
2714 excl_offset
-= nbls
[i
].excl
.size();
2717 const NbnxnPairlistGpu
&nbli
= nbls
[n
];
2719 for (size_t i
= 0; i
< nbli
.sci
.size(); i
++)
2721 nblc
->sci
[sci_offset
+ i
] = nbli
.sci
[i
];
2722 nblc
->sci
[sci_offset
+ i
].cj4_ind_start
+= cj4_offset
;
2723 nblc
->sci
[sci_offset
+ i
].cj4_ind_end
+= cj4_offset
;
2726 for (size_t j4
= 0; j4
< nbli
.cj4
.size(); j4
++)
2728 nblc
->cj4
[cj4_offset
+ j4
] = nbli
.cj4
[j4
];
2729 nblc
->cj4
[cj4_offset
+ j4
].imei
[0].excl_ind
+= excl_offset
;
2730 nblc
->cj4
[cj4_offset
+ j4
].imei
[1].excl_ind
+= excl_offset
;
2733 for (size_t j4
= 0; j4
< nbli
.excl
.size(); j4
++)
2735 nblc
->excl
[excl_offset
+ j4
] = nbli
.excl
[j4
];
2738 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2741 for (auto &nbl
: nbls
)
2743 nblc
->nci_tot
+= nbl
.nci_tot
;
2747 static void balance_fep_lists(gmx::ArrayRef
< std::unique_ptr
< t_nblist
>> fepLists
,
2748 gmx::ArrayRef
<PairsearchWork
> work
)
2750 const int numLists
= fepLists
.ssize();
2754 /* Nothing to balance */
2758 /* Count the total i-lists and pairs */
2761 for (const auto &list
: fepLists
)
2763 nri_tot
+= list
->nri
;
2764 nrj_tot
+= list
->nrj
;
2767 const int nrj_target
= (nrj_tot
+ numLists
- 1)/numLists
;
2769 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded
) == numLists
,
2770 "We should have as many work objects as FEP lists");
2772 #pragma omp parallel for schedule(static) num_threads(numLists)
2773 for (int th
= 0; th
< numLists
; th
++)
2777 t_nblist
*nbl
= work
[th
].nbl_fep
.get();
2779 /* Note that here we allocate for the total size, instead of
2780 * a per-thread esimate (which is hard to obtain).
2782 if (nri_tot
> nbl
->maxnri
)
2784 nbl
->maxnri
= over_alloc_large(nri_tot
);
2785 reallocate_nblist(nbl
);
2787 if (nri_tot
> nbl
->maxnri
|| nrj_tot
> nbl
->maxnrj
)
2789 nbl
->maxnrj
= over_alloc_small(nrj_tot
);
2790 srenew(nbl
->jjnr
, nbl
->maxnrj
);
2791 srenew(nbl
->excl_fep
, nbl
->maxnrj
);
2794 clear_pairlist_fep(nbl
);
2796 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2799 /* Loop over the source lists and assign and copy i-entries */
2801 t_nblist
*nbld
= work
[th_dest
].nbl_fep
.get();
2802 for (int th
= 0; th
< numLists
; th
++)
2804 const t_nblist
*nbls
= fepLists
[th
].get();
2806 for (int i
= 0; i
< nbls
->nri
; i
++)
2810 /* The number of pairs in this i-entry */
2811 nrj
= nbls
->jindex
[i
+1] - nbls
->jindex
[i
];
2813 /* Decide if list th_dest is too large and we should procede
2814 * to the next destination list.
2816 if (th_dest
+ 1 < numLists
&& nbld
->nrj
> 0 &&
2817 nbld
->nrj
+ nrj
- nrj_target
> nrj_target
- nbld
->nrj
)
2820 nbld
= work
[th_dest
].nbl_fep
.get();
2823 nbld
->iinr
[nbld
->nri
] = nbls
->iinr
[i
];
2824 nbld
->gid
[nbld
->nri
] = nbls
->gid
[i
];
2825 nbld
->shift
[nbld
->nri
] = nbls
->shift
[i
];
2827 for (int j
= nbls
->jindex
[i
]; j
< nbls
->jindex
[i
+1]; j
++)
2829 nbld
->jjnr
[nbld
->nrj
] = nbls
->jjnr
[j
];
2830 nbld
->excl_fep
[nbld
->nrj
] = nbls
->excl_fep
[j
];
2834 nbld
->jindex
[nbld
->nri
] = nbld
->nrj
;
2838 /* Swap the list pointers */
2839 for (int th
= 0; th
< numLists
; th
++)
2841 fepLists
[th
].swap(work
[th
].nbl_fep
);
2845 fprintf(debug
, "nbl_fep[%d] nri %4d nrj %4d\n",
2853 /* Returns the next ci to be processes by our thread */
2854 static gmx_bool
next_ci(const Grid
&grid
,
2855 int nth
, int ci_block
,
2856 int *ci_x
, int *ci_y
,
2862 if (*ci_b
== ci_block
)
2864 /* Jump to the next block assigned to this task */
2865 *ci
+= (nth
- 1)*ci_block
;
2869 if (*ci
>= grid
.numCells())
2874 while (*ci
>= grid
.firstCellInColumn(*ci_x
*grid
.dimensions().numCells
[YY
] + *ci_y
+ 1))
2877 if (*ci_y
== grid
.dimensions().numCells
[YY
])
2887 /* Returns the distance^2 for which we put cell pairs in the list
2888 * without checking atom pair distances. This is usually < rlist^2.
2890 static float boundingbox_only_distance2(const Grid::Dimensions
&iGridDims
,
2891 const Grid::Dimensions
&jGridDims
,
2895 /* If the distance between two sub-cell bounding boxes is less
2896 * than this distance, do not check the distance between
2897 * all particle pairs in the sub-cell, since then it is likely
2898 * that the box pair has atom pairs within the cut-off.
2899 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
2900 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
2901 * Using more than 0.5 gains at most 0.5%.
2902 * If forces are calculated more than twice, the performance gain
2903 * in the force calculation outweighs the cost of checking.
2904 * Note that with subcell lists, the atom-pair distance check
2905 * is only performed when only 1 out of 8 sub-cells in within range,
2906 * this is because the GPU is much faster than the cpu.
2911 bbx
= 0.5*(iGridDims
.cellSize
[XX
] + jGridDims
.cellSize
[XX
]);
2912 bby
= 0.5*(iGridDims
.cellSize
[YY
] + jGridDims
.cellSize
[YY
]);
2915 bbx
/= c_gpuNumClusterPerCellX
;
2916 bby
/= c_gpuNumClusterPerCellY
;
2919 rbb2
= std::max(0.0, rlist
- 0.5*std::sqrt(bbx
*bbx
+ bby
*bby
));
2925 return (float)((1+GMX_FLOAT_EPS
)*rbb2
);
2929 static int get_ci_block_size(const Grid
&iGrid
,
2930 const bool haveMultipleDomains
,
2933 const int ci_block_enum
= 5;
2934 const int ci_block_denom
= 11;
2935 const int ci_block_min_atoms
= 16;
2938 /* Here we decide how to distribute the blocks over the threads.
2939 * We use prime numbers to try to avoid that the grid size becomes
2940 * a multiple of the number of threads, which would lead to some
2941 * threads getting "inner" pairs and others getting boundary pairs,
2942 * which in turns will lead to load imbalance between threads.
2943 * Set the block size as 5/11/ntask times the average number of cells
2944 * in a y,z slab. This should ensure a quite uniform distribution
2945 * of the grid parts of the different thread along all three grid
2946 * zone boundaries with 3D domain decomposition. At the same time
2947 * the blocks will not become too small.
2949 GMX_ASSERT(iGrid
.dimensions().numCells
[XX
] > 0, "Grid can't be empty");
2950 GMX_ASSERT(numLists
> 0, "We need at least one list");
2951 ci_block
= (iGrid
.numCells()*ci_block_enum
)/(ci_block_denom
*iGrid
.dimensions().numCells
[XX
]*numLists
);
2953 const int numAtomsPerCell
= iGrid
.geometry().numAtomsPerCell
;
2955 /* Ensure the blocks are not too small: avoids cache invalidation */
2956 if (ci_block
*numAtomsPerCell
< ci_block_min_atoms
)
2958 ci_block
= (ci_block_min_atoms
+ numAtomsPerCell
- 1)/numAtomsPerCell
;
2961 /* Without domain decomposition
2962 * or with less than 3 blocks per task, divide in nth blocks.
2964 if (!haveMultipleDomains
|| numLists
*3*ci_block
> iGrid
.numCells())
2966 ci_block
= (iGrid
.numCells() + numLists
- 1)/numLists
;
2969 if (ci_block
> 1 && (numLists
- 1)*ci_block
>= iGrid
.numCells())
2971 /* Some threads have no work. Although reducing the block size
2972 * does not decrease the block count on the first few threads,
2973 * with GPUs better mixing of "upper" cells that have more empty
2974 * clusters results in a somewhat lower max load over all threads.
2975 * Without GPUs the regime of so few atoms per thread is less
2976 * performance relevant, but with 8-wide SIMD the same reasoning
2977 * applies, since the pair list uses 4 i-atom "sub-clusters".
2985 /* Returns the number of bits to right-shift a cluster index to obtain
2986 * the corresponding force buffer flag index.
2988 static int getBufferFlagShift(int numAtomsPerCluster
)
2990 int bufferFlagShift
= 0;
2991 while ((numAtomsPerCluster
<< bufferFlagShift
) < NBNXN_BUFFERFLAG_SIZE
)
2996 return bufferFlagShift
;
2999 static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused
&pairlist
)
3004 static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused
&pairlist
)
3010 makeClusterListWrapper(NbnxnPairlistCpu
*nbl
,
3011 const Grid gmx_unused
&iGrid
,
3014 const int firstCell
,
3016 const bool excludeSubDiagonal
,
3017 const nbnxn_atomdata_t
*nbat
,
3020 const ClusterDistanceKernelType kernelType
,
3021 int *numDistanceChecks
)
3025 case ClusterDistanceKernelType::CpuPlainC
:
3026 makeClusterListSimple(jGrid
,
3027 nbl
, ci
, firstCell
, lastCell
,
3033 #ifdef GMX_NBNXN_SIMD_4XN
3034 case ClusterDistanceKernelType::CpuSimd_4xM
:
3035 makeClusterListSimd4xn(jGrid
,
3036 nbl
, ci
, firstCell
, lastCell
,
3043 #ifdef GMX_NBNXN_SIMD_2XNN
3044 case ClusterDistanceKernelType::CpuSimd_2xMM
:
3045 makeClusterListSimd2xnn(jGrid
,
3046 nbl
, ci
, firstCell
, lastCell
,
3054 GMX_ASSERT(false, "Unhandled kernel type");
3059 makeClusterListWrapper(NbnxnPairlistGpu
*nbl
,
3060 const Grid
&gmx_unused iGrid
,
3063 const int firstCell
,
3065 const bool excludeSubDiagonal
,
3066 const nbnxn_atomdata_t
*nbat
,
3069 ClusterDistanceKernelType gmx_unused kernelType
,
3070 int *numDistanceChecks
)
3072 for (int cj
= firstCell
; cj
<= lastCell
; cj
++)
3074 make_cluster_list_supersub(iGrid
, jGrid
,
3077 nbat
->xstride
, nbat
->x().data(),
3083 static int getNumSimpleJClustersInList(const NbnxnPairlistCpu
&nbl
)
3085 return nbl
.cj
.size();
3088 static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu
&nbl
)
3093 static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu
*nbl
,
3096 nbl
->ncjInUse
+= nbl
->cj
.size();
3097 nbl
->ncjInUse
-= ncj_old_j
;
3100 static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused
*nbl
,
3101 int gmx_unused ncj_old_j
)
3105 static void checkListSizeConsistency(const NbnxnPairlistCpu
&nbl
,
3106 const bool haveFreeEnergy
)
3108 GMX_RELEASE_ASSERT(static_cast<size_t>(nbl
.ncjInUse
) == nbl
.cj
.size() || haveFreeEnergy
,
3109 "Without free-energy all cj pair-list entries should be in use. "
3110 "Note that subsequent code does not make use of the equality, "
3111 "this check is only here to catch bugs");
3114 static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused
&nbl
,
3115 bool gmx_unused haveFreeEnergy
)
3117 /* We currently can not check consistency here */
3120 /* Set the buffer flags for newly added entries in the list */
3121 static void setBufferFlags(const NbnxnPairlistCpu
&nbl
,
3122 const int ncj_old_j
,
3123 const int gridj_flag_shift
,
3124 gmx_bitmask_t
*gridj_flag
,
3127 if (gmx::ssize(nbl
.cj
) > ncj_old_j
)
3129 int cbFirst
= nbl
.cj
[ncj_old_j
].cj
>> gridj_flag_shift
;
3130 int cbLast
= nbl
.cj
.back().cj
>> gridj_flag_shift
;
3131 for (int cb
= cbFirst
; cb
<= cbLast
; cb
++)
3133 bitmask_init_bit(&gridj_flag
[cb
], th
);
3138 static void setBufferFlags(const NbnxnPairlistGpu gmx_unused
&nbl
,
3139 int gmx_unused ncj_old_j
,
3140 int gmx_unused gridj_flag_shift
,
3141 gmx_bitmask_t gmx_unused
*gridj_flag
,
3144 GMX_ASSERT(false, "This function should never be called");
3147 /* Generates the part of pair-list nbl assigned to our thread */
3148 template <typename T
>
3149 static void nbnxn_make_pairlist_part(const Nbnxm::GridSet
&gridSet
,
3152 PairsearchWork
*work
,
3153 const nbnxn_atomdata_t
*nbat
,
3154 const t_blocka
&exclusions
,
3156 const PairlistType pairlistType
,
3158 gmx_bool bFBufferFlag
,
3161 float nsubpair_tot_est
,
3170 int ci_b
, ci
, ci_x
, ci_y
, ci_xy
;
3172 real bx0
, bx1
, by0
, by1
, bz0
, bz1
;
3174 real d2cx
, d2z
, d2z_cx
, d2z_cy
, d2zx
, d2zxy
, d2xy
;
3175 int cxf
, cxl
, cyf
, cyf_x
, cyl
;
3176 int numDistanceChecks
;
3177 int gridi_flag_shift
= 0, gridj_flag_shift
= 0;
3178 gmx_bitmask_t
*gridj_flag
= nullptr;
3179 int ncj_old_i
, ncj_old_j
;
3181 if (jGrid
.geometry().isSimple
!= pairlistIsSimple(*nbl
) ||
3182 iGrid
.geometry().isSimple
!= pairlistIsSimple(*nbl
))
3184 gmx_incons("Grid incompatible with pair-list");
3188 GMX_ASSERT(nbl
->na_ci
== jGrid
.geometry().numAtomsICluster
,
3189 "The cluster sizes in the list and grid should match");
3190 nbl
->na_cj
= JClusterSizePerListType
[pairlistType
];
3191 na_cj_2log
= get_2log(nbl
->na_cj
);
3197 /* Determine conversion of clusters to flag blocks */
3198 gridi_flag_shift
= getBufferFlagShift(nbl
->na_ci
);
3199 gridj_flag_shift
= getBufferFlagShift(nbl
->na_cj
);
3201 gridj_flag
= work
->buffer_flags
.flag
;
3204 gridSet
.getBox(box
);
3206 const bool haveFep
= gridSet
.haveFep();
3208 const real rlist2
= nbl
->rlist
*nbl
->rlist
;
3210 // Select the cluster pair distance kernel type
3211 const ClusterDistanceKernelType kernelType
=
3212 getClusterDistanceKernelType(pairlistType
, *nbat
);
3214 if (haveFep
&& !pairlistIsSimple(*nbl
))
3216 /* Determine an atom-pair list cut-off distance for FEP atom pairs.
3217 * We should not simply use rlist, since then we would not have
3218 * the small, effective buffering of the NxN lists.
3219 * The buffer is on overestimate, but the resulting cost for pairs
3220 * beyond rlist is neglible compared to the FEP pairs within rlist.
3222 rl_fep2
= nbl
->rlist
+ effective_buffer_1x1_vs_MxN(iGrid
, jGrid
);
3226 fprintf(debug
, "nbl_fep atom-pair rlist %f\n", rl_fep2
);
3228 rl_fep2
= rl_fep2
*rl_fep2
;
3231 const Grid::Dimensions
&iGridDims
= iGrid
.dimensions();
3232 const Grid::Dimensions
&jGridDims
= jGrid
.dimensions();
3234 rbb2
= boundingbox_only_distance2(iGridDims
, jGridDims
, nbl
->rlist
, pairlistIsSimple(*nbl
));
3238 fprintf(debug
, "nbl bounding box only distance %f\n", std::sqrt(rbb2
));
3241 const bool isIntraGridList
= (&iGrid
== &jGrid
);
3243 /* Set the shift range */
3244 for (int d
= 0; d
< DIM
; d
++)
3246 /* Check if we need periodicity shifts.
3247 * Without PBC or with domain decomposition we don't need them.
3249 if (d
>= ePBC2npbcdim(gridSet
.domainSetup().ePBC
) ||
3250 gridSet
.domainSetup().haveMultipleDomainsPerDim
[d
])
3256 const real listRangeCellToCell
=
3257 listRangeForGridCellToGridCell(rlist
, iGrid
.dimensions(), jGrid
.dimensions());
3259 box
[XX
][XX
] - fabs(box
[YY
][XX
]) - fabs(box
[ZZ
][XX
]) < listRangeCellToCell
)
3269 const bool bSimple
= pairlistIsSimple(*nbl
);
3270 gmx::ArrayRef
<const BoundingBox
> bb_i
;
3272 gmx::ArrayRef
<const float> pbb_i
;
3275 bb_i
= iGrid
.iBoundingBoxes();
3279 pbb_i
= iGrid
.packedBoundingBoxes();
3282 /* We use the normal bounding box format for both grid types */
3283 bb_i
= iGrid
.iBoundingBoxes();
3285 gmx::ArrayRef
<const BoundingBox1D
> bbcz_i
= iGrid
.zBoundingBoxes();
3286 gmx::ArrayRef
<const int> flags_i
= iGrid
.clusterFlags();
3287 gmx::ArrayRef
<const BoundingBox1D
> bbcz_j
= jGrid
.zBoundingBoxes();
3288 int cell0_i
= iGrid
.cellOffset();
3292 fprintf(debug
, "nbl nc_i %d col.av. %.1f ci_block %d\n",
3293 iGrid
.numCells(), iGrid
.numCells()/static_cast<double>(iGrid
.numColumns()), ci_block
);
3296 numDistanceChecks
= 0;
3298 const real listRangeBBToJCell2
= gmx::square(listRangeForBoundingBoxToGridCell(rlist
, jGrid
.dimensions()));
3300 /* Initially ci_b and ci to 1 before where we want them to start,
3301 * as they will both be incremented in next_ci.
3304 ci
= th
*ci_block
- 1;
3307 while (next_ci(iGrid
, nth
, ci_block
, &ci_x
, &ci_y
, &ci_b
, &ci
))
3309 if (bSimple
&& flags_i
[ci
] == 0)
3313 ncj_old_i
= getNumSimpleJClustersInList(*nbl
);
3316 if (!isIntraGridList
&& shp
[XX
] == 0)
3320 bx1
= bb_i
[ci
].upper
.x
;
3324 bx1
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
)+1)*iGridDims
.cellSize
[XX
];
3326 if (bx1
< jGridDims
.lowerCorner
[XX
])
3328 d2cx
= gmx::square(jGridDims
.lowerCorner
[XX
] - bx1
);
3330 if (d2cx
>= listRangeBBToJCell2
)
3337 ci_xy
= ci_x
*iGridDims
.numCells
[YY
] + ci_y
;
3339 /* Loop over shift vectors in three dimensions */
3340 for (int tz
= -shp
[ZZ
]; tz
<= shp
[ZZ
]; tz
++)
3342 const real shz
= real(tz
)*box
[ZZ
][ZZ
];
3344 bz0
= bbcz_i
[ci
].lower
+ shz
;
3345 bz1
= bbcz_i
[ci
].upper
+ shz
;
3353 d2z
= gmx::square(bz1
);
3357 d2z
= gmx::square(bz0
- box
[ZZ
][ZZ
]);
3360 d2z_cx
= d2z
+ d2cx
;
3362 if (d2z_cx
>= rlist2
)
3367 bz1_frac
= bz1
/real(iGrid
.numCellsInColumn(ci_xy
));
3372 /* The check with bz1_frac close to or larger than 1 comes later */
3374 for (int ty
= -shp
[YY
]; ty
<= shp
[YY
]; ty
++)
3376 const real shy
= real(ty
)*box
[YY
][YY
] + real(tz
)*box
[ZZ
][YY
];
3380 by0
= bb_i
[ci
].lower
.y
+ shy
;
3381 by1
= bb_i
[ci
].upper
.y
+ shy
;
3385 by0
= iGridDims
.lowerCorner
[YY
] + (real(ci_y
) )*iGridDims
.cellSize
[YY
] + shy
;
3386 by1
= iGridDims
.lowerCorner
[YY
] + (real(ci_y
) + 1)*iGridDims
.cellSize
[YY
] + shy
;
3389 get_cell_range
<YY
>(by0
, by1
,
3400 if (by1
< jGridDims
.lowerCorner
[YY
])
3402 d2z_cy
+= gmx::square(jGridDims
.lowerCorner
[YY
] - by1
);
3404 else if (by0
> jGridDims
.upperCorner
[YY
])
3406 d2z_cy
+= gmx::square(by0
- jGridDims
.upperCorner
[YY
]);
3409 for (int tx
= -shp
[XX
]; tx
<= shp
[XX
]; tx
++)
3411 const int shift
= XYZ2IS(tx
, ty
, tz
);
3413 const bool excludeSubDiagonal
= (isIntraGridList
&& shift
== CENTRAL
);
3415 if (c_pbcShiftBackward
&& isIntraGridList
&& shift
> CENTRAL
)
3420 const real shx
= real(tx
)*box
[XX
][XX
] + real(ty
)*box
[YY
][XX
] + real(tz
)*box
[ZZ
][XX
];
3424 bx0
= bb_i
[ci
].lower
.x
+ shx
;
3425 bx1
= bb_i
[ci
].upper
.x
+ shx
;
3429 bx0
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
) )*iGridDims
.cellSize
[XX
] + shx
;
3430 bx1
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
)+1)*iGridDims
.cellSize
[XX
] + shx
;
3433 get_cell_range
<XX
>(bx0
, bx1
,
3443 addNewIEntry(nbl
, cell0_i
+ci
, shift
, flags_i
[ci
]);
3445 if ((!c_pbcShiftBackward
|| excludeSubDiagonal
) &&
3448 /* Leave the pairs with i > j.
3449 * x is the major index, so skip half of it.
3454 set_icell_bb(iGrid
, ci
, shx
, shy
, shz
,
3457 icell_set_x(cell0_i
+ci
, shx
, shy
, shz
,
3458 nbat
->xstride
, nbat
->x().data(),
3462 for (int cx
= cxf
; cx
<= cxl
; cx
++)
3464 const real cx_real
= cx
;
3466 if (jGridDims
.lowerCorner
[XX
] + cx_real
*jGridDims
.cellSize
[XX
] > bx1
)
3468 d2zx
+= gmx::square(jGridDims
.lowerCorner
[XX
] + cx_real
*jGridDims
.cellSize
[XX
] - bx1
);
3470 else if (jGridDims
.lowerCorner
[XX
] + (cx_real
+1)*jGridDims
.cellSize
[XX
] < bx0
)
3472 d2zx
+= gmx::square(jGridDims
.lowerCorner
[XX
] + (cx_real
+1)*jGridDims
.cellSize
[XX
] - bx0
);
3475 if (isIntraGridList
&&
3477 (!c_pbcShiftBackward
|| shift
== CENTRAL
) &&
3480 /* Leave the pairs with i > j.
3481 * Skip half of y when i and j have the same x.
3490 for (int cy
= cyf_x
; cy
<= cyl
; cy
++)
3492 const int columnStart
= jGrid
.firstCellInColumn(cx
*jGridDims
.numCells
[YY
] + cy
);
3493 const int columnEnd
= jGrid
.firstCellInColumn(cx
*jGridDims
.numCells
[YY
] + cy
+ 1);
3495 const real cy_real
= cy
;
3497 if (jGridDims
.lowerCorner
[YY
] + cy_real
*jGridDims
.cellSize
[YY
] > by1
)
3499 d2zxy
+= gmx::square(jGridDims
.lowerCorner
[YY
] + cy_real
*jGridDims
.cellSize
[YY
] - by1
);
3501 else if (jGridDims
.lowerCorner
[YY
] + (cy_real
+ 1)*jGridDims
.cellSize
[YY
] < by0
)
3503 d2zxy
+= gmx::square(jGridDims
.lowerCorner
[YY
] + (cy_real
+ 1)*jGridDims
.cellSize
[YY
] - by0
);
3505 if (columnStart
< columnEnd
&& d2zxy
< listRangeBBToJCell2
)
3507 /* To improve efficiency in the common case
3508 * of a homogeneous particle distribution,
3509 * we estimate the index of the middle cell
3510 * in range (midCell). We search down and up
3511 * starting from this index.
3513 * Note that the bbcz_j array contains bounds
3514 * for i-clusters, thus for clusters of 4 atoms.
3515 * For the common case where the j-cluster size
3516 * is 8, we could step with a stride of 2,
3517 * but we do not do this because it would
3518 * complicate this code even more.
3520 int midCell
= columnStart
+ static_cast<int>(bz1_frac
*static_cast<real
>(columnEnd
- columnStart
));
3521 if (midCell
>= columnEnd
)
3523 midCell
= columnEnd
- 1;
3528 /* Find the lowest cell that can possibly
3530 * Check if we hit the bottom of the grid,
3531 * if the j-cell is below the i-cell and if so,
3532 * if it is within range.
3534 int downTestCell
= midCell
;
3535 while (downTestCell
>= columnStart
&&
3536 (bbcz_j
[downTestCell
].upper
>= bz0
||
3537 d2xy
+ gmx::square(bbcz_j
[downTestCell
].upper
- bz0
) < rlist2
))
3541 int firstCell
= downTestCell
+ 1;
3543 /* Find the highest cell that can possibly
3545 * Check if we hit the top of the grid,
3546 * if the j-cell is above the i-cell and if so,
3547 * if it is within range.
3549 int upTestCell
= midCell
+ 1;
3550 while (upTestCell
< columnEnd
&&
3551 (bbcz_j
[upTestCell
].lower
<= bz1
||
3552 d2xy
+ gmx::square(bbcz_j
[upTestCell
].lower
- bz1
) < rlist2
))
3556 int lastCell
= upTestCell
- 1;
3558 #define NBNXN_REFCODE 0
3561 /* Simple reference code, for debugging,
3562 * overrides the more complex code above.
3564 firstCell
= columnEnd
;
3566 for (int k
= columnStart
; k
< columnEnd
; k
++)
3568 if (d2xy
+ gmx::square(bbcz_j
[k
*NNBSBB_D
+ 1] - bz0
) < rlist2
&&
3573 if (d2xy
+ gmx::square(bbcz_j
[k
*NNBSBB_D
] - bz1
) < rlist2
&&
3582 if (isIntraGridList
)
3584 /* We want each atom/cell pair only once,
3585 * only use cj >= ci.
3587 if (!c_pbcShiftBackward
|| shift
== CENTRAL
)
3589 firstCell
= std::max(firstCell
, ci
);
3593 if (firstCell
<= lastCell
)
3595 GMX_ASSERT(firstCell
>= columnStart
&& lastCell
< columnEnd
, "The range should reside within the current grid column");
3597 /* For f buffer flags with simple lists */
3598 ncj_old_j
= getNumSimpleJClustersInList(*nbl
);
3600 makeClusterListWrapper(nbl
,
3602 jGrid
, firstCell
, lastCell
,
3607 &numDistanceChecks
);
3611 setBufferFlags(*nbl
, ncj_old_j
, gridj_flag_shift
,
3615 incrementNumSimpleJClustersInList(nbl
, ncj_old_j
);
3621 /* Set the exclusions for this ci list */
3622 setExclusionsForIEntry(gridSet
,
3626 *getOpenIEntry(nbl
),
3631 make_fep_list(gridSet
.atomIndices(), nbat
, nbl
,
3636 iGrid
, jGrid
, nbl_fep
);
3639 /* Close this ci list */
3642 progBal
, nsubpair_tot_est
,
3648 if (bFBufferFlag
&& getNumSimpleJClustersInList(*nbl
) > ncj_old_i
)
3650 bitmask_init_bit(&(work
->buffer_flags
.flag
[(iGrid
.cellOffset() + ci
) >> gridi_flag_shift
]), th
);
3654 work
->ndistc
= numDistanceChecks
;
3656 checkListSizeConsistency(*nbl
, haveFep
);
3660 fprintf(debug
, "number of distance checks %d\n", numDistanceChecks
);
3662 print_nblist_statistics(debug
, *nbl
, gridSet
, rlist
);
3666 fprintf(debug
, "nbl FEP list pairs: %d\n", nbl_fep
->nrj
);
3671 static void reduce_buffer_flags(gmx::ArrayRef
<PairsearchWork
> searchWork
,
3673 const nbnxn_buffer_flags_t
*dest
)
3675 for (int s
= 0; s
< nsrc
; s
++)
3677 gmx_bitmask_t
* flag
= searchWork
[s
].buffer_flags
.flag
;
3679 for (int b
= 0; b
< dest
->nflag
; b
++)
3681 bitmask_union(&(dest
->flag
[b
]), flag
[b
]);
3686 static void print_reduction_cost(const nbnxn_buffer_flags_t
*flags
, int nout
)
3688 int nelem
, nkeep
, ncopy
, nred
, out
;
3689 gmx_bitmask_t mask_0
;
3695 bitmask_init_bit(&mask_0
, 0);
3696 for (int b
= 0; b
< flags
->nflag
; b
++)
3698 if (bitmask_is_equal(flags
->flag
[b
], mask_0
))
3700 /* Only flag 0 is set, no copy of reduction required */
3704 else if (!bitmask_is_zero(flags
->flag
[b
]))
3707 for (out
= 0; out
< nout
; out
++)
3709 if (bitmask_is_set(flags
->flag
[b
], out
))
3726 fprintf(debug
, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
3728 nelem
/static_cast<double>(flags
->nflag
),
3729 nkeep
/static_cast<double>(flags
->nflag
),
3730 ncopy
/static_cast<double>(flags
->nflag
),
3731 nred
/static_cast<double>(flags
->nflag
));
3734 /* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
3735 * *cjGlobal is updated with the cj count in src.
3736 * When setFlags==true, flag bit t is set in flag for all i and j clusters.
3738 template<bool setFlags
>
3739 static void copySelectedListRange(const nbnxn_ci_t
* gmx_restrict srcCi
,
3740 const NbnxnPairlistCpu
* gmx_restrict src
,
3741 NbnxnPairlistCpu
* gmx_restrict dest
,
3742 gmx_bitmask_t
*flag
,
3743 int iFlagShift
, int jFlagShift
, int t
)
3745 const int ncj
= srcCi
->cj_ind_end
- srcCi
->cj_ind_start
;
3747 dest
->ci
.push_back(*srcCi
);
3748 dest
->ci
.back().cj_ind_start
= dest
->cj
.size();
3749 dest
->ci
.back().cj_ind_end
= dest
->ci
.back().cj_ind_start
+ ncj
;
3753 bitmask_init_bit(&flag
[srcCi
->ci
>> iFlagShift
], t
);
3756 for (int j
= srcCi
->cj_ind_start
; j
< srcCi
->cj_ind_end
; j
++)
3758 dest
->cj
.push_back(src
->cj
[j
]);
3762 /* NOTE: This is relatively expensive, since this
3763 * operation is done for all elements in the list,
3764 * whereas at list generation this is done only
3765 * once for each flag entry.
3767 bitmask_init_bit(&flag
[src
->cj
[j
].cj
>> jFlagShift
], t
);
3772 #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
3773 /* Avoid gcc 7 avx512 loop vectorization bug (actually only needed with -mavx512f) */
3774 #pragma GCC push_options
3775 #pragma GCC optimize ("no-tree-vectorize")
3778 /* Returns the number of cluster pairs that are in use summed over all lists */
3779 static int countClusterpairs(gmx::ArrayRef
<const NbnxnPairlistCpu
> pairlists
)
3781 /* gcc 7 with -mavx512f can miss the contributions of 16 consecutive
3782 * elements to the sum calculated in this loop. Above we have disabled
3783 * loop vectorization to avoid this bug.
3786 for (const auto &pairlist
: pairlists
)
3788 ncjTotal
+= pairlist
.ncjInUse
;
3793 #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
3794 #pragma GCC pop_options
3797 /* This routine re-balances the pairlists such that all are nearly equally
3798 * sized. Only whole i-entries are moved between lists. These are moved
3799 * between the ends of the lists, such that the buffer reduction cost should
3800 * not change significantly.
3801 * Note that all original reduction flags are currently kept. This can lead
3802 * to reduction of parts of the force buffer that could be avoided. But since
3803 * the original lists are quite balanced, this will only give minor overhead.
3805 static void rebalanceSimpleLists(gmx::ArrayRef
<const NbnxnPairlistCpu
> srcSet
,
3806 gmx::ArrayRef
<NbnxnPairlistCpu
> destSet
,
3807 gmx::ArrayRef
<PairsearchWork
> searchWork
)
3809 const int ncjTotal
= countClusterpairs(srcSet
);
3810 const int numLists
= srcSet
.ssize();
3811 const int ncjTarget
= (ncjTotal
+ numLists
- 1)/numLists
;
3813 #pragma omp parallel num_threads(numLists)
3815 int t
= gmx_omp_get_thread_num();
3817 int cjStart
= ncjTarget
* t
;
3818 int cjEnd
= ncjTarget
*(t
+ 1);
3820 /* The destination pair-list for task/thread t */
3821 NbnxnPairlistCpu
&dest
= destSet
[t
];
3823 clear_pairlist(&dest
);
3824 dest
.na_cj
= srcSet
[0].na_cj
;
3826 /* Note that the flags in the work struct (still) contain flags
3827 * for all entries that are present in srcSet->nbl[t].
3829 gmx_bitmask_t
*flag
= searchWork
[t
].buffer_flags
.flag
;
3831 int iFlagShift
= getBufferFlagShift(dest
.na_ci
);
3832 int jFlagShift
= getBufferFlagShift(dest
.na_cj
);
3835 for (int s
= 0; s
< numLists
&& cjGlobal
< cjEnd
; s
++)
3837 const NbnxnPairlistCpu
*src
= &srcSet
[s
];
3839 if (cjGlobal
+ src
->ncjInUse
> cjStart
)
3841 for (gmx::index i
= 0; i
< gmx::ssize(src
->ci
) && cjGlobal
< cjEnd
; i
++)
3843 const nbnxn_ci_t
*srcCi
= &src
->ci
[i
];
3844 int ncj
= srcCi
->cj_ind_end
- srcCi
->cj_ind_start
;
3845 if (cjGlobal
>= cjStart
)
3847 /* If the source list is not our own, we need to set
3848 * extra flags (the template bool parameter).
3852 copySelectedListRange
3855 flag
, iFlagShift
, jFlagShift
, t
);
3859 copySelectedListRange
3862 &dest
, flag
, iFlagShift
, jFlagShift
, t
);
3870 cjGlobal
+= src
->ncjInUse
;
3874 dest
.ncjInUse
= dest
.cj
.size();
3878 const int ncjTotalNew
= countClusterpairs(destSet
);
3879 GMX_RELEASE_ASSERT(ncjTotalNew
== ncjTotal
, "The total size of the lists before and after rebalancing should match");
3883 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
3884 static bool checkRebalanceSimpleLists(gmx::ArrayRef
<const NbnxnPairlistCpu
> lists
)
3886 int numLists
= lists
.ssize();
3889 for (int s
= 0; s
< numLists
; s
++)
3891 ncjMax
= std::max(ncjMax
, lists
[s
].ncjInUse
);
3892 ncjTotal
+= lists
[s
].ncjInUse
;
3896 fprintf(debug
, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax
, ncjTotal
);
3898 /* The rebalancing adds 3% extra time to the search. Heuristically we
3899 * determined that under common conditions the non-bonded kernel balance
3900 * improvement will outweigh this when the imbalance is more than 3%.
3901 * But this will, obviously, depend on search vs kernel time and nstlist.
3903 const real rebalanceTolerance
= 1.03;
3905 return real(numLists
*ncjMax
) > real(ncjTotal
)*rebalanceTolerance
;
3908 /* Perform a count (linear) sort to sort the smaller lists to the end.
3909 * This avoids load imbalance on the GPU, as large lists will be
3910 * scheduled and executed first and the smaller lists later.
3911 * Load balancing between multi-processors only happens at the end
3912 * and there smaller lists lead to more effective load balancing.
3913 * The sorting is done on the cj4 count, not on the actual pair counts.
3914 * Not only does this make the sort faster, but it also results in
3915 * better load balancing than using a list sorted on exact load.
3916 * This function swaps the pointer in the pair list to avoid a copy operation.
3918 static void sort_sci(NbnxnPairlistGpu
*nbl
)
3920 if (nbl
->cj4
.size() <= nbl
->sci
.size())
3922 /* nsci = 0 or all sci have size 1, sorting won't change the order */
3926 NbnxnPairlistGpuWork
&work
= *nbl
->work
;
3928 /* We will distinguish differences up to double the average */
3929 const int m
= static_cast<int>((2*ssize(nbl
->cj4
))/ssize(nbl
->sci
));
3931 /* Resize work.sci_sort so we can sort into it */
3932 work
.sci_sort
.resize(nbl
->sci
.size());
3934 std::vector
<int> &sort
= work
.sortBuffer
;
3935 /* Set up m + 1 entries in sort, initialized at 0 */
3937 sort
.resize(m
+ 1, 0);
3938 /* Count the entries of each size */
3939 for (const nbnxn_sci_t
&sci
: nbl
->sci
)
3941 int i
= std::min(m
, sci
.numJClusterGroups());
3944 /* Calculate the offset for each count */
3947 for (gmx::index i
= m
- 1; i
>= 0; i
--)
3950 sort
[i
] = sort
[i
+ 1] + s0
;
3954 /* Sort entries directly into place */
3955 gmx::ArrayRef
<nbnxn_sci_t
> sci_sort
= work
.sci_sort
;
3956 for (const nbnxn_sci_t
&sci
: nbl
->sci
)
3958 int i
= std::min(m
, sci
.numJClusterGroups());
3959 sci_sort
[sort
[i
]++] = sci
;
3962 /* Swap the sci pointers so we use the new, sorted list */
3963 std::swap(nbl
->sci
, work
.sci_sort
);
3966 //! Prepares CPU lists produced by the search for dynamic pruning
3967 static void prepareListsForDynamicPruning(gmx::ArrayRef
<NbnxnPairlistCpu
> lists
);
3970 PairlistSet::constructPairlists(const Nbnxm::GridSet
&gridSet
,
3971 gmx::ArrayRef
<PairsearchWork
> searchWork
,
3972 nbnxn_atomdata_t
*nbat
,
3973 const t_blocka
*excl
,
3974 const int minimumIlistCountForGpuBalancing
,
3976 SearchCycleCounting
*searchCycleCounting
)
3978 const real rlist
= params_
.rlistOuter
;
3980 int nsubpair_target
;
3981 float nsubpair_tot_est
;
3984 int np_tot
, np_noq
, np_hlj
, nap
;
3986 const int numLists
= (isCpuType_
? cpuLists_
.size() : gpuLists_
.size());
3990 fprintf(debug
, "ns making %d nblists\n", numLists
);
3993 nbat
->bUseBufferFlags
= (nbat
->out
.size() > 1);
3994 /* We should re-init the flags before making the first list */
3995 if (nbat
->bUseBufferFlags
&& locality_
== InteractionLocality::Local
)
3997 init_buffer_flags(&nbat
->buffer_flags
, nbat
->numAtoms());
4001 if (locality_
== InteractionLocality::Local
)
4003 /* Only zone (grid) 0 vs 0 */
4008 nzi
= gridSet
.domainSetup().zones
->nizone
;
4011 if (!isCpuType_
&& minimumIlistCountForGpuBalancing
> 0)
4013 get_nsubpair_target(gridSet
, locality_
, rlist
, minimumIlistCountForGpuBalancing
,
4014 &nsubpair_target
, &nsubpair_tot_est
);
4018 nsubpair_target
= 0;
4019 nsubpair_tot_est
= 0;
4022 /* Clear all pair-lists */
4023 for (int th
= 0; th
< numLists
; th
++)
4027 clear_pairlist(&cpuLists_
[th
]);
4031 clear_pairlist(&gpuLists_
[th
]);
4034 if (params_
.haveFep
)
4036 clear_pairlist_fep(fepLists_
[th
].get());
4040 const gmx_domdec_zones_t
*ddZones
= gridSet
.domainSetup().zones
;
4042 for (int zi
= 0; zi
< nzi
; zi
++)
4044 const Grid
&iGrid
= gridSet
.grids()[zi
];
4048 if (locality_
== InteractionLocality::Local
)
4055 zj0
= ddZones
->izone
[zi
].j0
;
4056 zj1
= ddZones
->izone
[zi
].j1
;
4062 for (int zj
= zj0
; zj
< zj1
; zj
++)
4064 const Grid
&jGrid
= gridSet
.grids()[zj
];
4068 fprintf(debug
, "ns search grid %d vs %d\n", zi
, zj
);
4071 searchCycleCounting
->start(enbsCCsearch
);
4073 ci_block
= get_ci_block_size(iGrid
, gridSet
.domainSetup().haveMultipleDomains
, numLists
);
4075 /* With GPU: generate progressively smaller lists for
4076 * load balancing for local only or non-local with 2 zones.
4078 progBal
= (locality_
== InteractionLocality::Local
|| ddZones
->n
<= 2);
4080 #pragma omp parallel for num_threads(numLists) schedule(static)
4081 for (int th
= 0; th
< numLists
; th
++)
4085 /* Re-init the thread-local work flag data before making
4086 * the first list (not an elegant conditional).
4088 if (nbat
->bUseBufferFlags
&& ((zi
== 0 && zj
== 0)))
4090 init_buffer_flags(&searchWork
[th
].buffer_flags
, nbat
->numAtoms());
4093 if (combineLists_
&& th
> 0)
4095 GMX_ASSERT(!isCpuType_
, "Can only combine GPU lists");
4097 clear_pairlist(&gpuLists_
[th
]);
4100 PairsearchWork
&work
= searchWork
[th
];
4102 work
.cycleCounter
.start();
4104 t_nblist
*fepListPtr
= (fepLists_
.empty() ? nullptr : fepLists_
[th
].get());
4106 /* Divide the i cells equally over the pairlists */
4109 nbnxn_make_pairlist_part(gridSet
, iGrid
, jGrid
,
4112 params_
.pairlistType
,
4114 nbat
->bUseBufferFlags
,
4116 progBal
, nsubpair_tot_est
,
4123 nbnxn_make_pairlist_part(gridSet
, iGrid
, jGrid
,
4126 params_
.pairlistType
,
4128 nbat
->bUseBufferFlags
,
4130 progBal
, nsubpair_tot_est
,
4136 work
.cycleCounter
.stop();
4138 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
4140 searchCycleCounting
->stop(enbsCCsearch
);
4145 for (int th
= 0; th
< numLists
; th
++)
4147 inc_nrnb(nrnb
, eNR_NBNXN_DIST2
, searchWork
[th
].ndistc
);
4151 const NbnxnPairlistCpu
&nbl
= cpuLists_
[th
];
4152 np_tot
+= nbl
.cj
.size();
4153 np_noq
+= nbl
.work
->ncj_noq
;
4154 np_hlj
+= nbl
.work
->ncj_hlj
;
4158 const NbnxnPairlistGpu
&nbl
= gpuLists_
[th
];
4159 /* This count ignores potential subsequent pair pruning */
4160 np_tot
+= nbl
.nci_tot
;
4165 nap
= cpuLists_
[0].na_ci
*cpuLists_
[0].na_cj
;
4169 nap
= gmx::square(gpuLists_
[0].na_ci
);
4171 natpair_ljq_
= (np_tot
- np_noq
)*nap
- np_hlj
*nap
/2;
4172 natpair_lj_
= np_noq
*nap
;
4173 natpair_q_
= np_hlj
*nap
/2;
4175 if (combineLists_
&& numLists
> 1)
4177 GMX_ASSERT(!isCpuType_
, "Can only combine GPU lists");
4179 searchCycleCounting
->start(enbsCCcombine
);
4181 combine_nblists(gmx::constArrayRefFromArray(&gpuLists_
[1], numLists
- 1),
4184 searchCycleCounting
->stop(enbsCCcombine
);
4191 if (numLists
> 1 && checkRebalanceSimpleLists(cpuLists_
))
4193 rebalanceSimpleLists(cpuLists_
, cpuListsWork_
, searchWork
);
4195 /* Swap the sets of pair lists */
4196 cpuLists_
.swap(cpuListsWork_
);
4201 /* Sort the entries on size, large ones first */
4202 if (combineLists_
|| gpuLists_
.size() == 1)
4204 sort_sci(&gpuLists_
[0]);
4208 #pragma omp parallel for num_threads(numLists) schedule(static)
4209 for (int th
= 0; th
< numLists
; th
++)
4213 sort_sci(&gpuLists_
[th
]);
4215 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
4220 if (nbat
->bUseBufferFlags
)
4222 reduce_buffer_flags(searchWork
, numLists
, &nbat
->buffer_flags
);
4225 if (gridSet
.haveFep())
4227 /* Balance the free-energy lists over all the threads */
4228 balance_fep_lists(fepLists_
, searchWork
);
4233 /* This is a fresh list, so not pruned, stored using ci.
4234 * ciOuter is invalid at this point.
4236 GMX_ASSERT(cpuLists_
[0].ciOuter
.empty(), "ciOuter is invalid so it should be empty");
4239 /* If we have more than one list, they either got rebalancing (CPU)
4240 * or combined (GPU), so we should dump the final result to debug.
4244 if (isCpuType_
&& cpuLists_
.size() > 1)
4246 for (auto &cpuList
: cpuLists_
)
4248 print_nblist_statistics(debug
, cpuList
, gridSet
, rlist
);
4251 else if (!isCpuType_
&& gpuLists_
.size() > 1)
4253 print_nblist_statistics(debug
, gpuLists_
[0], gridSet
, rlist
);
4263 for (auto &cpuList
: cpuLists_
)
4265 print_nblist_ci_cj(debug
, cpuList
);
4270 print_nblist_sci_cj(debug
, gpuLists_
[0]);
4274 if (nbat
->bUseBufferFlags
)
4276 print_reduction_cost(&nbat
->buffer_flags
, numLists
);
4280 if (params_
.useDynamicPruning
&& isCpuType_
)
4282 prepareListsForDynamicPruning(cpuLists_
);
4287 PairlistSets::construct(const InteractionLocality iLocality
,
4288 PairSearch
*pairSearch
,
4289 nbnxn_atomdata_t
*nbat
,
4290 const t_blocka
*excl
,
4294 pairlistSet(iLocality
).constructPairlists(pairSearch
->gridSet(), pairSearch
->work(),
4295 nbat
, excl
, minimumIlistCountForGpuBalancing_
,
4296 nrnb
, &pairSearch
->cycleCounting_
);
4298 if (iLocality
== Nbnxm::InteractionLocality::Local
)
4300 outerListCreationStep_
= step
;
4304 GMX_RELEASE_ASSERT(outerListCreationStep_
== step
,
4305 "Outer list should be created at the same step as the inner list");
4308 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4309 if (iLocality
== InteractionLocality::Local
)
4311 pairSearch
->cycleCounting_
.searchCount_
++;
4313 if (pairSearch
->cycleCounting_
.recordCycles_
&&
4314 (!pairSearch
->gridSet().domainSetup().haveMultipleDomains
|| iLocality
== InteractionLocality::NonLocal
) &&
4315 pairSearch
->cycleCounting_
.searchCount_
% 100 == 0)
4317 pairSearch
->cycleCounting_
.printCycles(stderr
, pairSearch
->work());
4322 nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality iLocality
,
4323 const t_blocka
*excl
,
4327 pairlistSets_
->construct(iLocality
, pairSearch_
.get(), nbat
.get(), excl
,
4332 /* Launch the transfer of the pairlist to the GPU.
4334 * NOTE: The launch overhead is currently not timed separately
4336 Nbnxm::gpu_init_pairlist(gpu_nbv
,
4337 pairlistSets().pairlistSet(iLocality
).gpuList(),
4342 static void prepareListsForDynamicPruning(gmx::ArrayRef
<NbnxnPairlistCpu
> lists
)
4344 /* TODO: Restructure the lists so we have actual outer and inner
4345 * list objects so we can set a single pointer instead of
4346 * swapping several pointers.
4349 for (auto &list
: lists
)
4351 /* The search produced a list in ci/cj.
4352 * Swap the list pointers so we get the outer list is ciOuter,cjOuter
4353 * and we can prune that to get an inner list in ci/cj.
4355 GMX_RELEASE_ASSERT(list
.ciOuter
.empty() && list
.cjOuter
.empty(),
4356 "The outer lists should be empty before preparation");
4358 std::swap(list
.ci
, list
.ciOuter
);
4359 std::swap(list
.cj
, list
.cjOuter
);