/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/ns.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_geometry.h"
#include "gromacs/nbnxm/nbnxm_simd.h"
#include "gromacs/nbnxm/pairlistset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/topology/block.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/smalloc.h"

#include "pairlistwork.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace

// Convenience alias for partial Nbnxn namespace usage
using InteractionLocality = Nbnxm::InteractionLocality;

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
constexpr bool c_pbcShiftBackward = true;
static void nbs_cycle_clear(nbnxn_cycle_t *cc)
{
    for (int i = 0; i < enbsCCnr; i++)
    {
        cc[i].count = 0;
        cc[i].c     = 0;
    }
}
static double Mcyc_av(const nbnxn_cycle_t *cc)
{
    return static_cast<double>(cc->c)*1e-6/cc->count;
}
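/* Note on the conversion above: cc->c holds raw CPU cycle counts, so
 * multiplying by 1e-6 and dividing by the call count gives the average
 * megacycles per call; e.g. 3e9 cycles over 100 calls reports as 30.
 */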
static void nbs_cycle_print(FILE *fp, const nbnxn_search *nbs)
{
    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->work.size() > 1)
    {
        if (nbs->cc[enbsCCcombine].count > 0)
        {
            fprintf(fp, " comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        }
        fprintf(fp, " s. th");
        for (const nbnxn_search_work_t &work : nbs->work)
        {
            fprintf(fp, " %4.1f",
                    Mcyc_av(&work.cc[enbsCCsearch]));
        }
    }
}
/* Layout for the nonbonded NxN pair lists */
enum class NbnxnLayout
{
    NoSimd4x4, // i-cluster size 4, j-cluster size 4
    Simd4xN,   // i-cluster size 4, j-cluster size SIMD width
    Simd2xNN,  // i-cluster size 4, j-cluster size half SIMD width
    Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
};
/* Returns the j-cluster size */
template <NbnxnLayout layout>
static constexpr int jClusterSize()
{
    static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN, "Currently jClusterSize only supports CPU layouts");

    return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : c_nbnxnCpuIClusterSize);
}
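/* As an illustration, assuming an 8-wide SIMD build (GMX_SIMD_REAL_WIDTH == 8):
 * jClusterSize<NbnxnLayout::Simd4xN>() == 8,
 * jClusterSize<NbnxnLayout::Simd2xNN>() == 4 and
 * jClusterSize<NbnxnLayout::NoSimd4x4>() == c_nbnxnCpuIClusterSize == 4.
 */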
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    jClusterSize      The number of atoms in a j-cluster
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template <int jClusterSize, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    static_assert(jClusterSize == c_nbnxnCpuIClusterSize/2 || jClusterSize == c_nbnxnCpuIClusterSize || jClusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
                  "Only sub-cluster indices 0 and 1 are supported");

    if (jClusterSize == c_nbnxnCpuIClusterSize/2)
    {
        if (jSubClusterIndex == 0)
        {
            return ci << 1;
        }
        else
        {
            return ((ci + 1) << 1) - 1;
        }
    }
    else if (jClusterSize == c_nbnxnCpuIClusterSize)
    {
        return ci;
    }
    else
    {
        return ci >> 1;
    }
}
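/* Worked example: with c_nbnxnCpuIClusterSize == 4 and j-cluster size 2,
 * i-cluster 3 covers atoms 12-15, which span the two size-2 j-clusters 6 and 7:
 * cjFromCi<2, 0>(3) == 3 << 1 == 6 and cjFromCi<2, 1>(3) == ((3 + 1) << 1) - 1 == 7.
 */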
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    layout            The pair-list layout
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template <NbnxnLayout layout, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    return cjFromCi<clusterSize, jSubClusterIndex>(ci);
}
/* Returns the nbnxn coordinate data index given the i-cluster index */
template <NbnxnLayout layout>
static inline int xIndexFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize <= c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return ci*STRIDE_P4;
    }
    else
    {
        /* Coordinates packed in 8, i-cluster size is half the packing width */
        return (ci >> 1)*STRIDE_P8 + (ci & 1)*(c_packX8 >> 1);
    }
}
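/* Packing illustration, assuming the pack-of-4 layout (c_packX4 == 4 and
 * STRIDE_P4 == DIM*c_packX4): coordinates are stored per group of 4 atoms as
 * x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3, so i-cluster 2 starts at
 * 2*STRIDE_P4 reals into the packed coordinate array.
 */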
/* Returns the nbnxn coordinate data index given the j-cluster index */
template <NbnxnLayout layout>
static inline int xIndexFromCj(int cj)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize == c_nbnxnCpuIClusterSize/2)
    {
        /* Coordinates are stored packed in groups of 4 */
        return (cj >> 1)*STRIDE_P4 + (cj & 1)*(c_packX4 >> 1);
    }
    else if (clusterSize == c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return cj*STRIDE_P4;
    }
    else
    {
        /* Coordinates are stored packed in groups of 8 */
        return cj*STRIDE_P8;
    }
}
/* Initializes a single nbnxn_pairlist_t data structure */
static void nbnxn_init_pairlist_fep(t_nblist *nl)
{
    nl->type      = GMX_NBLIST_INTERACTION_FREE_ENERGY;
    nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
    /* The interaction functions are set in the free energy kernel function */

    nl->jindex    = nullptr;
    nl->jjnr      = nullptr;
    nl->excl_fep  = nullptr;
}

static void free_nblist(t_nblist *nl)
{
    sfree(nl->iinr);
    sfree(nl->gid);
    sfree(nl->shift);
    sfree(nl->jindex);
    sfree(nl->jjnr);
    sfree(nl->excl_fep);
    sfree(nl);
}
nbnxn_search_work_t::nbnxn_search_work_t() :
    buffer_flags({0, nullptr, 0}),
    nbl_fep(new t_nblist)
{
    nbnxn_init_pairlist_fep(nbl_fep.get());

    nbs_cycle_clear(cc);
}

nbnxn_search_work_t::~nbnxn_search_work_t()
{
    sfree(buffer_flags.flag);

    free_nblist(nbl_fep.get());
}
nbnxn_search::nbnxn_search(int                       ePBC,
                           const ivec               *n_dd_cells,
                           const gmx_domdec_zones_t *zones,
                           gmx_bool                  bFEP,
                           int                       nthread_max) :
    bFEP(bFEP),
    ePBC(ePBC),
    zones(zones),
    work(nthread_max)
{
    // The correct value will be set during the gridding
    clear_mat(box);
    clear_ivec(dd_dim);
    int numGrids = 1;
    DomDec       = n_dd_cells != nullptr;
    if (DomDec)
    {
        for (int d = 0; d < DIM; d++)
        {
            if ((*n_dd_cells)[d] > 1)
            {
                dd_dim[d] = 1;
                /* Each grid matches a DD zone */
                numGrids *= 2;
            }
        }
    }

    grid.resize(numGrids);

    /* Initialize detailed nbsearch cycle counting */
    print_cycles = (getenv("GMX_NBNXN_CYCLE") != nullptr);
    nbs_cycle_clear(cc);
}
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
                              int                   natoms)
{
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    }
    for (int b = 0; b < flags->nflag; b++)
    {
        bitmask_clear(&(flags->flag[b]));
    }
}
/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between a bounding box of a group of atoms and a grid cell.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), this distance returned can be larger than the input.
 */
static real listRangeForBoundingBoxToGridCell(real                rlist,
                                              const nbnxn_grid_t &grid)
{
    return rlist + grid.maxAtomGroupRadius;
}
/* Returns the pair-list cutoff between two grid cells given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between two grid cells.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), this distance returned can be larger than the input.
 */
static real listRangeForGridCellToGridCell(real                rlist,
                                           const nbnxn_grid_t &iGrid,
                                           const nbnxn_grid_t &jGrid)
{
    return rlist + iGrid.maxAtomGroupRadius + jGrid.maxAtomGroupRadius;
}
/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
template<int dim>
static void get_cell_range(real b0, real b1,
                           const nbnxn_grid_t &jGrid,
                           real d2, real rlist, int *cf, int *cl)
{
    real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid));
    real distanceInCells    = (b0 - jGrid.c0[dim])*jGrid.invCellSize[dim];
    *cf                     = std::max(static_cast<int>(distanceInCells), 0);

    while (*cf > 0 &&
           d2 + gmx::square((b0 - jGrid.c0[dim]) - (*cf - 1 + 1)*jGrid.cellSize[dim]) < listRangeBBToCell2)
    {
        (*cf)--;
    }

    *cl = std::min(static_cast<int>((b1 - jGrid.c0[dim])*jGrid.invCellSize[dim]), jGrid.numCells[dim] - 1);
    while (*cl < jGrid.numCells[dim] - 1 &&
           d2 + gmx::square((*cl + 1)*jGrid.cellSize[dim] - (b1 - jGrid.c0[dim])) < listRangeBBToCell2)
    {
        (*cl)++;
    }
}
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
                       float by1, float bz0, float bz1,
                       const nbnxn_bb_t *bb)
{
    float d2;
    float dl, dh, dm, dm0;

    d2 = 0;

    dl  = bx0 - bb->upper[BB_X];
    dh  = bb->lower[BB_X] - bx1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = by0 - bb->upper[BB_Y];
    dh  = bb->lower[BB_Y] - by1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = bz0 - bb->upper[BB_Z];
    dh  = bb->lower[BB_Z] - bz1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    return d2;
}
/* Plain C code calculating the distance^2 between two bounding boxes */
static float subc_bb_dist2(int                              si,
                           const nbnxn_bb_t                *bb_i_ci,
                           int                              csj,
                           gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
{
    const nbnxn_bb_t *bb_i = bb_i_ci + si;
    const nbnxn_bb_t *bb_j = bb_j_all.data() + csj;

    float             d2 = 0;
    float             dl, dh, dm, dm0;

    dl  = bb_i->lower[BB_X] - bb_j->upper[BB_X];
    dh  = bb_j->lower[BB_X] - bb_i->upper[BB_X];
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
    dh  = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
    dh  = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    return d2;
}
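/* Worked example for the box-distance logic above: along x, boxes [0,1] and
 * [2,3] give dl = 0 - 3 = -3 and dh = 2 - 1 = 1, so dm0 = 1 and the x gap
 * contributes 1 to d2; dimensions where the boxes overlap clamp to 0 and
 * contribute nothing.
 */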
#if NBNXN_SEARCH_BB_SIMD4

/* 4-wide SIMD code for bb distance for bb format xyz0 */
static float subc_bb_dist2_simd4(int                              si,
                                 const nbnxn_bb_t                *bb_i_ci,
                                 int                              csj,
                                 gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float bb_i_S0, bb_i_S1;
    Simd4Float bb_j_S0, bb_j_S1;
    Simd4Float dl_S;
    Simd4Float dh_S;
    Simd4Float dm_S;
    Simd4Float dm0_S;

    bb_i_S0 = load4(&bb_i_ci[si].lower[0]);
    bb_i_S1 = load4(&bb_i_ci[si].upper[0]);
    bb_j_S0 = load4(&bb_j_all[csj].lower[0]);
    bb_j_S1 = load4(&bb_j_all[csj].upper[0]);

    dl_S    = bb_i_S0 - bb_j_S1;
    dh_S    = bb_j_S0 - bb_i_S1;

    dm_S    = max(dl_S, dh_S);
    dm0_S   = max(dm_S, simd4SetZeroF());

    return dotProduct(dm0_S, dm0_S);
}
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
    {                                                \
        int        shi;                              \
                                                     \
        Simd4Float dx_0, dy_0, dz_0;                 \
        Simd4Float dx_1, dy_1, dz_1;                 \
                                                     \
        Simd4Float mx, my, mz;                       \
        Simd4Float m0x, m0y, m0z;                    \
                                                     \
        Simd4Float d2x, d2y, d2z;                    \
        Simd4Float d2s, d2t;                         \
                                                     \
        shi  = (si)*NNBSBB_D*DIM;                    \
                                                     \
        xi_l = load4((bb_i)+shi+0*STRIDE_PBB);       \
        yi_l = load4((bb_i)+shi+1*STRIDE_PBB);       \
        zi_l = load4((bb_i)+shi+2*STRIDE_PBB);       \
        xi_h = load4((bb_i)+shi+3*STRIDE_PBB);       \
        yi_h = load4((bb_i)+shi+4*STRIDE_PBB);       \
        zi_h = load4((bb_i)+shi+5*STRIDE_PBB);       \
                                                     \
        dx_0 = xi_l - xj_h;                          \
        dy_0 = yi_l - yj_h;                          \
        dz_0 = zi_l - zj_h;                          \
                                                     \
        dx_1 = xj_l - xi_h;                          \
        dy_1 = yj_l - yi_h;                          \
        dz_1 = zj_l - zi_h;                          \
                                                     \
        mx   = max(dx_0, dx_1);                      \
        my   = max(dy_0, dy_1);                      \
        mz   = max(dz_0, dz_1);                      \
                                                     \
        m0x  = max(mx, zero);                        \
        m0y  = max(my, zero);                        \
        m0z  = max(mz, zero);                        \
                                                     \
        d2x  = m0x*m0x;                              \
        d2y  = m0y*m0y;                              \
        d2z  = m0z*m0z;                              \
                                                     \
        d2s  = d2x + d2y;                            \
        d2t  = d2s + d2z;                            \
                                                     \
        store4((d2)+(si), d2t);                      \
    }
/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
                                     int nsi, const float *bb_i,
                                     float *d2)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float xj_l, yj_l, zj_l;
    Simd4Float xj_h, yj_h, zj_h;
    Simd4Float xi_l, yi_l, zi_l;
    Simd4Float xi_h, yi_h, zi_h;

    Simd4Float zero = setZero();

    xj_l = Simd4Float(bb_j[0*STRIDE_PBB]);
    yj_l = Simd4Float(bb_j[1*STRIDE_PBB]);
    zj_l = Simd4Float(bb_j[2*STRIDE_PBB]);
    xj_h = Simd4Float(bb_j[3*STRIDE_PBB]);
    yj_h = Simd4Float(bb_j[4*STRIDE_PBB]);
    zj_h = Simd4Float(bb_j[5*STRIDE_PBB]);

    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
     * But as we know the number of iterations is 1 or 2, we unroll manually.
     */
    SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
    if (STRIDE_PBB < nsi)
    {
        SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);
    }
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
/* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
static inline gmx_bool
clusterpair_in_range(const NbnxnPairlistGpuWork &work,
                     int si,
                     int csj, int stride, const real *x_j,
                     real rlist2)
{
#if !GMX_SIMD4_HAVE_REAL

    /* Plain C version.
     * All coordinates are stored as xyzxyz...
     */

    const real *x_i = work.iSuperClusterData.x.data();

    for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
    {
        int i0 = (si*c_nbnxnGpuClusterSize + i)*DIM;
        for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
        {
            int j0 = (csj*c_nbnxnGpuClusterSize + j)*stride;

            real d2 = gmx::square(x_i[i0] - x_j[j0]) + gmx::square(x_i[i0+1] - x_j[j0+1]) + gmx::square(x_i[i0+2] - x_j[j0+2]);

            if (d2 < rlist2)
            {
                return TRUE;
            }
        }
    }

    return FALSE;

#else /* !GMX_SIMD4_HAVE_REAL */

    /* 4-wide SIMD version.
     * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
     * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
     */
    static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
                  "A cluster is hard-coded to 4/8 atoms.");

    Simd4Real   rc2_S      = Simd4Real(rlist2);

    const real *x_i        = work.iSuperClusterData.xSimd.data();

    int         dim_stride = c_nbnxnGpuClusterSize*DIM;
    Simd4Real   ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
    Simd4Real   iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
    Simd4Real   iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);

    Simd4Real   ix_S1, iy_S1, iz_S1;
    if (c_nbnxnGpuClusterSize == 8)
    {
        ix_S1 = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
        iy_S1 = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
        iz_S1 = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
    }
    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
    int j0 = csj*c_nbnxnGpuClusterSize;
    int j1 = j0 + c_nbnxnGpuClusterSize - 1;
    while (j0 < j1)
    {
        Simd4Real jx0_S, jy0_S, jz0_S;
        Simd4Real jx1_S, jy1_S, jz1_S;

        Simd4Real dx_S0, dy_S0, dz_S0;
        Simd4Real dx_S1, dy_S1, dz_S1;
        Simd4Real dx_S2, dy_S2, dz_S2;
        Simd4Real dx_S3, dy_S3, dz_S3;

        Simd4Real rsq_S0, rsq_S1, rsq_S2, rsq_S3;

        Simd4Bool wco_S0, wco_S1, wco_S2, wco_S3;
        Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;

        jx0_S = Simd4Real(x_j[j0*stride+0]);
        jy0_S = Simd4Real(x_j[j0*stride+1]);
        jz0_S = Simd4Real(x_j[j0*stride+2]);

        jx1_S = Simd4Real(x_j[j1*stride+0]);
        jy1_S = Simd4Real(x_j[j1*stride+1]);
        jz1_S = Simd4Real(x_j[j1*stride+2]);

        /* Calculate distance */
        dx_S0 = ix_S0 - jx0_S;
        dy_S0 = iy_S0 - jy0_S;
        dz_S0 = iz_S0 - jz0_S;
        dx_S2 = ix_S0 - jx1_S;
        dy_S2 = iy_S0 - jy1_S;
        dz_S2 = iz_S0 - jz1_S;
        if (c_nbnxnGpuClusterSize == 8)
        {
            dx_S1 = ix_S1 - jx0_S;
            dy_S1 = iy_S1 - jy0_S;
            dz_S1 = iz_S1 - jz0_S;
            dx_S3 = ix_S1 - jx1_S;
            dy_S3 = iy_S1 - jy1_S;
            dz_S3 = iz_S1 - jz1_S;
        }

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
        rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
        if (c_nbnxnGpuClusterSize == 8)
        {
            rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
            rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
        }

        wco_S0 = (rsq_S0 < rc2_S);
        wco_S2 = (rsq_S2 < rc2_S);
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_S1 = (rsq_S1 < rc2_S);
            wco_S3 = (rsq_S3 < rc2_S);
        }
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_any_S01 = wco_S0 || wco_S1;
            wco_any_S23 = wco_S2 || wco_S3;
            wco_any_S   = wco_any_S01 || wco_any_S23;
        }
        else
        {
            wco_any_S = wco_S0 || wco_S2;
        }

        if (anyTrue(wco_any_S))
        {
            return TRUE;
        }

        j0++;
        j1--;
    }

    return FALSE;

#endif /* !GMX_SIMD4_HAVE_REAL */
}
/* Returns the j-cluster index for index cjIndex in a cj list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList,
                        int                             cjIndex)
{
    return cjList[cjIndex].cj;
}

/* Returns the j-cluster index for index cjIndex in a cj4 list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List,
                        int                              cjIndex)
{
    return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
}
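/* Indexing illustration, assuming the typical c_nbnxnGpuJgroupSize == 4:
 * cjIndex 11 maps to cj4List[11/4].cj[11 & 3], i.e. cj4 group 2, slot 3.
 */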
/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
{
    return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
}
/* Initializes a single NbnxnPairlistCpu data structure */
static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
{
    nbl->na_ci    = c_nbnxnCpuIClusterSize;
    nbl->na_cj    = 0;
    nbl->ci.clear();
    nbl->ciOuter.clear();
    nbl->ncjInUse = 0;
    nbl->cj.clear();
    nbl->cjOuter.clear();
    nbl->nci_tot  = 0;

    nbl->work = new NbnxnPairlistCpuWork();
}
NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
    na_ci(c_nbnxnGpuClusterSize),
    na_cj(c_nbnxnGpuClusterSize),
    na_sc(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize),
    rlist(0),
    sci({}, {pinningPolicy}),
    cj4({}, {pinningPolicy}),
    excl({}, {pinningPolicy}),
    nci_tot(0)
{
    static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                  "The search code assumes that a super-cluster matches a search grid cell");

    static_assert(sizeof(cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
                  "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");

    static_assert(sizeof(excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");

    // We always want a first entry without any exclusions
    excl.resize(1);

    work = new NbnxnPairlistGpuWork();
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
{
    nbl_list->bSimple =
        (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
         nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
         nbl_list->params.pairlistType == PairlistType::Simple4x8);
    // Currently GPU lists are always combined
    nbl_list->bCombined = !nbl_list->bSimple;

    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);

    if (!nbl_list->bCombined &&
        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
    {
        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
    }

    if (nbl_list->bSimple)
    {
        snew(nbl_list->nbl, nbl_list->nnbl);
        if (nbl_list->nnbl > 1)
        {
            snew(nbl_list->nbl_work, nbl_list->nnbl);
        }
    }
    else
    {
        snew(nbl_list->nblGpu, nbl_list->nnbl);
    }
    nbl_list->nbl_fep.resize(nbl_list->nnbl);
    /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
    for (int i = 0; i < nbl_list->nnbl; i++)
    {
        try
        {
            /* Allocate the nblist data structure locally on each thread
             * to optimize memory access for NUMA architectures.
             */
            if (nbl_list->bSimple)
            {
                nbl_list->nbl[i] = new NbnxnPairlistCpu();

                nbnxn_init_pairlist(nbl_list->nbl[i]);
                if (nbl_list->nnbl > 1)
                {
                    nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
                    nbnxn_init_pairlist(nbl_list->nbl_work[i]);
                }
            }
            else
            {
                /* Only list 0 is used on the GPU, use normal allocation for i>0 */
                auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);

                nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
            }

            snew(nbl_list->nbl_fep[i], 1);
            nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE *fp, const NbnxnPairlistCpu *nbl,
                                    const nbnxn_search *nbs, real rl)
{
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    fprintf(fp, "nbl nci %zu ncj %d\n",
            nbl->ci.size(), nbl->ncjInUse);
    fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid->nc),
            nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_cj,
            nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_cj/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_cj/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));

    fprintf(fp, "nbl average j cell list length %.1f\n",
            0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));

    int cs[SHIFTS];
    for (int s = 0; s < SHIFTS; s++)
    {
        cs[s] = 0;
    }
    int npexcl = 0;
    for (const nbnxn_ci_t &ciEntry : nbl->ci)
    {
        cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
            ciEntry.cj_ind_end - ciEntry.cj_ind_start;

        int j = ciEntry.cj_ind_start;
        while (j < ciEntry.cj_ind_end &&
               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            npexcl++;
            j++;
        }
    }
    fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
            nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
    for (int s = 0; s < SHIFTS; s++)
    {
        if (cs[s] > 0)
        {
            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
                                    const nbnxn_search *nbs, real rl)
{
    const nbnxn_grid_t *grid;
    int                 b;
    int                 c[c_gpuNumClusterPerCell + 1];
    double              sum_nsp, sum_nsp2;
    int                 nsp_max;

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
            nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid->nsubc_tot),
            nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c,
            nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));

    sum_nsp  = 0;
    sum_nsp2 = 0;
    nsp_max  = 0;
    for (int si = 0; si <= c_gpuNumClusterPerCell; si++)
    {
        c[si] = 0;
    }
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        int nsp = 0;
        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
        {
            for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
            {
                b = 0;
                for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                    {
                        b++;
                    }
                }
                nsp += b;
                c[b]++;
            }
        }
        sum_nsp  += nsp;
        sum_nsp2 += nsp*nsp;
        nsp_max   = std::max(nsp_max, nsp);
    }
    if (!nbl->sci.empty())
    {
        sum_nsp  /= nbl->sci.size();
        sum_nsp2 /= nbl->sci.size();
    }
    fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
            sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);

    if (!nbl->cj4.empty())
    {
        for (b = 0; b <= c_gpuNumClusterPerCell; b++)
        {
            fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                    b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
        }
    }
}
/* Returns a pointer to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
 * Generates a new exclusion entry when the j-cluster-group uses
 * the default all-interaction mask at call time, so the returned mask
 * can be modified when needed.
 */
static nbnxn_excl_t *get_exclusion_mask(NbnxnPairlistGpu *nbl,
                                        int               cj4,
                                        int               warp)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* No exclusions set, make a new list entry */
        const size_t oldSize = nbl->excl.size();
        GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
        /* Add entry with default values: no exclusions */
        nbl->excl.resize(oldSize + 1);
        nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
    }

    return &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
}
static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
                                               int cj4_ind, int sj_offset,
                                               int i_cluster_in_cell)
{
    nbnxn_excl_t *excl[c_nbnxnGpuClusterpairSplit];

    /* Here we only set the self and double pair exclusions */

    /* Reserve extra elements, so the resize() in get_exclusion_mask()
     * will not invalidate excl entries in the loop below
     */
    nbl->excl.reserve(nbl->excl.size() + c_nbnxnGpuClusterpairSplit);
    for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
    {
        excl[w] = get_exclusion_mask(nbl, cj4_ind, w);
    }

    /* Only minor < major bits set */
    for (int ej = 0; ej < nbl->na_ci; ej++)
    {
        const int w = (ej >> 2);
        for (int ei = ej; ei < nbl->na_ci; ei++)
        {
            excl[w]->pair[(ej & (c_nbnxnGpuJgroupSize-1))*nbl->na_ci + ei] &=
                ~(1U << (sj_offset*c_gpuNumClusterPerCell + i_cluster_in_cell));
        }
    }
}
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}
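/* Illustration: with diagonal removal (rdiag) and ci == cj, the mask above
 * keeps only one triangle of the cluster-pair atom matrix, so every atom
 * pair is processed once; all other cluster pairs get the all-bits-set mask.
 * The SIMD variants below encode the same idea for j-cluster sizes that are
 * half or double the i-cluster size.
 */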
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

#if GMX_SIMD_REAL_WIDTH == 2
#define get_imask_simd_4xn  get_imask_simd_j2
#endif
#if GMX_SIMD_REAL_WIDTH == 4
#define get_imask_simd_4xn  get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 8
#define get_imask_simd_4xn  get_imask_simd_j8
#define get_imask_simd_2xnn get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 16
#define get_imask_simd_2xnn get_imask_simd_j8
#endif
/* Plain C code for checking and adding cluster-pairs to the list.
 *
 * \param[in]     jGrid               The j-grid
 * \param[in,out] nbl                 The pair-list to store the cluster pairs in
 * \param[in]     icluster            The index of the i-cluster
 * \param[in]     jclusterFirst       The first cluster in the j-range
 * \param[in]     jclusterLast        The last cluster in the j-range
 * \param[in]     excludeSubDiagonal  Exclude atom pairs with i-index > j-index
 * \param[in]     x_j                 Coordinates for the j-atom, in xyz format
 * \param[in]     rlist2              The squared list cut-off
 * \param[in]     rbb2                The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
 * \param[in,out] numDistanceChecks   The number of distance checks performed
 */
static void
makeClusterListSimple(const nbnxn_grid_t       &jGrid,
                      NbnxnPairlistCpu         *nbl,
                      int                       icluster,
                      int                       jclusterFirst,
                      int                       jclusterLast,
                      bool                      excludeSubDiagonal,
                      const real * gmx_restrict x_j,
                      real                      rlist2,
                      float                     rbb2,
                      int * gmx_restrict        numDistanceChecks)
{
    const nbnxn_bb_t * gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
    const real * gmx_restrict       x_ci  = nbl->work->iClusterData.x.data();

    gmx_bool InRange;

    InRange = FALSE;
    while (!InRange && jclusterFirst <= jclusterLast)
    {
        real d2 = subc_bb_dist2(0, bb_ci, jclusterFirst, jGrid.bb);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjf_gl = jGrid.cell0 + jclusterFirst;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange ||
                        (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterFirst++;
        }
    }
    if (!InRange)
    {
        return;
    }

    InRange = FALSE;
    while (!InRange && jclusterLast > jclusterFirst)
    {
        real d2 = subc_bb_dist2(0, bb_ci, jclusterLast, jGrid.bb);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjl_gl = jGrid.cell0 + jclusterLast;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange ||
                        (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterLast--;
        }
    }

    if (jclusterFirst <= jclusterLast)
    {
        for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
        {
            /* Store cj and the interaction mask */
            nbnxn_cj_t cjEntry;
            cjEntry.cj   = jGrid.cell0 + jcluster;
            cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
            nbl->cj.push_back(cjEntry);
        }
        /* Increase the closing index in the i list */
        nbl->ci.back().cj_ind_end = nbl->cj.size();
    }
}
#ifdef GMX_NBNXN_SIMD_4XN
#include "gromacs/nbnxm/pairlist_simd_4xm.h"
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
#include "gromacs/nbnxm/pairlist_simd_2xmm.h"
#endif
/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
                                       const nbnxn_grid_t &jGrid,
                                       NbnxnPairlistGpu   *nbl,
                                       const int           sci,
                                       const int           scj,
                                       const bool          excludeSubDiagonal,
                                       const int           stride,
                                       const real         *x,
                                       const real          rlist2,
                                       const float         rbb2,
                                       int                *numDistanceChecks)
{
    NbnxnPairlistGpuWork &work   = *nbl->work;

#if NBNXN_BBXXXX
    const float          *pbb_ci = work.iSuperClusterData.bbPacked.data();
#else
    const nbnxn_bb_t     *bb_ci  = work.iSuperClusterData.bb.data();
#endif

    assert(c_nbnxnGpuClusterSize == iGrid.na_c);
    assert(c_nbnxnGpuClusterSize == jGrid.na_c);

    /* We generate the pairlist mainly based on bounding-box distances
     * and do atom pair distance based pruning on the GPU.
     * Only if a j-group contains a single cluster-pair, we try to prune
     * that pair based on atom distances on the CPU to avoid empty j-groups.
     */
#define PRUNE_LIST_CPU_ONE 1
#define PRUNE_LIST_CPU_ALL 0

#if PRUNE_LIST_CPU_ONE
    int    ci_last = -1;
#endif

    float *d2l = work.distanceBuffer.data();

    for (int subc = 0; subc < jGrid.nsubc[scj]; subc++)
    {
        const int    cj4_ind   = work.cj_ind/c_nbnxnGpuJgroupSize;
        const int    cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
        const int    cj        = scj*c_gpuNumClusterPerCell + subc;

        const int    cj_gl     = jGrid.cell0*c_gpuNumClusterPerCell + cj;

        int          ci1;
        if (excludeSubDiagonal && sci == scj)
        {
            ci1 = subc + 1;
        }
        else
        {
            ci1 = iGrid.nsubc[sci];
        }

#if NBNXN_BBXXXX
        /* Determine all ci1 bb distances in one call with SIMD4 */
        subc_bb_dist2_simd4_xxxx(jGrid.pbb.data() + (cj >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + (cj & (STRIDE_PBB-1)),
                                 ci1, pbb_ci, d2l);
        *numDistanceChecks += c_nbnxnGpuClusterSize*2;
#endif

        int          npair = 0;
        unsigned int imask = 0;
        /* We use a fixed upper-bound instead of ci1 to help optimization */
        for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
        {
            if (ci == ci1)
            {
                break;
            }

#if !NBNXN_BBXXXX
            /* Determine the bb distance between ci and cj */
            d2l[ci]             = subc_bb_dist2(ci, bb_ci, cj, jGrid.bb);
            *numDistanceChecks += 2;
#endif
            float d2 = d2l[ci];

#if PRUNE_LIST_CPU_ALL
            /* Check if the distance is within the distance where
             * we use only the bounding box distance rbb,
             * or within the cut-off and there is at least one atom pair
             * within the cut-off. This check is very costly.
             */
            *numDistanceChecks += c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize;
            if (d2 < rbb2 ||
                (d2 < rlist2 &&
                 clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
#else
            /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
             */
            if (d2 < rlist2)
#endif
            {
                /* Flag this i-subcell to be taken into account */
                imask |= (1U << (cj_offset*c_gpuNumClusterPerCell + ci));

#if PRUNE_LIST_CPU_ONE
                ci_last = ci;
#endif

                npair++;
            }
        }

#if PRUNE_LIST_CPU_ONE
        /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we could get rid of it.
         */
        if (npair == 1 && d2l[ci_last] >= rbb2 &&
            !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
        {
            imask &= ~(1U << (cj_offset*c_gpuNumClusterPerCell + ci_last));
            npair--;
        }
#endif

        if (npair > 0)
        {
            /* We have at least one cluster pair: add a j-entry */
            if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
            {
                nbl->cj4.resize(nbl->cj4.size() + 1);
            }
            nbnxn_cj4_t *cj4 = &nbl->cj4[cj4_ind];

            cj4->cj[cj_offset] = cj_gl;

            /* Set the exclusions for the ci==sj entry.
             * Here we don't bother to check if this entry is actually flagged,
             * as it will nearly always be in the list.
             */
            if (excludeSubDiagonal && sci == scj)
            {
                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, subc);
            }

            /* Copy the cluster interaction mask to the list */
            for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
            {
                cj4->imei[w].imask |= imask;
            }

            nbl->work->cj_ind++;

            /* Keep the count */
            nbl->nci_tot += npair;

            /* Increase the closing index in i super-cell list */
            nbl->sci.back().cj4_ind_end =
                (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
        }
    }
}
/* Returns how many contiguous j-clusters we have starting in the i-list */
template <typename CjListType>
static int numContiguousJClusters(const int                       cjIndexStart,
                                  const int                       cjIndexEnd,
                                  gmx::ArrayRef<const CjListType> cjList)
{
    const int firstJCluster = nblCj(cjList, cjIndexStart);

    int       numContiguous = 0;

    while (cjIndexStart + numContiguous < cjIndexEnd &&
           nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
    {
        numContiguous++;
    }

    return numContiguous;
}
/*! \internal
 * \brief Helper struct for efficient searching for excluded atoms in a j-list
 */
struct JListRanges
{
    /*! \brief Constructs a j-list range from \p cjList with the given index range */
    template <typename CjListType>
    JListRanges(int                             cjIndexStart,
                int                             cjIndexEnd,
                gmx::ArrayRef<const CjListType> cjList);

    int cjIndexStart; //!< The start index in the j-list
    int cjIndexEnd;   //!< The end index in the j-list
    int cjFirst;      //!< The j-cluster with index cjIndexStart
    int cjLast;       //!< The j-cluster with index cjIndexEnd-1
    int numDirect;    //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
};

template <typename CjListType>
JListRanges::JListRanges(int                             cjIndexStart,
                         int                             cjIndexEnd,
                         gmx::ArrayRef<const CjListType> cjList) :
    cjIndexStart(cjIndexStart),
    cjIndexEnd(cjIndexEnd)
{
    GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");

    cjFirst   = nblCj(cjList, cjIndexStart);
    cjLast    = nblCj(cjList, cjIndexEnd - 1);

    /* Determine how many contiguous j-cells we have starting
     * from the first i-cell. This number can be used to directly
     * calculate j-cell indices for excluded atoms.
     */
    numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
}
/* Return the index of \p jCluster in the given range or -1 when not present
 *
 * Note: This code is executed very often and therefore performance is
 *       important. It should be inlined and fully optimized.
 */
template <typename CjListType>
static inline int
findJClusterInJList(int                             jCluster,
                    const JListRanges              &ranges,
                    gmx::ArrayRef<const CjListType> cjList)
{
    int index;

    if (jCluster < ranges.cjFirst + ranges.numDirect)
    {
        /* We can calculate the index directly using the offset */
        index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
    }
    else
    {
        /* Search for jCluster using bisection */
        index          = -1;
        int rangeStart = ranges.cjIndexStart + ranges.numDirect;
        int rangeEnd   = ranges.cjIndexEnd;
        int rangeMiddle;
        while (index == -1 && rangeStart < rangeEnd)
        {
            rangeMiddle = (rangeStart + rangeEnd) >> 1;

            const int clusterMiddle = nblCj(cjList, rangeMiddle);

            if (jCluster == clusterMiddle)
            {
                index = rangeMiddle;
            }
            else if (jCluster < clusterMiddle)
            {
                rangeEnd = rangeMiddle;
            }
            else
            {
                rangeStart = rangeMiddle + 1;
            }
        }
    }

    return index;
}
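/* Usage note: pair lists mostly store contiguous j-cluster ranges, so the
 * common case takes the O(1) direct-offset branch above; the bisection only
 * runs on the remaining tail, which is still in increasing cj order (the
 * property the bisection relies on), giving O(log n) in the worst case.
 */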
// TODO: Get rid of the two functions below by renaming sci to ci (or something better)

/* Return the i-entry in the list we are currently operating on */
static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
{
    return &nbl->ci.back();
}

/* Return the i-entry in the list we are currently operating on */
static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
{
    return &nbl->sci.back();
}
/* Set all atom-pair exclusions for a simple type list i-entry
 *
 * Set all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for simple list entry iEntry.
 */
static void
setExclusionsForIEntry(const nbnxn_search   *nbs,
                       NbnxnPairlistCpu     *nbl,
                       gmx_bool              diagRemoved,
                       int                   na_cj_2log,
                       const nbnxn_ci_t     &iEntry,
                       const t_blocka       &exclusions)
{
    if (iEntry.cj_ind_end == iEntry.cj_ind_start)
    {
        /* Empty list: no exclusions */
        return;
    }

    const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));

    const int                iCluster = iEntry.ci;

    gmx::ArrayRef<const int> cell = nbs->cell;

    /* Loop over the atoms in the i-cluster */
    for (int i = 0; i < nbl->na_ci; i++)
    {
        const int iIndex = iCluster*nbl->na_ci + i;
        const int iAtom  = nbs->a[iIndex];
        if (iAtom >= 0)
        {
            /* Loop over the topology-based exclusions for this i-atom */
            for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
            {
                const int jAtom = exclusions.a[exclIndex];

                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                if (diagRemoved && jIndex <= iIndex)
                {
                    continue;
                }

                const int jCluster = (jIndex >> na_cj_2log);

                /* Could this cluster be in our list? */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                        findJClusterInJList(jCluster, ranges,
                                            gmx::makeConstArrayRef(nbl->cj));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const int innerJ = jIndex - (jCluster << na_cj_2log);

                        nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
                    }
                }
            }
        }
    }
}
/* Add a new i-entry to the FEP list and copy the i-properties */
static inline void fep_list_new_nri_copy(t_nblist *nlist)
{
    /* Add a new i-entry */
    nlist->nri++;

    assert(nlist->nri < nlist->maxnri);

    /* Duplicate the last i-entry, except for jindex, which continues */
    nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri-1];
    nlist->shift[nlist->nri]  = nlist->shift[nlist->nri-1];
    nlist->gid[nlist->nri]    = nlist->gid[nlist->nri-1];
    nlist->jindex[nlist->nri] = nlist->nrj;
}
/* For load balancing of the free-energy lists over threads, we set
 * the maximum nrj size of an i-entry to 40. This leads to good
 * load balancing in the worst case scenario of a single perturbed
 * particle on 16 threads, while not introducing significant overhead.
 * Note that half of the perturbed pairs will anyhow end up in very small lists,
 * since non-perturbed i-particles will see few perturbed j-particles.
 */
const int max_nrj_fep = 40;
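/* Back-of-the-envelope illustration of the cap above: if a single perturbed
 * particle has, say, 640 j-partners within the list cut-off, the limit of 40
 * splits them into 640/40 = 16 i-entries, which can then be distributed over
 * 16 threads.
 */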
/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
 * singularities for overlapping particles (0/0), since the charges and
 * LJ parameters have been zeroed in the nbnxn data structure.
 * Simultaneously make a group pair list for the perturbed pairs.
 */
static void make_fep_list(const nbnxn_search     *nbs,
                          const nbnxn_atomdata_t *nbat,
                          NbnxnPairlistCpu       *nbl,
                          gmx_bool                bDiagRemoved,
                          nbnxn_ci_t             *nbl_ci,
                          real gmx_unused         shx,
                          real gmx_unused         shy,
                          real gmx_unused         shz,
                          real gmx_unused         rlist_fep2,
                          const nbnxn_grid_t     &iGrid,
                          const nbnxn_grid_t     &jGrid,
                          t_nblist               *nlist)
{
    int      ci, cj_ind_start, cj_ind_end, cja, cjr;
    int      nri_max;
    int      ngid, gid_i = 0, gid_j, gid;
    int      egp_shift, egp_mask;
    int      gid_cj = 0;
    int      ind_i, ind_j, ai, aj;
    int      nri;
    gmx_bool bFEP_i, bFEP_i_all;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_start = nbl_ci->cj_ind_start;
    cj_ind_end   = nbl_ci->cj_ind_end;

    /* In worst case we have alternating energy groups
     * and create #atom-pair lists, which means we need the size
     * of a cluster pair (na_ci*na_cj) times the number of cj's.
     */
    nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    const nbnxn_atomdata_t::Params &nbatParams = nbat->params();

    ngid = nbatParams.nenergrp;

    if (ngid*jGrid.na_cj > gmx::index(sizeof(gid_cj)*8))
    {
        gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
                  iGrid.na_c, jGrid.na_cj, (sizeof(gid_cj)*8)/jGrid.na_cj);
    }

    egp_shift = nbatParams.neg_2log;
    egp_mask  = (1 << egp_shift) - 1;

    bFEP_i_all = TRUE;
    /* Loop over the atoms in the i sub-cell */
    for (int i = 0; i < nbl->na_ci; i++)
    {
        ind_i = ci*nbl->na_ci + i;
        ai    = nbs->a[ind_i];
        if (ai >= 0)
        {
            nri                  = nlist->nri;
            nlist->jindex[nri+1] = nlist->jindex[nri];
            nlist->iinr[nri]     = ai;
            /* The actual energy group pair index is set later */
            nlist->gid[nri]      = 0;
            nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;

            bFEP_i = ((iGrid.fep[ci - iGrid.cell0] & (1 << i)) != 0u);

            bFEP_i_all = bFEP_i_all && bFEP_i;

            if (nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
            {
                nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj);
                srenew(nlist->jjnr, nlist->maxnrj);
                srenew(nlist->excl_fep, nlist->maxnrj);
            }

            if (ngid > 1)
            {
                gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask;
            }

            for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
            {
                unsigned int fep_cj;

                cja = nbl->cj[cj_ind].cj;

                if (jGrid.na_cj == jGrid.na_c)
                {
                    cjr    = cja - jGrid.cell0;
                    fep_cj = jGrid.fep[cjr];
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja];
                    }
                }
                else if (2*jGrid.na_cj == jGrid.na_c)
                {
                    cjr    = cja - jGrid.cell0*2;
                    /* Extract half of the ci fep/energrp mask */
                    fep_cj = (jGrid.fep[cjr>>1] >> ((cjr&1)*jGrid.na_cj)) & ((1<<jGrid.na_cj) - 1);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja>>1] >> ((cja&1)*jGrid.na_cj*egp_shift) & ((1<<(jGrid.na_cj*egp_shift)) - 1);
                    }
                }
                else
                {
                    cjr    = cja - (jGrid.cell0>>1);
                    /* Combine two ci fep masks/energrp */
                    fep_cj = jGrid.fep[cjr*2] + (jGrid.fep[cjr*2+1] << jGrid.na_c);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.na_c*egp_shift));
                    }
                }

                if (bFEP_i || fep_cj != 0)
                {
                    for (int j = 0; j < nbl->na_cj; j++)
                    {
                        /* Is this interaction perturbed and not excluded? */
                        ind_j = cja*nbl->na_cj + j;
                        aj    = nbs->a[ind_j];
                        if (aj >= 0 &&
                            (bFEP_i || (fep_cj & (1 << j))) &&
                            (!bDiagRemoved || ind_j >= ind_i))
                        {
                            if (ngid > 1)
                            {
                                gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
                                gid   = GID(gid_i, gid_j, ngid);

                                if (nlist->nrj > nlist->jindex[nri] &&
                                    nlist->gid[nri] != gid)
                                {
                                    /* Energy group pair changed: new list */
                                    fep_list_new_nri_copy(nlist);
                                    nri = nlist->nri;
                                }
                                nlist->gid[nri] = gid;
                            }

                            if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                            {
                                fep_list_new_nri_copy(nlist);
                                nri = nlist->nri;
                            }

                            /* Add it to the FEP list */
                            nlist->jjnr[nlist->nrj]     = aj;
                            nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
                            nlist->nrj++;

                            /* Exclude it from the normal list.
                             * Note that the charge has been set to zero,
                             * but we need to avoid 0/0, as perturbed atoms
                             * can be on top of each other.
                             */
                            nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
                        }
                    }
                }
            }

            if (nlist->nrj > nlist->jindex[nri])
            {
                /* Actually add this new, non-empty, list */
                nlist->nri++;
                nlist->jindex[nlist->nri] = nlist->nrj;
            }
        }
    }

    if (bFEP_i_all)
    {
        /* All interactions are perturbed, we can skip this entry */
        nbl_ci->cj_ind_end = cj_ind_start;
        nbl->ncjInUse     -= cj_ind_end - cj_ind_start;
    }
}
/* Return the index of j-cluster cj within its cj4 group */
static inline int cj_mod_cj4(int cj)
{
    return cj & (c_nbnxnGpuJgroupSize - 1);
}

/* Convert a j-cluster to a cj4 group */
static inline int cj_to_cj4(int cj)
{
    return cj/c_nbnxnGpuJgroupSize;
}

/* Return the index of a j-atom within a warp */
static inline int a_mod_wj(int a)
{
    return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
}
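/* Example with the typical values c_nbnxnGpuJgroupSize == 4,
 * c_nbnxnGpuClusterSize == 8 and c_nbnxnGpuClusterpairSplit == 2:
 * j-cluster 11 lives in cj4 group cj_to_cj4(11) == 2 at slot cj_mod_cj4(11) == 3,
 * and j-atom 13 maps to a_mod_wj(13) == 13 & 3 == 1 within its warp half.
 */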
/* As make_fep_list above, but for super/sub lists. */
static void make_fep_list(const nbnxn_search     *nbs,
                          const nbnxn_atomdata_t *nbat,
                          NbnxnPairlistGpu       *nbl,
                          gmx_bool                bDiagRemoved,
                          const nbnxn_sci_t      *nbl_sci,
                          real                    shx,
                          real                    shy,
                          real                    shz,
                          real                    rlist_fep2,
                          const nbnxn_grid_t     &iGrid,
                          const nbnxn_grid_t     &jGrid,
                          t_nblist               *nlist)
{
    int                nri_max;
    int                c_abs;
    int                ind_i, ind_j, ai, aj;
    int                nri;
    gmx_bool           bFEP_i;
    real               xi, yi, zi;
    const nbnxn_cj4_t *cj4;

    const int numJClusterGroups = nbl_sci->numJClusterGroups();
    if (numJClusterGroups == 0)
    {
        /* Empty list */
        return;
    }

    const int sci           = nbl_sci->sci;

    const int cj4_ind_start = nbl_sci->cj4_ind_start;
    const int cj4_ind_end   = nbl_sci->cj4_ind_end;

    /* Here we process one super-cell, max #atoms na_sc, versus a list
     * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
     * of size na_cj atoms.
     * On the GPU we don't support energy groups (yet).
     * So for each of the na_sc i-atoms, we need max one FEP list
     * for each max_nrj_fep j-atoms.
     */
    nri_max = nbl->na_sc*nbl->na_cj*(1 + (numJClusterGroups*c_nbnxnGpuJgroupSize)/max_nrj_fep);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    /* Loop over the atoms in the i super-cluster */
    for (int c = 0; c < c_gpuNumClusterPerCell; c++)
    {
        c_abs = sci*c_gpuNumClusterPerCell + c;

        for (int i = 0; i < nbl->na_ci; i++)
        {
            ind_i = c_abs*nbl->na_ci + i;
            ai    = nbs->a[ind_i];
            if (ai >= 0)
            {
                nri                  = nlist->nri;
                nlist->jindex[nri+1] = nlist->jindex[nri];
                nlist->iinr[nri]     = ai;
                /* With GPUs, energy groups are not supported */
                nlist->gid[nri]      = 0;
                nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;

                bFEP_i = ((iGrid.fep[c_abs - iGrid.cell0*c_gpuNumClusterPerCell] & (1 << i)) != 0u);

                xi = nbat->x()[ind_i*nbat->xstride+XX] + shx;
                yi = nbat->x()[ind_i*nbat->xstride+YY] + shy;
                zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz;

                const int nrjMax = nlist->nrj + numJClusterGroups*c_nbnxnGpuJgroupSize*nbl->na_cj;
                if (nrjMax > nlist->maxnrj)
                {
                    nlist->maxnrj = over_alloc_small(nrjMax);
                    srenew(nlist->jjnr, nlist->maxnrj);
                    srenew(nlist->excl_fep, nlist->maxnrj);
                }

                for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
                {
                    cj4 = &nbl->cj4[cj4_ind];

                    for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
                    {
                        unsigned int fep_cj;

                        if ((cj4->imei[0].imask & (1U << (gcj*c_gpuNumClusterPerCell + c))) == 0)
                        {
                            /* Skip this ci for this cj */
                            continue;
                        }

                        const int cjr =
                            cj4->cj[gcj] - jGrid.cell0*c_gpuNumClusterPerCell;

                        fep_cj = jGrid.fep[cjr];

                        if (bFEP_i || fep_cj != 0)
                        {
                            for (int j = 0; j < nbl->na_cj; j++)
                            {
                                /* Is this interaction perturbed and not excluded? */
                                ind_j = (jGrid.cell0*c_gpuNumClusterPerCell + cjr)*nbl->na_cj + j;
                                aj    = nbs->a[ind_j];
                                if (aj >= 0 &&
                                    (bFEP_i || (fep_cj & (1 << j))) &&
                                    (!bDiagRemoved || ind_j >= ind_i))
                                {
                                    int           excl_pair;
                                    unsigned int  excl_bit;
                                    real          dx, dy, dz;

                                    const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
                                    nbnxn_excl_t *excl  =
                                        get_exclusion_mask(nbl, cj4_ind, jHalf);

                                    excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                    excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));

                                    dx = nbat->x()[ind_j*nbat->xstride+XX] - xi;
                                    dy = nbat->x()[ind_j*nbat->xstride+YY] - yi;
                                    dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi;

                                    /* The unpruned GPU list has more than 2/3
                                     * of the atom pairs beyond rlist. Using
                                     * this list will cause a lot of overhead
                                     * in the CPU FEP kernels, especially
                                     * relative to the fast GPU kernels.
                                     * So we prune the FEP list here.
                                     */
                                    if (dx*dx + dy*dy + dz*dz < rlist_fep2)
                                    {
                                        if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                                        {
                                            fep_list_new_nri_copy(nlist);
                                            nri = nlist->nri;
                                        }

                                        /* Add it to the FEP list */
                                        nlist->jjnr[nlist->nrj]     = aj;
                                        nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
                                        nlist->nrj++;
                                    }

                                    /* Exclude it from the normal list.
                                     * Note that the charge and LJ parameters have
                                     * been set to zero, but we need to avoid 0/0,
                                     * as perturbed atoms can be on top of each other.
                                     */
                                    excl->pair[excl_pair] &= ~excl_bit;
                                }
                            }

                            /* Note that we could mask out this pair in imask
                             * if all i- and/or all j-particles are perturbed.
                             * But since the perturbed pairs on the CPU will
                             * take an order of magnitude more time, the GPU
                             * will finish before the CPU and there is no gain.
                             */
                        }
                    }
                }

                if (nlist->nrj > nlist->jindex[nri])
                {
                    /* Actually add this new, non-empty, list */
                    nlist->nri++;
                    nlist->jindex[nlist->nri] = nlist->nrj;
                }
            }
        }
    }
}
/* Set all atom-pair exclusions for a GPU type list i-entry
 *
 * Sets all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for i-super-cluster list entry iEntry.
 */
static void
setExclusionsForIEntry(const nbnxn_search   *nbs,
                       NbnxnPairlistGpu     *nbl,
                       gmx_bool              diagRemoved,
                       int gmx_unused        na_cj_2log,
                       const nbnxn_sci_t    &iEntry,
                       const t_blocka       &exclusions)
{
    if (iEntry.numJClusterGroups() == 0)
    {
        /* Empty list */
        return;
    }

    /* Set the search ranges using start and end j-cluster indices.
     * Note that here we can not use cj4_ind_end, since the last cj4
     * can be only partially filled, so we use cj_ind.
     */
    const JListRanges        ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
                                    nbl->work->cj_ind,
                                    gmx::makeConstArrayRef(nbl->cj4));

    GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
    constexpr int            c_clusterSize      = c_nbnxnGpuClusterSize;
    constexpr int            c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;

    const int                iSuperCluster = iEntry.sci;

    gmx::ArrayRef<const int> cell = nbs->cell;

    /* Loop over the atoms in the i super-cluster */
    for (int i = 0; i < c_superClusterSize; i++)
    {
        const int iIndex = iSuperCluster*c_superClusterSize + i;
        const int iAtom  = nbs->a[iIndex];
        if (iAtom >= 0)
        {
            const int iCluster = i/c_clusterSize;

            /* Loop over the topology-based exclusions for this i-atom */
            for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
            {
                const int jAtom = exclusions.a[exclIndex];

                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                /* NOTE: We would like to use iIndex on the right hand side,
                 * but that makes this routine 25% slower with gcc6/7.
                 * Even using c_superClusterSize makes it slower.
                 * Either of these changes triggers peeling of the exclIndex
                 * loop, which apparently leads to far less efficient code.
                 */
                if (diagRemoved && jIndex <= iSuperCluster*nbl->na_sc + i)
                {
                    continue;
                }

                const int jCluster = jIndex/c_clusterSize;

                /* Check whether the cluster is in our list */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                        findJClusterInJList(jCluster, ranges,
                                            gmx::makeConstArrayRef(nbl->cj4));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const unsigned int pairMask = (1U << (cj_mod_cj4(index)*c_gpuNumClusterPerCell + iCluster));
                        /* Check if the i-cluster interacts with the j-cluster */
                        if (nbl_imask0(nbl, index) & pairMask)
                        {
                            const int innerI = (i      & (c_clusterSize - 1));
                            const int innerJ = (jIndex & (c_clusterSize - 1));

                            /* Determine which j-half (CUDA warp) we are in */
                            const int     jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);

                            nbnxn_excl_t *interactionMask =
                                get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);

                            interactionMask->pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
                        }
                    }
                }
            }
        }
    }
}
;
2083 /* Make a new ci entry at the back of nbl->ci */
2084 static void addNewIEntry(NbnxnPairlistCpu
*nbl
, int ci
, int shift
, int flags
)
2088 ciEntry
.shift
= shift
;
2089 /* Store the interaction flags along with the shift */
2090 ciEntry
.shift
|= flags
;
2091 ciEntry
.cj_ind_start
= nbl
->cj
.size();
2092 ciEntry
.cj_ind_end
= nbl
->cj
.size();
2093 nbl
->ci
.push_back(ciEntry
);
2096 /* Make a new sci entry at index nbl->nsci */
2097 static void addNewIEntry(NbnxnPairlistGpu
*nbl
, int sci
, int shift
, int gmx_unused flags
)
2099 nbnxn_sci_t sciEntry
;
2101 sciEntry
.shift
= shift
;
2102 sciEntry
.cj4_ind_start
= nbl
->cj4
.size();
2103 sciEntry
.cj4_ind_end
= nbl
->cj4
.size();
2105 nbl
->sci
.push_back(sciEntry
);
2108 /* Sort the simple j-list cj on exclusions.
2109 * Entries with exclusions will all be sorted to the beginning of the list.
2111 static void sort_cj_excl(nbnxn_cj_t
*cj
, int ncj
,
2112 NbnxnPairlistCpuWork
*work
)
2114 work
->cj
.resize(ncj
);
2116 /* Make a list of the j-cells involving exclusions */
2118 for (int j
= 0; j
< ncj
; j
++)
2120 if (cj
[j
].excl
!= NBNXN_INTERACTION_MASK_ALL
)
2122 work
->cj
[jnew
++] = cj
[j
];
2125 /* Check if there are exclusions at all or not just the first entry */
2126 if (!((jnew
== 0) ||
2127 (jnew
== 1 && cj
[0].excl
!= NBNXN_INTERACTION_MASK_ALL
)))
2129 for (int j
= 0; j
< ncj
; j
++)
2131 if (cj
[j
].excl
== NBNXN_INTERACTION_MASK_ALL
)
2133 work
->cj
[jnew
++] = cj
[j
];
2136 for (int j
= 0; j
< ncj
; j
++)
2138 cj
[j
] = work
->cj
[j
];
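/* Illustration (a sketch with assumed entries, not from the source): for a
 * j-list {a*, b, c*, d}, where * marks entries with exclusions, the first
 * pass gathers {a*, c*} into work->cj and the second pass appends {b, d},
 * so the list becomes {a*, c*, b, d}: a stable partition in two passes.
 */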
/* Close this simple list i entry */
static void closeIEntry(NbnxnPairlistCpu    *nbl,
                        int gmx_unused       sp_max_av,
                        gmx_bool gmx_unused  progBal,
                        float gmx_unused     nsp_tot_est,
                        int gmx_unused       thread,
                        int gmx_unused       nthread)
{
    nbnxn_ci_t &ciEntry = nbl->ci.back();

    /* All content of the new ci entry has already been filled correctly;
     * we only need to sort and increase counts, or remove the entry when empty.
     */
    const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
    if (jlen > 0)
    {
        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);

        /* The counts below are used for non-bonded pair/flop counts
         * and should therefore match the available kernel setups.
         */
        if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
        {
            nbl->work->ncj_noq += jlen;
        }
        else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) ||
                 !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
        {
            nbl->work->ncj_hlj += jlen;
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->ci.pop_back();
    }
}
/* Split sci entry for load balancing on the GPU.
 * Splitting ensures we have enough lists to fully utilize the whole GPU.
 * With progBal we generate progressively smaller lists, which improves
 * load balancing. As we only know the current count on our own thread,
 * we will need to estimate the current total amount of i-entries.
 * As the lists get concatenated later, this estimate depends
 * both on nthread and our own thread index.
 */
static void split_sci_entry(NbnxnPairlistGpu *nbl,
                            int nsp_target_av,
                            gmx_bool progBal, float nsp_tot_est,
                            int thread, int nthread)
{
    int nsp_max;

    if (progBal)
    {
        /* Estimate the total number of ci's of the nblist combined
         * over all threads, using the target number of ci's.
         */
        float nsp_est = (nsp_tot_est*thread)/nthread + nbl->nci_tot;

        /* The first ci blocks should be larger, to avoid overhead.
         * The last ci blocks should be smaller, to improve load balancing.
         * The factor 3/2 makes the first block 3/2 times the target average
         * and ensures that the total number of blocks ends up equal to
         * that of equally sized blocks of size nsp_target_av.
         */
        nsp_max = static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
    }
    else
    {
        nsp_max = nsp_target_av;
    }

    const int cj4_start = nbl->sci.back().cj4_ind_start;
    const int cj4_end   = nbl->sci.back().cj4_ind_end;
    const int j4len     = cj4_end - cj4_start;

    if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
    {
        /* Modify the last ci entry and process the cj4's again */

        int nsp       = 0;
        int nsp_sci   = 0;
        int nsp_cj4_e = 0;
        int nsp_cj4   = 0;
        for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
        {
            int nsp_cj4_p = nsp_cj4;
            /* Count the number of cluster pairs in this cj4 group */
            nsp_cj4 = 0;
            for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
            {
                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
            }

            /* If adding the current cj4 with nsp_cj4 pairs gets us further
             * away from our target nsp_max, split the list before this cj4.
             */
            if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
            {
                /* Split the list at cj4 */
                nbl->sci.back().cj4_ind_end = cj4;
                /* Create a new sci entry */
                nbnxn_sci_t sciNew;
                sciNew.sci           = nbl->sci.back().sci;
                sciNew.shift         = nbl->sci.back().shift;
                sciNew.cj4_ind_start = cj4;
                nbl->sci.push_back(sciNew);

                nsp_sci   = nsp;
                nsp_cj4_e = nsp_cj4_p;
                nsp       = 0;
            }
            nsp += nsp_cj4;
        }

        /* Put the remaining cj4's in the last sci entry */
        nbl->sci.back().cj4_ind_end = cj4_end;

        /* Possibly balance out the last two sci's
         * by moving the last cj4 of the second last sci.
         */
        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
        {
            GMX_ASSERT(nbl->sci.size() >= 2, "We expect at least two elements");
            nbl->sci[nbl->sci.size() - 2].cj4_ind_end--;
            nbl->sci[nbl->sci.size() - 1].cj4_ind_start--;
        }
    }
}
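/* Worked example for the progressive-balancing formula above (a sketch with
 * assumed numbers): with nsp_target_av = 100 and nsp_tot_est = 10000, a
 * thread near the start of the combined list (nsp_est ~ 0) gets
 * nsp_max ~ 100*1.5 = 150, while a thread near the end
 * (nsp_est ~ nsp_tot_est) gets nsp_max ~ 100*15000/20000 = 75. Since the
 * integral of 1.5/(1 + x) over [0, 1] is 1.5*ln(2) ~ 1.04, the average
 * block size stays close to nsp_target_av.
 */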
/* Close this super/sub list i entry */
static void closeIEntry(NbnxnPairlistGpu *nbl,
                        int nsp_max_av,
                        gmx_bool progBal, float nsp_tot_est,
                        int thread, int nthread)
{
    nbnxn_sci_t &sciEntry = *getOpenIEntry(nbl);

    /* All content of the new sci entry has already been filled correctly;
     * we only need to, potentially, split or remove the entry when empty.
     */
    int j4len = sciEntry.numJClusterGroups();
    if (j4len > 0)
    {
        /* We can only have complete blocks of 4 j-entries in a list,
         * so round the count up before closing.
         */
        int ncj4          = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
        nbl->work->cj_ind = ncj4*c_nbnxnGpuJgroupSize;

        if (nsp_max_av > 0)
        {
            /* Measure the size of the new entry and potentially split it */
            split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est,
                            thread, nthread);
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->sci.pop_back();
    }
}
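/* Rounding example for the block above (a sketch with assumed numbers):
 * with c_nbnxnGpuJgroupSize == 4 and work->cj_ind == 10, ncj4 becomes
 * (10 + 3)/4 = 3 and cj_ind is rounded up to 12, i.e. three complete
 * j-groups of four.
 */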
/* Syncs the working array before adding another grid pair to the list;
 * this is a no-op for the CPU list.
 */
static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
{
}

/* Syncs the working array before adding another grid pair to the GPU list */
static void sync_work(NbnxnPairlistGpu *nbl)
{
    nbl->work->cj_ind = nbl->cj4.size()*c_nbnxnGpuJgroupSize;
}
/* Clears an NbnxnPairlistCpu data structure */
static void clear_pairlist(NbnxnPairlistCpu *nbl)
{
    nbl->ci.clear();
    nbl->cj.clear();
    nbl->ncjInUse = 0;
    nbl->nci_tot  = 0;
    nbl->ciOuter.clear();
    nbl->cjOuter.clear();

    nbl->work->ncj_noq = 0;
    nbl->work->ncj_hlj = 0;
}

/* Clears an NbnxnPairlistGpu data structure */
static void clear_pairlist(NbnxnPairlistGpu *nbl)
{
    nbl->sci.clear();
    nbl->cj4.clear();
    /* Keep entry 0 of excl: it is the default, all-interactions mask */
    nbl->excl.resize(1);
    nbl->nci_tot = 0;
}
/* Clears a group scheme pair list */
static void clear_pairlist_fep(t_nblist *nl)
{
    nl->nri = 0;
    nl->nrj = 0;
    if (nl->jindex == nullptr)
    {
        snew(nl->jindex, 1);
    }
    nl->jindex[0] = 0;
}
/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void set_icell_bb_simple(gmx::ArrayRef<const nbnxn_bb_t> bb,
                                       int ci,
                                       real shx, real shy, real shz,
                                       nbnxn_bb_t *bb_ci)
{
    bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
    bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
    bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
    bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
    bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
    bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
}
/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void set_icell_bb(const nbnxn_grid_t &iGrid,
                                int ci,
                                real shx, real shy, real shz,
                                NbnxnPairlistCpuWork *work)
{
    set_icell_bb_simple(iGrid.bb, ci, shx, shy, shz, &work->iClusterData.bb[0]);
}
/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
                                      int ci,
                                      real shx, real shy, real shz,
                                      float *bb_ci)
{
    int ia = ci*(c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX;
    for (int m = 0; m < (c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
    {
        for (int i = 0; i < STRIDE_PBB; i++)
        {
            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
        }
    }
}
/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const nbnxn_bb_t> bb,
                                             int ci,
                                             real shx, real shy, real shz,
                                             nbnxn_bb_t *bb_ci)
{
    for (int i = 0; i < c_gpuNumClusterPerCell; i++)
    {
        set_icell_bb_simple(bb, ci*c_gpuNumClusterPerCell+i,
                            shx, shy, shz,
                            &bb_ci[i]);
    }
}
/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
gmx_unused static void set_icell_bb(const nbnxn_grid_t &iGrid,
                                    int ci,
                                    real shx, real shy, real shz,
                                    NbnxnPairlistGpuWork *work)
{
#if NBNXN_BBXXXX
    set_icell_bbxxxx_supersub(iGrid.pbb, ci, shx, shy, shz,
                              work->iSuperClusterData.bbPacked.data());
#else
    set_icell_bb_supersub(iGrid.bb, ci, shx, shy, shz,
                          work->iSuperClusterData.bb.data());
#endif
}
/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
static void icell_set_x_simple(int ci,
                               real shx, real shy, real shz,
                               int stride, const real *x,
                               NbnxnPairlistCpuWork::IClusterData *iClusterData)
{
    const int ia = ci*c_nbnxnCpuIClusterSize;

    for (int i = 0; i < c_nbnxnCpuIClusterSize; i++)
    {
        iClusterData->x[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
        iClusterData->x[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
        iClusterData->x[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
    }
}
static void icell_set_x(int ci,
                        real shx, real shy, real shz,
                        int stride, const real *x,
                        const Nbnxm::KernelType kernelType,
                        NbnxnPairlistCpuWork *work)
{
    switch (kernelType)
    {
#ifdef GMX_NBNXN_SIMD_4XN
        case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
            icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work);
            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
        case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
            icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work);
            break;
#endif
        case Nbnxm::KernelType::Cpu4x4_PlainC:
            icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData);
            break;
        default:
            GMX_ASSERT(false, "Unhandled case");
            break;
    }
}
/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
static void icell_set_x(int ci,
                        real shx, real shy, real shz,
                        int stride, const real *x,
                        Nbnxm::KernelType gmx_unused kernelType,
                        NbnxnPairlistGpuWork *work)
{
#if !GMX_SIMD4_HAVE_REAL

    real * x_ci = work->iSuperClusterData.x.data();

    int    ia   = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
    for (int i = 0; i < c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize; i++)
    {
        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
    }

#else /* !GMX_SIMD4_HAVE_REAL */

    real * x_ci = work->iSuperClusterData.xSimd.data();

    for (int si = 0; si < c_gpuNumClusterPerCell; si++)
    {
        for (int i = 0; i < c_nbnxnGpuClusterSize; i += GMX_SIMD4_WIDTH)
        {
            int io = si*c_nbnxnGpuClusterSize + i;
            int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize + io;
            for (int j = 0; j < GMX_SIMD4_WIDTH; j++)
            {
                x_ci[io*DIM + j + XX*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + XX] + shx;
                x_ci[io*DIM + j + YY*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + YY] + shy;
                x_ci[io*DIM + j + ZZ*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + ZZ] + shz;
            }
        }
    }

#endif /* !GMX_SIMD4_HAVE_REAL */
}
static real minimum_subgrid_size_xy(const nbnxn_grid_t &grid)
{
    if (grid.bSimple)
    {
        return std::min(grid.cellSize[XX], grid.cellSize[YY]);
    }
    else
    {
        return std::min(grid.cellSize[XX]/c_gpuNumClusterPerCellX,
                        grid.cellSize[YY]/c_gpuNumClusterPerCellY);
    }
}
static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t &iGrid,
                                        const nbnxn_grid_t &jGrid)
{
    const real eff_1x1_buffer_fac_overest = 0.1;

    /* Determine an atom-pair list cut-off buffer size for atom pairs,
     * to be added to rlist (including buffer) used for MxN.
     * This is for converting an MxN list to a 1x1 list. This means we can't
     * use the normal buffer estimate, as we have an MxN list in which
     * some atom pairs beyond rlist are missing. We want to capture
     * the beneficial effect of buffering by extra pairs just outside rlist,
     * while removing the useless pairs that are further away from rlist.
     * (Also the buffer could have been set manually not using the estimate.)
     * This buffer size is an overestimate.
     * We add 10% of the smallest grid sub-cell dimensions.
     * Note that the z-size differs per cell and we don't use this,
     * so we overestimate.
     * With PME, the 10% value gives a buffer that is somewhat larger
     * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
     * Smaller tolerances or using RF lead to a smaller effective buffer,
     * so 10% gives a safe overestimate.
     */
    return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(iGrid) +
                                       minimum_subgrid_size_xy(jGrid));
}
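/* Numeric illustration (a sketch with assumed grid dimensions): with
 * minimum sub-cell x/y sizes of 0.4 nm for iGrid and 0.3 nm for jGrid,
 * the extra 1x1 pair-list buffer is 0.1*(0.4 + 0.3) = 0.07 nm on top
 * of rlist.
 */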
/* Estimates the interaction volume^2 for non-local interactions */
static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
{
    real vol2_est_tot = 0;

    /* Here we simply add up the interaction volume^2 of the 1, 2 or 3
     * non-home zones of a 1D decomposition. As these volumes are not
     * additive, this is an overestimate, but it would only be significant
     * in the limit of small cells, where we anyhow need to split the lists
     * into as small parts as possible.
     */
    for (int z = 0; z < zones->n; z++)
    {
        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
        {
            real cl = 0;
            real ca = 1;
            real za = 1;
            for (int d = 0; d < DIM; d++)
            {
                if (zones->shift[z][d] == 0)
                {
                    cl += 0.5*ls[d];
                    ca *= ls[d];
                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
                }
            }

            /* 4 octants of a sphere */
            real vold_est  = 0.25*M_PI*r*r*r*r;
            /* 4 quarter pie slices on the edges */
            vold_est      += 4*cl*M_PI/6.0*r*r*r;
            /* One rectangular volume on a face */
            vold_est      += ca*0.5*r*r;

            vol2_est_tot  += vold_est*za;
        }
    }

    return vol2_est_tot;
}
/* Estimates the average size of a full j-list for super/sub setup */
static void get_nsubpair_target(const nbnxn_search        *nbs,
                                const InteractionLocality  iloc,
                                const real                 rlist,
                                const int                  min_ci_balanced,
                                int                       *nsubpair_target,
                                float                     *nsubpair_tot_est)
{
    /* The target value of 36 seems to be the optimum for Kepler.
     * Maxwell is less sensitive to the exact value.
     */
    const int           nsubpair_target_min = 36;
    real                r_eff_sup, vol_est, nsp_est, nsp_est_nl;

    const nbnxn_grid_t &grid = nbs->grid[0];

    /* We don't need to balance list sizes if:
     * - We didn't request balancing.
     * - The number of grid cells >= the number of lists requested,
     *   since we will always generate at least #cells lists.
     * - We don't have any cells, since then there won't be any lists.
     */
    if (min_ci_balanced <= 0 || grid.nc >= min_ci_balanced || grid.nc == 0)
    {
        /* nsubpair_target==0 signals no balancing */
        *nsubpair_target  = 0;
        *nsubpair_tot_est = 0;

        return;
    }

    rvec ls;
    ls[XX] = (grid.c1[XX] - grid.c0[XX])/(grid.numCells[XX]*c_gpuNumClusterPerCellX);
    ls[YY] = (grid.c1[YY] - grid.c0[YY])/(grid.numCells[YY]*c_gpuNumClusterPerCellY);
    ls[ZZ] = grid.na_c/(grid.atom_density*ls[XX]*ls[YY]);

    /* The formulas below are a heuristic estimate of the average nsj per si */
    r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(grid.na_c, ls);

    if (!nbs->DomDec || nbs->zones->n == 1)
    {
        nsp_est_nl = 0;
    }
    else
    {
        nsp_est_nl =
            gmx::square(grid.atom_density/grid.na_c)*
            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
    }

    if (iloc == InteractionLocality::Local)
    {
        /* Sub-cell interacts with itself */
        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
        /* 6/2 rectangular volume on the faces */
        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
        /* 12/2 quarter pie slices on the edges */
        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*gmx::square(r_eff_sup);
        /* 4 octants of a sphere */
        vol_est += 0.5*4.0/3.0*M_PI*gmx::power3(r_eff_sup);

        /* Estimate the number of cluster pairs as the local number of
         * clusters times the volume they interact with times the density.
         */
        nsp_est = grid.nsubc_tot*vol_est*grid.atom_density/grid.na_c;

        /* Subtract the non-local pair count */
        nsp_est -= nsp_est_nl;

        /* For small cut-offs nsp_est will be an underestimate.
         * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
         * So to avoid too small or negative nsp_est we set a minimum of
         * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
         * This might be a slight overestimate for small non-periodic groups of
         * atoms as will occur for a local domain with DD, but for small
         * groups of atoms we'll anyhow be limited by nsubpair_target_min,
         * so this overestimation will not matter.
         */
        nsp_est = std::max(nsp_est, grid.nsubc_tot*14._real);

        if (debug)
        {
            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
                    nsp_est, nsp_est_nl);
        }
    }
    else
    {
        nsp_est = nsp_est_nl;
    }

    /* Thus the (average) maximum j-list size should be as follows.
     * Since there is overhead, we shouldn't make the lists too small
     * (and we can't chop up j-groups) so we use a minimum target size of 36.
     */
    *nsubpair_target  = std::max(nsubpair_target_min,
                                 roundToInt(nsp_est/min_ci_balanced));
    *nsubpair_tot_est = static_cast<int>(nsp_est);

    if (debug)
    {
        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_target %d\n",
                nsp_est, *nsubpair_target);
    }
}
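/* Numeric illustration (a sketch with assumed numbers): with
 * nsp_est = 50000 cluster pairs and min_ci_balanced = 1000,
 * *nsubpair_target = max(36, 50) = 50; with min_ci_balanced = 2000 the
 * minimum kicks in, since 50000/2000 = 25 < 36.
 */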
/* Debug list print function */
static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
{
    for (const nbnxn_ci_t &ciEntry : nbl->ci)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                ciEntry.ci, ciEntry.shift,
                ciEntry.cj_ind_end - ciEntry.cj_ind_start);

        for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
        {
            fprintf(fp, "  cj %5d  imask %x\n",
                    nbl->cj[j].cj,
                    nbl->cj[j].excl);
        }
    }
}
/* Debug list print function */
static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
{
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                sci.sci, sci.shift,
                sci.numJClusterGroups());

        int ncp = 0;
        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
        {
            for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
            {
                fprintf(fp, "  sj %5d  imask %x\n",
                        nbl->cj4[j4].cj[j],
                        nbl->cj4[j4].imei[0].imask);
                for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                    {
                        ncp++;
                    }
                }
            }
        }
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
                sci.sci, sci.shift,
                sci.numJClusterGroups(),
                ncp);
    }
}
/* Combine pair lists *nbl generated on multiple threads nblc */
static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
                            NbnxnPairlistGpu *nblc)
{
    int nsci  = nblc->sci.size();
    int ncj4  = nblc->cj4.size();
    int nexcl = nblc->excl.size();
    for (int i = 0; i < nnbl; i++)
    {
        nsci  += nbl[i]->sci.size();
        ncj4  += nbl[i]->cj4.size();
        nexcl += nbl[i]->excl.size();
    }

    /* Resize with the final, combined size, so we can fill in parallel */
    /* NOTE: For better performance we should use default initialization */
    nblc->sci.resize(nsci);
    nblc->cj4.resize(ncj4);
    nblc->excl.resize(nexcl);

    /* Each thread should copy its own data to the combined arrays,
     * as otherwise data will go back and forth between different caches.
     */
#if GMX_OPENMP && !(defined __clang_analyzer__)
    int nthreads = gmx_omp_nthreads_get(emntPairsearch);
#endif

#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int n = 0; n < nnbl; n++)
    {
        try
        {
            /* Determine the offset in the combined data for our thread.
             * Note that the original sizes in nblc are lost.
             */
            int sci_offset  = nsci;
            int cj4_offset  = ncj4;
            int excl_offset = nexcl;

            for (int i = n; i < nnbl; i++)
            {
                sci_offset  -= nbl[i]->sci.size();
                cj4_offset  -= nbl[i]->cj4.size();
                excl_offset -= nbl[i]->excl.size();
            }

            const NbnxnPairlistGpu &nbli = *nbl[n];

            for (size_t i = 0; i < nbli.sci.size(); i++)
            {
                nblc->sci[sci_offset + i]                = nbli.sci[i];
                nblc->sci[sci_offset + i].cj4_ind_start += cj4_offset;
                nblc->sci[sci_offset + i].cj4_ind_end   += cj4_offset;
            }

            for (size_t j4 = 0; j4 < nbli.cj4.size(); j4++)
            {
                nblc->cj4[cj4_offset + j4]                   = nbli.cj4[j4];
                nblc->cj4[cj4_offset + j4].imei[0].excl_ind += excl_offset;
                nblc->cj4[cj4_offset + j4].imei[1].excl_ind += excl_offset;
            }

            for (size_t j4 = 0; j4 < nbli.excl.size(); j4++)
            {
                nblc->excl[excl_offset + j4] = nbli.excl[j4];
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    for (int n = 0; n < nnbl; n++)
    {
        nblc->nci_tot += nbl[n]->nci_tot;
    }
}
static void balance_fep_lists(const nbnxn_search   *nbs,
                              nbnxn_pairlist_set_t *nbl_lists)
{
    int       nnbl;
    int       nri_tot, nrj_tot, nrj_target;
    int       th_dest;
    t_nblist *nbld;

    nnbl = nbl_lists->nnbl;

    if (nnbl == 1)
    {
        /* Nothing to balance */
        return;
    }

    /* Count the total i-lists and pairs */
    nri_tot = 0;
    nrj_tot = 0;
    for (int th = 0; th < nnbl; th++)
    {
        nri_tot += nbl_lists->nbl_fep[th]->nri;
        nrj_tot += nbl_lists->nbl_fep[th]->nrj;
    }

    nrj_target = (nrj_tot + nnbl - 1)/nnbl;

    assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);

#pragma omp parallel for schedule(static) num_threads(nnbl)
    for (int th = 0; th < nnbl; th++)
    {
        try
        {
            t_nblist *nbl = nbs->work[th].nbl_fep.get();

            /* Note that here we allocate for the total size, instead of
             * a per-thread estimate (which is hard to obtain).
             */
            if (nri_tot > nbl->maxnri)
            {
                nbl->maxnri = over_alloc_large(nri_tot);
                reallocate_nblist(nbl);
            }
            if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
            {
                nbl->maxnrj = over_alloc_small(nrj_tot);
                srenew(nbl->jjnr, nbl->maxnrj);
                srenew(nbl->excl_fep, nbl->maxnrj);
            }

            clear_pairlist_fep(nbl);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Loop over the source lists and assign and copy i-entries */
    th_dest = 0;
    nbld    = nbs->work[th_dest].nbl_fep.get();
    for (int th = 0; th < nnbl; th++)
    {
        t_nblist *nbls = nbl_lists->nbl_fep[th];

        for (int i = 0; i < nbls->nri; i++)
        {
            /* The number of pairs in this i-entry */
            int nrj = nbls->jindex[i+1] - nbls->jindex[i];

            /* Decide if list th_dest is too large and we should proceed
             * to the next destination list.
             */
            if (th_dest+1 < nnbl && nbld->nrj > 0 &&
                nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
            {
                th_dest++;
                nbld = nbs->work[th_dest].nbl_fep.get();
            }

            nbld->iinr[nbld->nri]  = nbls->iinr[i];
            nbld->gid[nbld->nri]   = nbls->gid[i];
            nbld->shift[nbld->nri] = nbls->shift[i];

            for (int j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
            {
                nbld->jjnr[nbld->nrj]     = nbls->jjnr[j];
                nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
                nbld->nrj++;
            }
            nbld->nri++;
            nbld->jindex[nbld->nri] = nbld->nrj;
        }
    }

    /* Swap the list pointers */
    for (int th = 0; th < nnbl; th++)
    {
        t_nblist *nbl_tmp = nbs->work[th].nbl_fep.release();
        nbs->work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
        nbl_lists->nbl_fep[th] = nbl_tmp;

        if (debug)
        {
            fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
                    th,
                    nbl_lists->nbl_fep[th]->nri,
                    nbl_lists->nbl_fep[th]->nrj);
        }
    }
}
/* Returns the next ci to be processed by our thread */
static gmx_bool next_ci(const nbnxn_grid_t &grid,
                        int nth, int ci_block,
                        int *ci_x, int *ci_y,
                        int *ci_b, int *ci)
{
    (*ci_b)++;
    (*ci)++;

    if (*ci_b == ci_block)
    {
        /* Jump to the next block assigned to this task */
        *ci   += (nth - 1)*ci_block;
        *ci_b  = 0;
    }

    if (*ci >= grid.nc)
    {
        return FALSE;
    }

    while (*ci >= grid.cxy_ind[*ci_x*grid.numCells[YY] + *ci_y + 1])
    {
        *ci_y += 1;
        if (*ci_y == grid.numCells[YY])
        {
            *ci_x += 1;
            *ci_y  = 0;
        }
    }

    return TRUE;
}
/* Returns the distance^2 for which we put cell pairs in the list
 * without checking atom pair distances. This is usually < rlist^2.
 */
static float boundingbox_only_distance2(const nbnxn_grid_t &iGrid,
                                        const nbnxn_grid_t &jGrid,
                                        real                rlist,
                                        gmx_bool            simple)
{
    /* If the distance between two sub-cell bounding boxes is less
     * than this distance, do not check the distance between
     * all particle pairs in the sub-cell, since then it is likely
     * that the box pair has atom pairs within the cut-off.
     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
     * Using more than 0.5 gains at most 0.5%.
     * If forces are calculated more than twice, the performance gain
     * in the force calculation outweighs the cost of checking.
     * Note that with sub-cell lists the atom-pair distance check
     * is only performed when only 1 out of 8 sub-cells is within range;
     * this is because the GPU is much faster than the CPU.
     */
    real bbx, bby;
    real rbb2;

    bbx = 0.5*(iGrid.cellSize[XX] + jGrid.cellSize[XX]);
    bby = 0.5*(iGrid.cellSize[YY] + jGrid.cellSize[YY]);
    if (!simple)
    {
        bbx /= c_gpuNumClusterPerCellX;
        bby /= c_gpuNumClusterPerCellY;
    }

    rbb2 = std::max(0.0, rlist - 0.5*std::sqrt(bbx*bbx + bby*bby));
    rbb2 = rbb2 * rbb2;

#if !GMX_DOUBLE
    return rbb2;
#else
    return (float)((1+GMX_FLOAT_EPS)*rbb2);
#endif
}
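/* Numeric illustration (a sketch with assumed sizes): for rlist = 1.0 nm
 * and cell sizes of 0.5 nm in x and y, bbx = bby = 0.5, so
 * rbb2 = (1.0 - 0.5*sqrt(0.5))^2 ~ 0.42 nm^2, clearly below
 * rlist^2 = 1.0 nm^2.
 */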
static int get_ci_block_size(const nbnxn_grid_t &iGrid,
                             gmx_bool bDomDec, int nth)
{
    const int ci_block_enum      = 5;
    const int ci_block_denom     = 11;
    const int ci_block_min_atoms = 16;
    int       ci_block;

    /* Here we decide how to distribute the blocks over the threads.
     * We use prime numbers to try to avoid that the grid size becomes
     * a multiple of the number of threads, which would lead to some
     * threads getting "inner" pairs and others getting boundary pairs,
     * which in turn leads to load imbalance between threads.
     * Set the block size as 5/11/ntask times the average number of cells
     * in a y,z slab. This should ensure a quite uniform distribution
     * of the grid parts of the different threads along all three grid
     * zone boundaries with 3D domain decomposition. At the same time
     * the blocks will not become too small.
     */
    ci_block = (iGrid.nc*ci_block_enum)/(ci_block_denom*iGrid.numCells[XX]*nth);

    /* Ensure the blocks are not too small: avoids cache invalidation */
    if (ci_block*iGrid.na_sc < ci_block_min_atoms)
    {
        ci_block = (ci_block_min_atoms + iGrid.na_sc - 1)/iGrid.na_sc;
    }

    /* Without domain decomposition
     * or with less than 3 blocks per task, divide in nth blocks.
     */
    if (!bDomDec || nth*3*ci_block > iGrid.nc)
    {
        ci_block = (iGrid.nc + nth - 1)/nth;
    }

    if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.nc)
    {
        /* Some threads have no work. Although reducing the block size
         * does not decrease the block count on the first few threads,
         * with GPUs better mixing of "upper" cells that have more empty
         * clusters results in a somewhat lower max load over all threads.
         * Without GPUs the regime of so few atoms per thread is less
         * performance relevant, but with 8-wide SIMD the same reasoning
         * applies, since the pair list uses 4 i-atom "sub-clusters".
         */
        ci_block--;
    }

    return ci_block;
}
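/* Sizing example (a sketch with assumed grid dimensions): for a grid of
 * nc = 1000 cells with numCells[XX] = 10 and nth = 8 threads,
 * ci_block = 1000*5/(11*10*8) = 5 cells per block, which the threads then
 * traverse block-cyclically (see next_ci above).
 */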
/* Returns the number of bits to right-shift a cluster index to obtain
 * the corresponding force buffer flag index.
 */
static int getBufferFlagShift(int numAtomsPerCluster)
{
    int bufferFlagShift = 0;
    while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
    {
        bufferFlagShift++;
    }

    return bufferFlagShift;
}
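/* Example (a sketch, assuming NBNXN_BUFFERFLAG_SIZE == 16): for clusters
 * of 4 atoms the loop shifts twice (4 -> 8 -> 16) and returns 2, so
 * cluster indices map to flag blocks of 4 clusters = 16 atoms via
 * cluster >> 2.
 */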
static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused &pairlist)
{
    return true;
}

static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused &pairlist)
{
    return false;
}
static void makeClusterListWrapper(NbnxnPairlistCpu              *nbl,
                                   const nbnxn_grid_t gmx_unused &iGrid,
                                   const int                      ci,
                                   const nbnxn_grid_t            &jGrid,
                                   const int                      firstCell,
                                   const int                      lastCell,
                                   const bool                     excludeSubDiagonal,
                                   const nbnxn_atomdata_t        *nbat,
                                   const real                     rlist2,
                                   const real                     rbb2,
                                   const Nbnxm::KernelType        kernelType,
                                   int                           *numDistanceChecks)
{
    switch (kernelType)
    {
        case Nbnxm::KernelType::Cpu4x4_PlainC:
            makeClusterListSimple(jGrid,
                                  nbl, ci, firstCell, lastCell,
                                  excludeSubDiagonal,
                                  nbat->x().data(),
                                  rlist2, rbb2,
                                  numDistanceChecks);
            break;
#ifdef GMX_NBNXN_SIMD_4XN
        case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
            makeClusterListSimd4xn(jGrid,
                                   nbl, ci, firstCell, lastCell,
                                   excludeSubDiagonal,
                                   nbat->x().data(),
                                   rlist2, rbb2,
                                   numDistanceChecks);
            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
        case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
            makeClusterListSimd2xnn(jGrid,
                                    nbl, ci, firstCell, lastCell,
                                    excludeSubDiagonal,
                                    nbat->x().data(),
                                    rlist2, rbb2,
                                    numDistanceChecks);
            break;
#endif
        default:
            GMX_ASSERT(false, "Unhandled kernel type");
    }
}

static void makeClusterListWrapper(NbnxnPairlistGpu              *nbl,
                                   const nbnxn_grid_t &gmx_unused iGrid,
                                   const int                      ci,
                                   const nbnxn_grid_t            &jGrid,
                                   const int                      firstCell,
                                   const int                      lastCell,
                                   const bool                     excludeSubDiagonal,
                                   const nbnxn_atomdata_t        *nbat,
                                   const real                     rlist2,
                                   const real                     rbb2,
                                   Nbnxm::KernelType gmx_unused   kernelType,
                                   int                           *numDistanceChecks)
{
    for (int cj = firstCell; cj <= lastCell; cj++)
    {
        make_cluster_list_supersub(iGrid, jGrid,
                                   nbl, ci, cj,
                                   excludeSubDiagonal,
                                   nbat->xstride, nbat->x().data(),
                                   rlist2, rbb2,
                                   numDistanceChecks);
    }
}
static int getNumSimpleJClustersInList(const NbnxnPairlistCpu &nbl)
{
    return nbl.cj.size();
}

static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu &nbl)
{
    return 0;
}
static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu *nbl,
                                              int               ncj_old_j)
{
    nbl->ncjInUse += nbl->cj.size() - ncj_old_j;
}

static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused *nbl,
                                              int gmx_unused               ncj_old_j)
{
}
static void checkListSizeConsistency(const NbnxnPairlistCpu &nbl,
                                     const bool              haveFreeEnergy)
{
    GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
                       "Without free-energy all cj pair-list entries should be in use. "
                       "Note that subsequent code does not make use of the equality, "
                       "this check is only here to catch bugs");
}

static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused &nbl,
                                     bool gmx_unused                    haveFreeEnergy)
{
    /* We currently cannot check consistency here */
}
/* Set the buffer flags for newly added entries in the list */
static void setBufferFlags(const NbnxnPairlistCpu &nbl,
                           const int               ncj_old_j,
                           const int               gridj_flag_shift,
                           gmx_bitmask_t          *gridj_flag,
                           const int               th)
{
    if (gmx::ssize(nbl.cj) > ncj_old_j)
    {
        int cbFirst = nbl.cj[ncj_old_j].cj >> gridj_flag_shift;
        int cbLast  = nbl.cj.back().cj >> gridj_flag_shift;
        for (int cb = cbFirst; cb <= cbLast; cb++)
        {
            bitmask_init_bit(&gridj_flag[cb], th);
        }
    }
}

static void setBufferFlags(const NbnxnPairlistGpu gmx_unused &nbl,
                           int gmx_unused                     ncj_old_j,
                           int gmx_unused                     gridj_flag_shift,
                           gmx_bitmask_t gmx_unused          *gridj_flag,
                           int gmx_unused                     th)
{
    GMX_ASSERT(false, "This function should never be called");
}
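/* Example (a sketch with assumed numbers): with gridj_flag_shift == 2,
 * newly added j-clusters 40..47 map to flag blocks 40>>2 = 10 through
 * 47>>2 = 11, so bit th is set in gridj_flag[10] and gridj_flag[11].
 */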
/* Generates the part of pair-list nbl assigned to our thread */
template <typename T>
static void nbnxn_make_pairlist_part(const nbnxn_search      *nbs,
                                     const nbnxn_grid_t      &iGrid,
                                     const nbnxn_grid_t      &jGrid,
                                     nbnxn_search_work_t     *work,
                                     const nbnxn_atomdata_t  *nbat,
                                     const t_blocka          &exclusions,
                                     real                     rlist,
                                     const Nbnxm::KernelType  kernelType,
                                     int                      ci_block,
                                     gmx_bool                 bFBufferFlag,
                                     int                      nsubpair_max,
                                     gmx_bool                 progBal,
                                     float                    nsubpair_tot_est,
                                     int                      th,
                                     int                      nth,
                                     T                       *nbl,
                                     t_nblist                *nbl_fep)
{
    matrix         box;
    int            na_cj_2log;
    real           rlist2, rl_fep2 = 0;
    float          rbb2;
    int            ci_b, ci, ci_x, ci_y, ci_xy;
    ivec           shp;
    real           bx0, bx1, by0, by1, bz0, bz1;
    real           bz1_frac;
    real           d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
    int            cxf, cxl, cyf, cyf_x, cyl;
    int            numDistanceChecks;
    int            gridi_flag_shift = 0, gridj_flag_shift = 0;
    gmx_bitmask_t *gridj_flag       = nullptr;
    int            ncj_old_i, ncj_old_j;

    nbs_cycle_start(&work->cc[enbsCCsearch]);

    if (jGrid.bSimple != pairlistIsSimple(*nbl) ||
        iGrid.bSimple != pairlistIsSimple(*nbl))
    {
        gmx_incons("Grid incompatible with pair-list");
    }

    sync_work(nbl);
    GMX_ASSERT(nbl->na_ci == jGrid.na_c, "The cluster sizes in the list and grid should match");
    nbl->na_cj = Nbnxm::JClusterSizePerKernelType[kernelType];
    na_cj_2log = get_2log(nbl->na_cj);

    nbl->rlist = rlist;

    if (bFBufferFlag)
    {
        /* Determine conversion of clusters to flag blocks */
        gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
        gridj_flag_shift = getBufferFlagShift(nbl->na_cj);

        gridj_flag       = work->buffer_flags.flag;
    }

    copy_mat(nbs->box, box);

    rlist2 = nbl->rlist*nbl->rlist;

    if (nbs->bFEP && !pairlistIsSimple(*nbl))
    {
        /* Determine an atom-pair list cut-off distance for FEP atom pairs.
         * We should not simply use rlist, since then we would not have
         * the small, effective buffering of the NxN lists.
         * The buffer is an overestimate, but the resulting cost for pairs
         * beyond rlist is negligible compared to the FEP pairs within rlist.
         */
        rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);

        if (debug)
        {
            fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
        }
        rl_fep2 = rl_fep2*rl_fep2;
    }

    rbb2 = boundingbox_only_distance2(iGrid, jGrid, nbl->rlist, pairlistIsSimple(*nbl));

    if (debug)
    {
        fprintf(debug, "nbl bounding box only distance %f\n", std::sqrt(rbb2));
    }

    const bool isIntraGridList = (&iGrid == &jGrid);

    /* Set the shift range */
    for (int d = 0; d < DIM; d++)
    {
        /* Check if we need periodicity shifts.
         * Without PBC or with domain decomposition we don't need them.
         */
        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
        {
            shp[d] = 0;
        }
        else
        {
            const real listRangeCellToCell = listRangeForGridCellToGridCell(rlist, iGrid, jGrid);
            if (d == XX &&
                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
            {
                shp[d] = 2;
            }
            else
            {
                shp[d] = 1;
            }
        }
    }
    const bool                      bSimple = pairlistIsSimple(*nbl);
    gmx::ArrayRef<const nbnxn_bb_t> bb_i;
#if NBNXN_BBXXXX
    gmx::ArrayRef<const float>      pbb_i;
    if (bSimple)
    {
        bb_i  = iGrid.bb;
    }
    else
    {
        pbb_i = iGrid.pbb;
    }
#else
    /* We use the normal bounding box format for both grid types */
    bb_i  = iGrid.bb;
#endif
    gmx::ArrayRef<const float> bbcz_i  = iGrid.bbcz;
    gmx::ArrayRef<const int>   flags_i = iGrid.flags;
    gmx::ArrayRef<const float> bbcz_j  = jGrid.bbcz;
    int                        cell0_i = iGrid.cell0;

    if (debug)
    {
        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
                iGrid.nc, iGrid.nc/static_cast<double>(iGrid.numCells[XX]*iGrid.numCells[YY]), ci_block);
    }

    numDistanceChecks = 0;

    const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid));

    /* Initialize ci_b and ci to 1 before where we want them to start,
     * as they will both be incremented in next_ci.
     */
    ci_b = -1;
    ci   = th*ci_block - 1;
    ci_x = 0;
    ci_y = 0;
    while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
    {
        if (bSimple && flags_i[ci] == 0)
        {
            continue;
        }

        ncj_old_i = getNumSimpleJClustersInList(*nbl);

        d2cx = 0;
        if (!isIntraGridList && shp[XX] == 0)
        {
            if (bSimple)
            {
                bx1 = bb_i[ci].upper[BB_X];
            }
            else
            {
                bx1 = iGrid.c0[XX] + (ci_x+1)*iGrid.cellSize[XX];
            }
            if (bx1 < jGrid.c0[XX])
            {
                d2cx = gmx::square(jGrid.c0[XX] - bx1);

                if (d2cx >= listRangeBBToJCell2)
                {
                    continue;
                }
            }
        }

        ci_xy = ci_x*iGrid.numCells[YY] + ci_y;

        /* Loop over shift vectors in three dimensions */
        for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
        {
            const real shz = tz*box[ZZ][ZZ];

            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;

            if (tz == 0)
            {
                d2z = 0;
            }
            else if (tz < 0)
            {
                d2z = gmx::square(bz1);
            }
            else
            {
                d2z = gmx::square(bz0 - box[ZZ][ZZ]);
            }

            d2z_cx = d2z + d2cx;

            if (d2z_cx >= rlist2)
            {
                continue;
            }

            bz1_frac = bz1/(iGrid.cxy_ind[ci_xy+1] - iGrid.cxy_ind[ci_xy]);
            if (bz1_frac < 0)
            {
                bz1_frac = 0;
            }
            /* The check with bz1_frac close to or larger than 1 comes later */

            for (int ty = -shp[YY]; ty <= shp[YY]; ty++)
            {
                const real shy = ty*box[YY][YY] + tz*box[ZZ][YY];

                if (bSimple)
                {
                    by0 = bb_i[ci].lower[BB_Y] + shy;
                    by1 = bb_i[ci].upper[BB_Y] + shy;
                }
                else
                {
                    by0 = iGrid.c0[YY] + (ci_y  )*iGrid.cellSize[YY] + shy;
                    by1 = iGrid.c0[YY] + (ci_y+1)*iGrid.cellSize[YY] + shy;
                }

                get_cell_range<YY>(by0, by1,
                                   jGrid,
                                   d2z_cx, rlist,
                                   &cyf, &cyl);

                if (cyf > cyl)
                {
                    continue;
                }

                d2z_cy = d2z;
                if (by1 < jGrid.c0[YY])
                {
                    d2z_cy += gmx::square(jGrid.c0[YY] - by1);
                }
                else if (by0 > jGrid.c1[YY])
                {
                    d2z_cy += gmx::square(by0 - jGrid.c1[YY]);
                }

                for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
                {
                    const int  shift              = XYZ2IS(tx, ty, tz);

                    const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);

                    if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
                    {
                        continue;
                    }

                    const real shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];

                    if (bSimple)
                    {
                        bx0 = bb_i[ci].lower[BB_X] + shx;
                        bx1 = bb_i[ci].upper[BB_X] + shx;
                    }
                    else
                    {
                        bx0 = iGrid.c0[XX] + (ci_x  )*iGrid.cellSize[XX] + shx;
                        bx1 = iGrid.c0[XX] + (ci_x+1)*iGrid.cellSize[XX] + shx;
                    }

                    get_cell_range<XX>(bx0, bx1,
                                       jGrid,
                                       d2z_cy, rlist,
                                       &cxf, &cxl);

                    if (cxf > cxl)
                    {
                        continue;
                    }

                    addNewIEntry(nbl, cell0_i+ci, shift, flags_i[ci]);

                    if ((!c_pbcShiftBackward || excludeSubDiagonal) &&
                        cxf < ci_x)
                    {
                        /* Leave the pairs with i > j.
                         * x is the major index, so skip half of it.
                         */
                        cxf = ci_x;
                    }

                    set_icell_bb(iGrid, ci, shx, shy, shz,
                                 nbl->work);

                    icell_set_x(cell0_i+ci, shx, shy, shz,
                                nbat->xstride, nbat->x().data(),
                                kernelType,
                                nbl->work);

                    for (int cx = cxf; cx <= cxl; cx++)
                    {
                        d2zx = d2z;
                        if (jGrid.c0[XX] + cx*jGrid.cellSize[XX] > bx1)
                        {
                            d2zx += gmx::square(jGrid.c0[XX] + cx*jGrid.cellSize[XX] - bx1);
                        }
                        else if (jGrid.c0[XX] + (cx+1)*jGrid.cellSize[XX] < bx0)
                        {
                            d2zx += gmx::square(jGrid.c0[XX] + (cx+1)*jGrid.cellSize[XX] - bx0);
                        }

                        if (isIntraGridList &&
                            cx == 0 &&
                            (!c_pbcShiftBackward || shift == CENTRAL) &&
                            cyf < ci_y)
                        {
                            /* Leave the pairs with i > j.
                             * Skip half of y when i and j have the same x.
                             */
                            cyf_x = ci_y;
                        }
                        else
                        {
                            cyf_x = cyf;
                        }

                        for (int cy = cyf_x; cy <= cyl; cy++)
                        {
                            const int columnStart = jGrid.cxy_ind[cx*jGrid.numCells[YY] + cy];
                            const int columnEnd   = jGrid.cxy_ind[cx*jGrid.numCells[YY] + cy + 1];

                            d2zxy = d2zx;
                            if (jGrid.c0[YY] + cy*jGrid.cellSize[YY] > by1)
                            {
                                d2zxy += gmx::square(jGrid.c0[YY] + cy*jGrid.cellSize[YY] - by1);
                            }
                            else if (jGrid.c0[YY] + (cy+1)*jGrid.cellSize[YY] < by0)
                            {
                                d2zxy += gmx::square(jGrid.c0[YY] + (cy+1)*jGrid.cellSize[YY] - by0);
                            }
                            if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
                            {
                                /* To improve efficiency in the common case
                                 * of a homogeneous particle distribution,
                                 * we estimate the index of the middle cell
                                 * in range (midCell). We search down and up
                                 * starting from this index.
                                 *
                                 * Note that the bbcz_j array contains bounds
                                 * for i-clusters, thus for clusters of 4 atoms.
                                 * For the common case where the j-cluster size
                                 * is 8, we could step with a stride of 2,
                                 * but we do not do this because it would
                                 * complicate this code even more.
                                 */
                                int midCell = columnStart + static_cast<int>(bz1_frac*(columnEnd - columnStart));
                                if (midCell >= columnEnd)
                                {
                                    midCell = columnEnd - 1;
                                }

                                d2xy = d2zxy - d2z;

                                /* Find the lowest cell that can possibly
                                 * be within range.
                                 * Check if we hit the bottom of the grid,
                                 * if the j-cell is below the i-cell and if so,
                                 * if it is within range.
                                 */
                                int downTestCell = midCell;
                                while (downTestCell >= columnStart &&
                                       (bbcz_j[downTestCell*NNBSBB_D + 1] >= bz0 ||
                                        d2xy + gmx::square(bbcz_j[downTestCell*NNBSBB_D + 1] - bz0) < rlist2))
                                {
                                    downTestCell--;
                                }
                                int firstCell = downTestCell + 1;

                                /* Find the highest cell that can possibly
                                 * be within range.
                                 * Check if we hit the top of the grid,
                                 * if the j-cell is above the i-cell and if so,
                                 * if it is within range.
                                 */
                                int upTestCell = midCell + 1;
                                while (upTestCell < columnEnd &&
                                       (bbcz_j[upTestCell*NNBSBB_D] <= bz1 ||
                                        d2xy + gmx::square(bbcz_j[upTestCell*NNBSBB_D] - bz1) < rlist2))
                                {
                                    upTestCell++;
                                }
                                int lastCell = upTestCell - 1;

#define NBNXN_REFCODE 0
#if NBNXN_REFCODE
                                {
                                    /* Simple reference code, for debugging,
                                     * overrides the more complex code above.
                                     */
                                    firstCell = columnEnd;
                                    lastCell  = -1;
                                    for (int k = columnStart; k < columnEnd; k++)
                                    {
                                        if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D + 1] - bz0) < rlist2 &&
                                            k < firstCell)
                                        {
                                            firstCell = k;
                                        }
                                        if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D] - bz1) < rlist2 &&
                                            k > lastCell)
                                        {
                                            lastCell = k;
                                        }
                                    }
                                }
#endif

                                if (isIntraGridList)
                                {
                                    /* We want each atom/cell pair only once,
                                     * only use cj >= ci.
                                     */
                                    if (!c_pbcShiftBackward || shift == CENTRAL)
                                    {
                                        firstCell = std::max(firstCell, ci);
                                    }
                                }

                                if (firstCell <= lastCell)
                                {
                                    GMX_ASSERT(firstCell >= columnStart && lastCell < columnEnd, "The range should reside within the current grid column");

                                    /* For f buffer flags with simple lists */
                                    ncj_old_j = getNumSimpleJClustersInList(*nbl);

                                    makeClusterListWrapper(nbl,
                                                           iGrid, ci,
                                                           jGrid, firstCell, lastCell,
                                                           excludeSubDiagonal,
                                                           nbat,
                                                           rlist2, rbb2,
                                                           kernelType,
                                                           &numDistanceChecks);

                                    if (bFBufferFlag)
                                    {
                                        setBufferFlags(*nbl, ncj_old_j, gridj_flag_shift,
                                                       gridj_flag, th);
                                    }

                                    incrementNumSimpleJClustersInList(nbl, ncj_old_j);
                                }
                            }
                        }
                    }

                    /* Set the exclusions for this ci list */
                    setExclusionsForIEntry(nbs,
                                           nbl,
                                           excludeSubDiagonal,
                                           na_cj_2log,
                                           *getOpenIEntry(nbl),
                                           exclusions);

                    if (nbs->bFEP)
                    {
                        make_fep_list(nbs, nbat, nbl,
                                      excludeSubDiagonal,
                                      getOpenIEntry(nbl),
                                      shx, shy, shz,
                                      rl_fep2,
                                      iGrid, jGrid, nbl_fep);
                    }

                    /* Close this ci list */
                    closeIEntry(nbl,
                                nsubpair_max,
                                progBal, nsubpair_tot_est,
                                th, nth);
                }
            }
        }

        if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
        {
            bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cell0+ci) >> gridi_flag_shift]), th);
        }
    }

    work->ndistc = numDistanceChecks;

    nbs_cycle_stop(&work->cc[enbsCCsearch]);

    checkListSizeConsistency(*nbl, nbs->bFEP);

    if (debug)
    {
        fprintf(debug, "number of distance checks %d\n", numDistanceChecks);

        print_nblist_statistics(debug, nbl, nbs, rlist);

        if (nbs->bFEP)
        {
            fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
        }
    }
}
static void reduce_buffer_flags(const nbnxn_search         *nbs,
                                int                         nsrc,
                                const nbnxn_buffer_flags_t *dest)
{
    for (int s = 0; s < nsrc; s++)
    {
        gmx_bitmask_t * flag = nbs->work[s].buffer_flags.flag;

        for (int b = 0; b < dest->nflag; b++)
        {
            bitmask_union(&(dest->flag[b]), flag[b]);
        }
    }
}
static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
{
    int           nelem, nkeep, ncopy, nred, out;
    gmx_bitmask_t mask_0;

    nelem = 0;
    nkeep = 0;
    ncopy = 0;
    nred  = 0;
    bitmask_init_bit(&mask_0, 0);
    for (int b = 0; b < flags->nflag; b++)
    {
        if (bitmask_is_equal(flags->flag[b], mask_0))
        {
            /* Only flag 0 is set, no copy or reduction required */
            nelem++;
            nkeep++;
        }
        else if (!bitmask_is_zero(flags->flag[b]))
        {
            int c = 0;
            for (out = 0; out < nout; out++)
            {
                if (bitmask_is_set(flags->flag[b], out))
                {
                    c++;
                }
            }
            nelem += c;
            if (c == 1)
            {
                ncopy++;
            }
            else
            {
                nred += c;
            }
        }
    }

    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
            flags->nflag, nout,
            nelem/static_cast<double>(flags->nflag),
            nkeep/static_cast<double>(flags->nflag),
            ncopy/static_cast<double>(flags->nflag),
            nred/static_cast<double>(flags->nflag));
}
/* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
 * *cjGlobal is updated with the cj count in src.
 * When setFlags==true, flag bit t is set in flag for all i and j clusters.
 */
template<bool setFlags>
static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict       srcCi,
                                  const NbnxnPairlistCpu * gmx_restrict src,
                                  NbnxnPairlistCpu * gmx_restrict       dest,
                                  gmx_bitmask_t                        *flag,
                                  int iFlagShift, int jFlagShift, int t)
{
    const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;

    dest->ci.push_back(*srcCi);
    dest->ci.back().cj_ind_start = dest->cj.size();
    dest->ci.back().cj_ind_end   = dest->cj.size() + ncj;

    if (setFlags)
    {
        bitmask_init_bit(&flag[srcCi->ci >> iFlagShift], t);
    }

    for (int j = srcCi->cj_ind_start; j < srcCi->cj_ind_end; j++)
    {
        dest->cj.push_back(src->cj[j]);

        if (setFlags)
        {
            /* NOTE: This is relatively expensive, since this
             * operation is done for all elements in the list,
             * whereas at list generation this is done only
             * once for each flag entry.
             */
            bitmask_init_bit(&flag[src->cj[j].cj >> jFlagShift], t);
        }
    }
}
/* This routine re-balances the pairlists such that all are nearly equally
 * sized. Only whole i-entries are moved between lists. These are moved
 * between the ends of the lists, such that the buffer reduction cost should
 * not change significantly.
 * Note that all original reduction flags are currently kept. This can lead
 * to reduction of parts of the force buffer that could be avoided. But since
 * the original lists are quite balanced, this will only give minor overhead.
 */
static void rebalanceSimpleLists(int                                numLists,
                                 NbnxnPairlistCpu * const * const   srcSet,
                                 NbnxnPairlistCpu                 **destSet,
                                 gmx::ArrayRef<nbnxn_search_work_t> searchWork)
{
    int ncjTotal = 0;
    for (int s = 0; s < numLists; s++)
    {
        ncjTotal += srcSet[s]->ncjInUse;
    }
    int ncjTarget = (ncjTotal + numLists - 1)/numLists;

#pragma omp parallel num_threads(numLists)
    {
        int t       = gmx_omp_get_thread_num();

        int cjStart = ncjTarget* t;
        int cjEnd   = ncjTarget*(t + 1);

        /* The destination pair-list for task/thread t */
        NbnxnPairlistCpu *dest = destSet[t];

        clear_pairlist(dest);
        dest->na_cj = srcSet[0]->na_cj;

        /* Note that the flags in the work struct (still) contain flags
         * for all entries that are present in srcSet->nbl[t].
         */
        gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;

        int            iFlagShift = getBufferFlagShift(dest->na_ci);
        int            jFlagShift = getBufferFlagShift(dest->na_cj);

        int            cjGlobal   = 0;
        for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
        {
            const NbnxnPairlistCpu *src = srcSet[s];

            if (cjGlobal + src->ncjInUse > cjStart)
            {
                for (gmx::index i = 0; i < gmx::ssize(src->ci) && cjGlobal < cjEnd; i++)
                {
                    const nbnxn_ci_t *srcCi = &src->ci[i];
                    int               ncj   = srcCi->cj_ind_end - srcCi->cj_ind_start;
                    if (cjGlobal >= cjStart)
                    {
                        /* If the source list is not our own, we need to set
                         * extra flags (the template bool parameter).
                         */
                        if (s != t)
                        {
                            copySelectedListRange
                            <true>
                                (srcCi, src, dest,
                                flag, iFlagShift, jFlagShift, t);
                        }
                        else
                        {
                            copySelectedListRange
                            <false>
                                (srcCi, src,
                                dest, flag, iFlagShift, jFlagShift, t);
                        }
                    }
                    cjGlobal += ncj;
                }
            }
            else
            {
                cjGlobal += src->ncjInUse;
            }
        }

        dest->ncjInUse = dest->cj.size();
    }

    int ncjTotalNew = 0;
    for (int s = 0; s < numLists; s++)
    {
        ncjTotalNew += destSet[s]->ncjInUse;
    }
    GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
}
/* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
{
    int numLists = listSet->nnbl;
    int ncjMax   = 0;
    int ncjTotal = 0;
    for (int s = 0; s < numLists; s++)
    {
        ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
        ncjTotal += listSet->nbl[s]->ncjInUse;
    }
    if (debug)
    {
        fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
    }
    /* The rebalancing adds 3% extra time to the search. Heuristically we
     * determined that under common conditions the non-bonded kernel balance
     * improvement will outweigh this when the imbalance is more than 3%.
     * But this will, obviously, depend on search vs kernel time and nstlist.
     */
    const real rebalanceTolerance = 1.03;

    return numLists*ncjMax > ncjTotal*rebalanceTolerance;
}
/* Perform a count (linear) sort to sort the smaller lists to the end.
 * This avoids load imbalance on the GPU, as large lists will be
 * scheduled and executed first and the smaller lists later.
 * Load balancing between multi-processors only happens at the end
 * and there smaller lists lead to more effective load balancing.
 * The sorting is done on the cj4 count, not on the actual pair counts.
 * Not only does this make the sort faster, but it also results in
 * better load balancing than using a list sorted on exact load.
 * This function swaps the pointer in the pair list to avoid a copy operation.
 */
static void sort_sci(NbnxnPairlistGpu *nbl)
{
    if (nbl->cj4.size() <= nbl->sci.size())
    {
        /* nsci = 0 or all sci have size 1, sorting won't change the order */
        return;
    }

    NbnxnPairlistGpuWork &work = *nbl->work;

    /* We will distinguish differences up to double the average */
    const int m = (2*nbl->cj4.size())/nbl->sci.size();

    /* Resize work.sci_sort so we can sort into it */
    work.sci_sort.resize(nbl->sci.size());

    std::vector<int> &sort = work.sortBuffer;
    /* Set up m + 1 entries in sort, initialized at 0 */
    sort.clear();
    sort.resize(m + 1, 0);
    /* Count the entries of each size */
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        int i = std::min(m, sci.numJClusterGroups());
        sort[i]++;
    }
    /* Calculate the offset for each count */
    int s0  = sort[m];
    sort[m] = 0;
    for (int i = m - 1; i >= 0; i--)
    {
        int s1  = sort[i];
        sort[i] = sort[i + 1] + s0;
        s0      = s1;
    }

    /* Sort entries directly into place */
    gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        int i = std::min(m, sci.numJClusterGroups());
        sci_sort[sort[i]++] = sci;
    }

    /* Swap the sci pointers so we use the new, sorted list */
    std::swap(nbl->sci, work.sci_sort);
}
void
nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
                                            nbnxn_search              *nbs,
                                            nbnxn_atomdata_t          *nbat,
                                            const t_blocka            *excl,
                                            const Nbnxm::KernelType    kernelType,
                                            const int64_t              step,
                                            t_nrnb                    *nrnb)
{
    nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);

    const real            rlist    = nbl_list->params.rlistOuter;

    int                   nsubpair_target;
    float                 nsubpair_tot_est;
    int                   nnbl;
    int                   ci_block;
    gmx_bool              CombineNBLists;
    gmx_bool              progBal;
    int                   np_tot, np_noq, np_hlj, nap;

    nnbl           = nbl_list->nnbl;
    CombineNBLists = nbl_list->bCombined;

    if (debug)
    {
        fprintf(debug, "ns making %d nblists\n", nnbl);
    }

    nbat->bUseBufferFlags = (nbat->out.size() > 1);
    /* We should re-init the flags before making the first list */
    if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
    {
        init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
    }

    int nzi;
    if (iLocality == InteractionLocality::Local)
    {
        /* Only zone (grid) 0 vs 0 */
        nzi = 1;
    }
    else
    {
        nzi = nbs->zones->nizone;
    }

    if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
    {
        get_nsubpair_target(nbs, iLocality, rlist, minimumIlistCountForGpuBalancing_,
                            &nsubpair_target, &nsubpair_tot_est);
    }
    else
    {
        nsubpair_target  = 0;
        nsubpair_tot_est = 0;
    }

    /* Clear all pair-lists */
    for (int th = 0; th < nnbl; th++)
    {
        if (nbl_list->bSimple)
        {
            clear_pairlist(nbl_list->nbl[th]);
        }
        else
        {
            clear_pairlist(nbl_list->nblGpu[th]);
        }

        if (nbs->bFEP)
        {
            clear_pairlist_fep(nbl_list->nbl_fep[th]);
        }
    }

    for (int zi = 0; zi < nzi; zi++)
    {
        const nbnxn_grid_t &iGrid = nbs->grid[zi];

        int zj0;
        int zj1;
        if (iLocality == InteractionLocality::Local)
        {
            zj0 = 0;
            zj1 = 1;
        }
        else
        {
            zj0 = nbs->zones->izone[zi].j0;
            zj1 = nbs->zones->izone[zi].j1;
            if (zi == 0)
            {
                zj0++;
            }
        }
        for (int zj = zj0; zj < zj1; zj++)
        {
            const nbnxn_grid_t &jGrid = nbs->grid[zj];

            if (debug)
            {
                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
            }

            nbs_cycle_start(&nbs->cc[enbsCCsearch]);

            ci_block = get_ci_block_size(iGrid, nbs->DomDec, nnbl);

            /* With GPU: generate progressively smaller lists for
             * load balancing for local only or non-local with 2 zones.
             */
            progBal = (iLocality == InteractionLocality::Local || nbs->zones->n <= 2);

#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (int th = 0; th < nnbl; th++)
            {
                try
                {
                    /* Re-init the thread-local work flag data before making
                     * the first list (not an elegant conditional).
                     */
                    if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0)))
                    {
                        init_buffer_flags(&nbs->work[th].buffer_flags, nbat->numAtoms());
                    }

                    if (CombineNBLists && th > 0)
                    {
                        GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");

                        clear_pairlist(nbl_list->nblGpu[th]);
                    }

                    /* Divide the i super cells equally over the nblists */
                    if (nbl_list->bSimple)
                    {
                        nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
                                                 &nbs->work[th], nbat, *excl,
                                                 rlist,
                                                 kernelType,
                                                 ci_block,
                                                 nbat->bUseBufferFlags,
                                                 nsubpair_target,
                                                 progBal, nsubpair_tot_est,
                                                 th, nnbl,
                                                 nbl_list->nbl[th],
                                                 nbl_list->nbl_fep[th]);
                    }
                    else
                    {
                        nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
                                                 &nbs->work[th], nbat, *excl,
                                                 rlist,
                                                 kernelType,
                                                 ci_block,
                                                 nbat->bUseBufferFlags,
                                                 nsubpair_target,
                                                 progBal, nsubpair_tot_est,
                                                 th, nnbl,
                                                 nbl_list->nblGpu[th],
                                                 nbl_list->nbl_fep[th]);
                    }
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }
            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);

            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for (int th = 0; th < nnbl; th++)
            {
                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);

                if (nbl_list->bSimple)
                {
                    NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
                    np_tot += nbl->cj.size();
                    np_noq += nbl->work->ncj_noq;
                    np_hlj += nbl->work->ncj_hlj;
                }
                else
                {
                    NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl->nci_tot;
                }
            }
            if (nbl_list->bSimple)
            {
                nap = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
            }
            else
            {
                nap = gmx::square(nbl_list->nblGpu[0]->na_ci);
            }
            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
            nbl_list->natpair_lj  = np_noq*nap;
            nbl_list->natpair_q   = np_hlj*nap/2;

            if (CombineNBLists && nnbl > 1)
            {
                GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
                NbnxnPairlistGpu **nbl = nbl_list->nblGpu;

                nbs_cycle_start(&nbs->cc[enbsCCcombine]);

                combine_nblists(nnbl-1, nbl+1, nbl[0]);

                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
            }
        }
    }

    if (nbl_list->bSimple)
    {
        if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
        {
            rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, nbs->work);

            /* Swap the pointer of the sets of pair lists */
            NbnxnPairlistCpu **tmp = nbl_list->nbl;
            nbl_list->nbl          = nbl_list->nbl_work;
            nbl_list->nbl_work     = tmp;
        }
    }
    else
    {
        /* Sort the entries on size, large ones first */
        if (CombineNBLists || nnbl == 1)
        {
            sort_sci(nbl_list->nblGpu[0]);
        }
        else
        {
#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (int th = 0; th < nnbl; th++)
            {
                try
                {
                    sort_sci(nbl_list->nblGpu[th]);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }
        }
    }

    if (nbat->bUseBufferFlags)
    {
        reduce_buffer_flags(nbs, nbl_list->nnbl, &nbat->buffer_flags);
    }

    if (nbs->bFEP)
    {
        /* Balance the free-energy lists over all the threads */
        balance_fep_lists(nbs, nbl_list);
    }

    if (nbl_list->bSimple)
    {
        /* This is a fresh list, so not pruned, stored using ci.
         * ciOuter is invalid at this point.
         */
        GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
    }

    if (iLocality == Nbnxm::InteractionLocality::Local)
    {
        outerListCreationStep_ = step;
    }
    else
    {
        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
                           "Outer list should be created at the same step as the inner list");
    }

    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
    if (iLocality == InteractionLocality::Local)
    {
        nbs->search_count++;
    }
    if (nbs->print_cycles &&
        (!nbs->DomDec || iLocality == InteractionLocality::NonLocal) &&
        nbs->search_count % 100 == 0)
    {
        nbs_cycle_print(stderr, nbs);
    }

    /* If we have more than one list, they either got rebalanced (CPU)
     * or combined (GPU), so we should dump the final result to debug.
     */
    if (debug && nbl_list->nnbl > 1)
    {
        if (nbl_list->bSimple)
        {
            for (int t = 0; t < nbl_list->nnbl; t++)
            {
                print_nblist_statistics(debug, nbl_list->nbl[t], nbs, rlist);
            }
        }
        else
        {
            print_nblist_statistics(debug, nbl_list->nblGpu[0], nbs, rlist);
        }
    }

    if (debug)
    {
        if (gmx_debug_at)
        {
            if (nbl_list->bSimple)
            {
                for (int t = 0; t < nbl_list->nnbl; t++)
                {
                    print_nblist_ci_cj(debug, nbl_list->nbl[t]);
                }
            }
            else
            {
                print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
            }
        }

        if (nbat->bUseBufferFlags)
        {
            print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
        }
    }

    if (params_.useDynamicPruning && nbl_list->bSimple)
    {
        nbnxnPrepareListForDynamicPruning(nbl_list);
    }
}
void
nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocality,
                                      const t_blocka                   *excl,
                                      int64_t                           step,
                                      t_nrnb                           *nrnb)
{
    pairlistSets_->construct(iLocality, nbs.get(), nbat, excl,
                             kernelSetup_.kernelType,
                             step, nrnb);

    if (useGpu())
    {
        /* Launch the transfer of the pairlist to the GPU.
         *
         * NOTE: The launch overhead is currently not timed separately
         */
        Nbnxm::gpu_init_pairlist(gpu_nbv,
                                 pairlistSets().pairlistSet(iLocality).nblGpu[0],
                                 iLocality);
    }
}
void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
{
    GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");

    /* TODO: Restructure the lists so we have actual outer and inner
     *       list objects so we can set a single pointer instead of
     *       swapping several pointers.
     */
    for (int i = 0; i < listSet->nnbl; i++)
    {
        NbnxnPairlistCpu &list = *listSet->nbl[i];

        /* The search produced a list in ci/cj.
         * Swap the list pointers so that the outer list ends up in
         * ciOuter,cjOuter and we can prune that to get an inner list in ci/cj.
         */
        std::swap(list.ci, list.ciOuter);
        std::swap(list.cj, list.cjOuter);
    }
}