/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_consts.h"
/* nbnxn_internal.h includes gmx_simd_macros.h */
#include "nbnxn_internal.h"

#include "gmx_simd_vec.h"

#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_cyclecounter.h"

#include "gmx_omp_nthreads.h"
#ifdef NBNXN_SEARCH_BB_SIMD4
/* We use 4-wide SIMD for bounding box calculations */

/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB

#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
/* Store bounding boxes with x, y and z coordinates in packs of 4 */
#define NBNXN_PBB_SIMD4

/* The packed bounding box coordinate stride is always set to 4.
 * With AVX we could use 8, but that turns out not to be faster.
 */
#define STRIDE_PBB_2LOG  2

#endif /* NBNXN_SEARCH_BB_SIMD4 */
/* The functions below are macros as they are performance sensitive */

/* 4x4 list, pack=4: no complex conversion required */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J4(ci)   (ci)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)

/* 4x2 list, pack=4: j-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J2(ci)  ((ci)<<1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

/* 4x8 list, pack=8: i-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J8(ci)  ((ci)>>1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
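
/* Illustrative sketch (not used by the search code): with a pack width of
 * PACK_X4 == 4 and a j-cluster size of 2, two consecutive j-clusters share
 * one coordinate pack; the odd-numbered one starts half-way into the pack's
 * x row. This assumes STRIDE_P4 is the per-pack coordinate stride.
 */
static int x_ind_cj_j2_example(int cj)
{
    /* same mapping as X_IND_CJ_J2 above: 0, PACK_X4/2, STRIDE_P4, ... */
    return X_IND_CJ_J2(cj);
}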
/* The j-cluster size is matched to the SIMD width */
#if GMX_SIMD_WIDTH_HERE == 2
#define CI_TO_CJ_SIMD_4XN(ci)   CI_TO_CJ_J2(ci)
#define X_IND_CI_SIMD_4XN(ci)   X_IND_CI_J2(ci)
#define X_IND_CJ_SIMD_4XN(cj)   X_IND_CJ_J2(cj)
#if GMX_SIMD_WIDTH_HERE == 4
#define CI_TO_CJ_SIMD_4XN(ci)   CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_4XN(ci)   X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_4XN(cj)   X_IND_CJ_J4(cj)
#if GMX_SIMD_WIDTH_HERE == 8
#define CI_TO_CJ_SIMD_4XN(ci)   CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_4XN(ci)   X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_4XN(cj)   X_IND_CJ_J8(cj)
/* Half SIMD with j-cluster size */
#define CI_TO_CJ_SIMD_2XNN(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_2XNN(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_2XNN(cj)  X_IND_CJ_J4(cj)
#if GMX_SIMD_WIDTH_HERE == 16
#define CI_TO_CJ_SIMD_2XNN(ci)  CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_2XNN(ci)  X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_2XNN(cj)  X_IND_CJ_J8(cj)
#error "unsupported GMX_NBNXN_SIMD_WIDTH"

#endif /* GMX_NBNXN_SIMD */
#ifdef NBNXN_SEARCH_BB_SIMD4
/* Store bounding box corners as quadruplets: xxxxyyyyzzzz */
/* Size of a quadruplet of bounding box corners */
#define NNBSBB_XXXX  (NNBSBB_D*DIM*STRIDE_PBB)

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
#define NBNXN_SHIFT_BACKWARD

/* This define is a lazy way to avoid interdependence of the grid
 * and searching data structures.
 */
#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
static void nbs_cycle_clear(nbnxn_cycle_t *cc)
    for (i = 0; i < enbsCCnr; i++)

static double Mcyc_av(const nbnxn_cycle_t *cc)
    return (double)cc->c*1e-6/cc->count;
static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->nthread_max > 1)
        if (nbs->cc[enbsCCcombine].count > 0)
            fprintf(fp, " comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        fprintf(fp, " s. th");
        for (t = 0; t < nbs->nthread_max; t++)
            fprintf(fp, " %4.1f",
                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
static void nbnxn_grid_init(nbnxn_grid_t * grid)
    grid->cxy_ind    = NULL;
    grid->cxy_nalloc = 0;

static int get_2log(int n)
    while ((1<<log2) < n)
        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
    switch (nb_kernel_type)
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            return NBNXN_CPU_CLUSTER_I_SIZE;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            /* The cluster size for super/sub lists is only set here.
             * Any value should work for the pair-search and atomdata code.
             * The kernels, of course, might require a particular value.
             */
            return NBNXN_GPU_CLUSTER_SIZE;
            gmx_incons("unknown kernel type");

int nbnxn_kernel_to_cj_size(int nb_kernel_type)
    int nbnxn_simd_width = 0;

#ifdef GMX_NBNXN_SIMD
    nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;

    switch (nb_kernel_type)
        case nbnxnk4x4_PlainC:
            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
        case nbnxnk4xN_SIMD_4xN:
            cj_size = nbnxn_simd_width;
        case nbnxnk4xN_SIMD_2xNN:
            cj_size = nbnxn_simd_width/2;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
            gmx_incons("unknown kernel type");
static int ci_to_cj(int na_cj_2log, int ci)
        case 2: return ci;       break;
        case 1: return (ci<<1);  break;
        case 3: return (ci>>1);  break;

gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
    if (nb_kernel_type == nbnxnkNotSet)
        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");

    switch (nb_kernel_type)
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            gmx_incons("Invalid nonbonded kernel type passed!");
void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
                       gmx_domdec_zones_t *zones,
    nbs->DomDec = (n_dd_cells != NULL);

    clear_ivec(nbs->dd_dim);
    for (d = 0; d < DIM; d++)
        if ((*n_dd_cells)[d] > 1)

    /* Each grid matches a DD zone */
    snew(nbs->grid, nbs->ngrid);
    for (g = 0; g < nbs->ngrid; g++)
        nbnxn_grid_init(&nbs->grid[g]);

    nbs->cell_nalloc = 0;

    nbs->nthread_max = nthread_max;

    /* Initialize the work data structures for each thread */
    snew(nbs->work, nbs->nthread_max);
    for (t = 0; t < nbs->nthread_max; t++)
        nbs->work[t].cxy_na           = NULL;
        nbs->work[t].cxy_na_nalloc    = 0;
        nbs->work[t].sort_work        = NULL;
        nbs->work[t].sort_work_nalloc = 0;

    /* Initialize detailed nbsearch cycle counting */
    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
    nbs->search_count = 0;
    nbs_cycle_clear(nbs->cc);
    for (t = 0; t < nbs->nthread_max; t++)
        nbs_cycle_clear(nbs->work[t].cc);

static real grid_atom_density(int n, rvec corner0, rvec corner1)
    rvec_sub(corner1, corner0, size);

    return n/(size[XX]*size[YY]*size[ZZ]);
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            int n, rvec corner0, rvec corner1,
    real adens, tlen, tlen_x, tlen_y, nc_max;

    rvec_sub(corner1, corner0, size);

    /* target cell length */
    /* To minimize the zero interactions, we should make
     * the largest of the i/j cell cubic.
     */
    na_c = max(grid->na_c, grid->na_cj);

    /* Approximately cubic cells */
    tlen   = pow(na_c/atom_density, 1.0/3.0);

    /* Approximately cubic sub cells */
    tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
    tlen_x = tlen*GPU_NSUBCELL_X;
    tlen_y = tlen*GPU_NSUBCELL_Y;

    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the pair list when the fixed cell dimensions (x,y) are
     * larger than the variable one (z) than the other way around.
     */
    grid->ncx = max(1, (int)(size[XX]/tlen_x));
    grid->ncy = max(1, (int)(size[YY]/tlen_y));

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    /* This is a non-home zone, add an extra row of cells
     * for particles communicated for bonded interactions.
     * These can be beyond the cut-off. It doesn't matter where
     * they end up on the grid, but for performance it's better
     * if they don't end up in cells that can be within cut-off range.
     */

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na, grid->cxy_nalloc);
        srenew(grid->cxy_ind, grid->cxy_nalloc+1);

    for (t = 0; t < nbs->nthread_max; t++)
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);

    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;

        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;

    if (nc_max > grid->nc_nalloc)
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc, grid->nc_nalloc);
        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);

        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating exceptions in SIMD with the unused bb elements.
         */
        snew_aligned(grid->bb, grid->nc_nalloc, 16);

        pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
        snew_aligned(grid->pbb, pbb_nalloc, 16);

        snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);

        if (grid->na_cj == grid->na_c)
            grid->bbj = grid->bb;

            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);

        srenew(grid->flags, grid->nc_nalloc);

    copy_rvec(corner0, grid->c0);
    copy_rvec(corner1, grid->c1);
/* We need to sort particles in grid columns on the z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for a homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used; in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
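
/* A minimal sketch of the pigeonhole slot computation described above (an
 * illustration only, not called by the code below): with invh the inverse
 * length of the sorting range and about n_per_h particles expected in that
 * range, each particle gets roughly SORT_GRID_OVERSIZE candidate slots.
 */
static int sort_slot_example(real z, real h0, real invh, int n_per_h)
{
    /* scale the coordinate so that consecutive slots cover 1/(oversize) of
     * the average inter-particle spacing along this dimension
     */
    return (int)((z - h0)*invh*n_per_h*SORT_GRID_OVERSIZE);
}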
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
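
/* Illustrative allocation sketch for the work array described above (an
 * assumption of typical use, not code used in this file): either the tighter
 * bound n_per_h*SORT_GRID_OVERSIZE + n or the simpler n*SGSF elements
 * satisfies the size requirement of sort_atoms().
 */
static int *sort_work_alloc_example(int n, int n_per_h)
{
    int *sort_work;

    snew(sort_work, n_per_h*SORT_GRID_OVERSIZE + n);

    return sort_work;
}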
static void sort_atoms(int dim, gmx_bool Backwards,
                       int *a, int n, rvec *x,
                       real h0, real invh, int n_per_h,
    int zi, zim, zi_min, zi_max;

        gmx_incons("n > n_per_h");

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In the worst case all n elements end up in the last bin.
     */
    nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */

    /* Sort the particles using a simple index sort */
    for (i = 0; i < n; i++)
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        zi = (int)((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
            zi_min = min(zi_min, zi);
            zi_max = max(zi_max, zi);

            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order,
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                      (x[a[i]][dim] == x[sort[zi]][dim] &&

                /* Shift all elements by one slot until we find an empty slot */
                while (sort[zim] >= 0)
                zi_max = max(zi_max, zim);
            zi_max = max(zi_max, zi);

    for (zi = 0; zi < nsort; zi++)

    for (zi = zi_max; zi >= zi_min; zi--)

        gmx_incons("Lost particles while sorting");
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
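
/* Illustrative use of the macros above (a sketch, not called anywhere in
 * this file): bounding-box bounds are rounded conservatively outward, so a
 * possible double-to-float conversion can never shrink a bounding box.
 */
static void round_bb_bounds_example(real lo, real hi, float *flo, float *fhi)
{
    *flo = R2F_D(lo); /* lower bound rounded down, i.e. outward */
    *fhi = R2F_U(hi); /* upper bound rounded up, i.e. outward */
}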
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (j = 1; j < na; j++)
        xl = min(xl, x[i+XX]);
        xh = max(xh, x[i+XX]);
        yl = min(yl, x[i+YY]);
        yh = max(yh, x[i+YY]);
        zl = min(zl, x[i+ZZ]);
        zh = max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (j = 1; j < na; j++)
        xl = min(xl, x[j+XX*PACK_X4]);
        xh = max(xh, x[j+XX*PACK_X4]);
        yl = min(yl, x[j+YY*PACK_X4]);
        yh = max(yh, x[j+YY*PACK_X4]);
        zl = min(zl, x[j+ZZ*PACK_X4]);
        zh = max(zh, x[j+ZZ*PACK_X4]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (j = 1; j < na; j++)
        xl = min(xl, x[j+XX*PACK_X8]);
        xh = max(xh, x[j+XX*PACK_X8]);
        yl = min(yl, x[j+YY*PACK_X8]);
        yh = max(yh, x[j+YY*PACK_X8]);
        zl = min(zl, x[j+ZZ*PACK_X8]);
        zh = max(zh, x[j+ZZ*PACK_X8]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
    calc_bounding_box_x_x4(min(na, 2), x, bbj);

    calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);

    /* Set the "empty" bounding box to the same as the first one,
     * so we don't need to treat special cases in the rest of the code.
     */
#ifdef NBNXN_SEARCH_BB_SIMD4
    gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
    gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));

#ifdef NBNXN_SEARCH_BB_SIMD4
    gmx_simd4_store_pr(&bb->lower[0],
                       gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
                                        gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
    gmx_simd4_store_pr(&bb->upper[0],
                       gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
                                        gmx_simd4_load_bb_pr(&bbj[1].upper[0])));

    for (i = 0; i < NNBSBB_C; i++)
        bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
#ifdef NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
    real xl, xh, yl, yh, zl, zh;

    for (j = 1; j < na; j++)
        xl = min(xl, x[i+XX]);
        xh = max(xh, x[i+XX]);
        yl = min(yl, x[i+YY]);
        yh = max(yh, x[i+YY]);
        zl = min(zl, x[i+ZZ]);
        zh = max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);

#endif /* NBNXN_SEARCH_BB_SIMD4 */
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
    gmx_simd4_pr bb_0_S, bb_1_S;

    bb_0_S = gmx_simd4_load_bb_pr(x);

    for (i = 1; i < na; i++)
        x_S    = gmx_simd4_load_bb_pr(x+i*NNBSBB_C);
        bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S);
        bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S);

    gmx_simd4_store_pr(&bb->lower[0], bb_0_S);
    gmx_simd4_store_pr(&bb->upper[0], bb_1_S);

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
    int i, j, sc2, nc2, c2;

    for (i = 0; i < grid->ncx*grid->ncy; i++)
        /* Starting bb in a column is expected to be 2-aligned */
        sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        nc2 = (grid->cxy_na[i]+3)>>(2+1);
        for (c2 = sc2; c2 < sc2+nc2; c2++)
#ifdef NBNXN_SEARCH_BB_SIMD4
            gmx_simd4_pr min_S, max_S;

            min_S = gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
                                     gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
            max_S = gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
                                     gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
            gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S);
            gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S);

            for (j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
                                             bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
                                             bb[c2*2+1].upper[j]);

        if (((grid->cxy_na[i]+3)>>2) & 1)
            /* The bb count in this column is odd: duplicate the last bb */
            for (j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE *fp,
                                 const nbnxn_search_t nbs,
                                 const nbnxn_grid_t *grid)
    for (c = 0; c < grid->nc; c++)
        for (d = 0; d < DIM; d++)
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];

    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/grid->ncx,
            nbs->box[YY][YY]/grid->ncy,
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*grid->ncx/nbs->box[XX][XX],
            ba[YY]*grid->ncy/nbs->box[YY][YY],
            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE *fp,
                                   const nbnxn_search_t nbs,
                                   const nbnxn_grid_t *grid)
    for (c = 0; c < grid->nc; c++)
        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
            for (i = 0; i < STRIDE_PBB; i++)
                for (d = 0; d < DIM; d++)
                    grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                    grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];

        for (s = 0; s < grid->nsubc[c]; s++)
            cs = c*GPU_NSUBCELL + s;
            for (d = 0; d < DIM; d++)
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];

        ns += grid->nsubc[c];

    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 * Also sets interaction flags.
 */
void sort_on_lj(nbnxn_atomdata_t *nbat, int na_c,
                int a0, int a1, const int *atinfo,
    int subc, s, a, n1, n2, a_lj_max, i, j;
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];

    for (s = a0; s < a1; s += na_c)
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        for (a = s; a < min(s+na_c, a1); a++)
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
                sort1[n1++] = order[a];

                sort2[n2++] = order[a];

        /* If we don't have atoms with LJ, there's nothing to sort */
            *flags |= NBNXN_CI_DO_LJ(subc);

            /* Only sort when strictly necessary. Ordering particles
             * can lead to less accurate summation due to rounding,
             * both for LJ and Coulomb interactions.
             */
            if (2*(a_lj_max - s) >= na_c)
                for (i = 0; i < n1; i++)
                    order[a0+i] = sort1[i];
                for (j = 0; j < n2; j++)
                    order[a0+n1+j] = sort2[j];

                *flags |= NBNXN_CI_HALF_LJ(subc);

            *flags |= NBNXN_CI_DO_COUL(subc);
|= NBNXN_CI_DO_COUL(subc
);
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
void fill_cell(const nbnxn_search_t nbs,
               nbnxn_atomdata_t *nbat,
               int sx, int sy, int sz,
               nbnxn_bb_t *bb_work_aligned)
        sort_on_lj(nbat, grid->na_c, a0, a1, atinfo, nbs->a,
                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);

    /* Now we have sorted the atoms, set the cell indices */
    for (a = a0; a < a1; a++)
        nbs->cell[nbs->a[a]] = a;

    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
                           nbat->XFormat, nbat->x, a0,

    if (nbat->XFormat == nbatX4)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
        if (2*grid->na_cj == grid->na_c)
            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
                                          grid->bbj+offset*2);

            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);

    else if (nbat->XFormat == nbatX8)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);

    else if (!grid->bSimple)
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
            calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);

            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,

            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);

        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);

        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,

        bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
        fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                grid->bb[bbo].lower[BB_X],
                grid->bb[bbo].lower[BB_Y],
                grid->bb[bbo].lower[BB_Z],
                grid->bb[bbo].upper[BB_X],
                grid->bb[bbo].upper[BB_Y],
                grid->bb[bbo].upper[BB_Z]);
[BB_Z
]);
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start, int cxy_end,
    int cx, cy, cz, ncz, cfilled, c;
    int na, ash, ind, a;

        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (cxy = cxy_start; cxy < cxy_end; cxy++)
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE,
                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for (cz = 0; cz < ncz; cz++)
            c = grid->cxy_ind[cxy] + cz;

            ash_c = ash + cz*grid->na_sc;
            na_c  = min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];

        /* Set the unused atom indices to -1 */
        for (ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
+ind
] = -1;
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start, int cxy_end,
    int  cx, cy, cz = -1, c = -1, ncz;
    int  na, ash, na_c, ind, a;
    int  subdiv_z, sub_z, na_z, ash_z;
    int  subdiv_y, sub_y, na_y, ash_y;
    int  subdiv_x, sub_x, na_x, ash_x;

    nbnxn_bb_t bb_work_array[2], *bb_work_aligned;

    bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));

        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    subdiv_x = grid->na_c;
    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (cxy = cxy_start; cxy < cxy_end; cxy++)
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE,
                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,

        /* This loop goes over the supercells and subcells along z at once */
        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
            ash_z = ash + sub_z*subdiv_z;
            na_z  = min(subdiv_z, na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
                cz = sub_z/GPU_NSUBCELL_Z;
                c  = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                na_c = min(grid->na_sc, na-(ash_z-ash));

                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY, (sub_z & 1),
                       nbs->a+ash_z, na_z, x,
                       grid->c0[YY]+cy*grid->sy,
                       grid->inv_sy, subdiv_z,

            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
                ash_y = ash_z + sub_y*subdiv_y;
                na_y  = min(subdiv_y, na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
                           nbs->a+ash_y, na_y, x,
                           grid->c0[XX]+cx*grid->sx,
                           grid->inv_sx, subdiv_y,

                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
                    ash_x = ash_y + sub_x*subdiv_x;
                    na_x  = min(subdiv_x, na-(ash_x-ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x+na_x, atinfo, x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),

        /* Set the unused atom indices to -1 */
        for (ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
+ind
] = -1;
1424 /* Determine in which grid column atoms should go */
1425 static void calc_column_indices(nbnxn_grid_t
*grid
,
1428 int dd_zone
, const int *move
,
1429 int thread
, int nthread
,
1436 /* We add one extra cell for particles which moved during DD */
1437 for (i
= 0; i
< grid
->ncx
*grid
->ncy
+1; i
++)
1442 n0
= a0
+ (int)((thread
+0)*(a1
- a0
))/nthread
;
1443 n1
= a0
+ (int)((thread
+1)*(a1
- a0
))/nthread
;
1447 for (i
= n0
; i
< n1
; i
++)
1449 if (move
== NULL
|| move
[i
] >= 0)
1451 /* We need to be careful with rounding,
1452 * particles might be a few bits outside the local zone.
1453 * The int cast takes care of the lower bound,
1454 * we will explicitly take care of the upper bound.
1456 cx
= (int)((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1457 cy
= (int)((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1460 if (cx
< 0 || cx
> grid
->ncx
||
1461 cy
< 0 || cy
> grid
->ncy
)
1464 "grid cell cx %d cy %d out of range (max %d %d)\n"
1465 "atom %f %f %f, grid->c0 %f %f",
1466 cx
, cy
, grid
->ncx
, grid
->ncy
,
1467 x
[i
][XX
], x
[i
][YY
], x
[i
][ZZ
], grid
->c0
[XX
], grid
->c0
[YY
]);
1470 /* Take care of potential rouding issues */
1471 cx
= min(cx
, grid
->ncx
- 1);
1472 cy
= min(cy
, grid
->ncy
- 1);
1474 /* For the moment cell will contain only the, grid local,
1475 * x and y indices, not z.
1477 cell
[i
] = cx
*grid
->ncy
+ cy
;
1481 /* Put this moved particle after the end of the grid,
1482 * so we can process it later without using conditionals.
1484 cell
[i
] = grid
->ncx
*grid
->ncy
;
1493 for (i
= n0
; i
< n1
; i
++)
1495 cx
= (int)((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1496 cy
= (int)((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1498 /* For non-home zones there could be particles outside
1499 * the non-bonded cut-off range, which have been communicated
1500 * for bonded interactions only. For the result it doesn't
1501 * matter where these end up on the grid. For performance
1502 * we put them in an extra row at the border.
1505 cx
= min(cx
, grid
->ncx
- 1);
1507 cy
= min(cy
, grid
->ncy
- 1);
1509 /* For the moment cell will contain only the, grid local,
1510 * x and y indices, not z.
1512 cell
[i
] = cx
*grid
->ncy
+ cy
;
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              nbnxn_atomdata_t *nbat)
    int   cx, cy, cxy, ncz_max, ncz;
    int   nthread, thread;
    int  *cxy_na, cxy_na_i;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
                            nbs->cell, nbs->work[thread].cxy_na);

    /* Make the cell index as a function of x and y */
    grid->cxy_ind[0] = 0;
    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        cxy_na_i = nbs->work[0].cxy_na[i];
        for (thread = 1; thread < nthread; thread++)
            cxy_na_i += nbs->work[thread].cxy_na[i];

        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;

        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;

    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),

        for (cy = 0; cy < grid->ncy; cy++)
            for (cx = 0; cx < grid->ncx; cx++)
                fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);

            fprintf(debug, "\n");

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
        for (thread = 0; thread < nbs->nthread_max; thread++)
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
                nbs->work[thread].sort_work[i] = -1;

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (i = a0; i < a1; i++)
        /* At this point nbs->cell contains the local grid x,y indices */
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;

        /* Set the cell indices for the moved particles */
        n0 = grid->nc*grid->na_sc;
        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];

            for (i = n0; i < n1; i++)
                nbs->cell[nbs->a[i]] = i;

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
    for (thread = 0; thread < nbs->nthread_max; thread++)
            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                nbs->work[thread].sort_work);

            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                  nbs->work[thread].sort_work);

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb);

        grid->nsubc_tot = 0;
        for (i = 0; i < grid->nc; i++)
            grid->nsubc_tot += grid->nsubc[i];

            print_bbsizes_simple(debug, nbs, grid);

            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, nbs, grid);
, nbs
, grid
);
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);

    for (b = 0; b < flags->nflag; b++)
->nflag
; b
++)
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC, matrix box,
                       rvec corner0, rvec corner1,
                       int nmoved, int *move,
                       nbnxn_atomdata_t *nbat)
    int nc_max_grid, nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;

        copy_mat(box, nbs->box);

        if (atom_density >= 0)
            grid->atom_density = atom_density;

            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;

        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);

    nc_max_grid = set_grid_size_xy(nbs, grid,
                                   dd_zone, n-nmoved, corner0, corner1,
                                   nbs->grid[0].atom_density,

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell, nbs->cell_nalloc);

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a, nbs->a_nalloc);

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);

    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);

        nbat->natoms_local = nbat->natoms;

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
->cc
[enbsCCgrid
]);
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const gmx_domdec_zones_t *zones,
                                nbnxn_atomdata_t *nbat)
    for (zone = 1; zone < zones->n; zone++)
        for (d = 0; d < DIM; d++)
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];

        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
->cg_range
[zone
+1],
/* Add simple grid type information to the local super/sub grid */
void nbnxn_grid_add_simple(nbnxn_search_t nbs,
                           nbnxn_atomdata_t *nbat)
    grid = &nbs->grid[0];

        gmx_incons("nbnxn_grid_simple called with a simple grid");

    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;

    if (grid->nc*ncd > grid->nc_nalloc_simple)
        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
        srenew(grid->bb_simple, grid->nc_nalloc_simple);
        srenew(grid->flags_simple, grid->nc_nalloc_simple);

            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);

    bbcz = grid->bbcz_simple;
    bb   = grid->bb_simple;

#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
    for (sc = 0; sc < grid->nc; sc++)
        for (c = 0; c < ncd; c++)
            na = NBNXN_CPU_CLUSTER_I_SIZE;
                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)

                switch (nbat->XFormat)
                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                        calc_bounding_box(na, nbat->xstride,
                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,

                bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
                bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];

                /* No interaction optimization yet here */
                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);

                grid->flags_simple[tx] = 0;

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb_simple);
, grid
->bb_simple
);
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;

void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;

void nbnxn_set_atomorder(nbnxn_search_t nbs)
    int ao, cx, cy, cxy, cz, j;

    /* Set the atom order for the home cell (index 0) */
    grid = &nbs->grid[0];

    for (cx = 0; cx < grid->ncx; cx++)
        for (cy = 0; cy < grid->ncy; cy++)
            cxy = cx*grid->ncy + cy;
            j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
[cxy
]; cz
++)
/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
static void get_cell_range(real b0, real b1,
                           int nc, real c0, real s, real invs,
                           real d2, real r2, int *cf, int *cl)
    *cf = max((int)((b0 - c0)*invs), 0);

    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)

    *cl = min((int)((b1 - c0)*invs), nc-1);
    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
- c0
)) < r2
)
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
                       float by1, float bz0, float bz1,
                       const nbnxn_bb_t *bb)
    float dl, dh, dm, dm0;

    dl = bx0 - bb->upper[BB_X];
    dh = bb->lower[BB_X] - bx1;

    dl = by0 - bb->upper[BB_Y];
    dh = bb->lower[BB_Y] - by1;

    dl = bz0 - bb->upper[BB_Z];
    dh = bb->lower[BB_Z] - bz1;
[BB_Z
] - bz1
;
/* Plain C code calculating the distance^2 between two bounding boxes */
static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
                           int csj, const nbnxn_bb_t *bb_j_all)
    const nbnxn_bb_t *bb_i, *bb_j;
    float             dl, dh, dm, dm0;

    bb_i = bb_i_ci  +  si;
    bb_j = bb_j_all + csj;

    dl = bb_i->lower[BB_X] - bb_j->upper[BB_X];
    dh = bb_j->lower[BB_X] - bb_i->upper[BB_X];

    dl = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
    dh = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];

    dl = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
    dh = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
[BB_Z
];
#ifdef NBNXN_SEARCH_BB_SIMD4

/* 4-wide SIMD code for bb distance for bb format xyz0 */
static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
                                 int csj, const nbnxn_bb_t *bb_j_all)
    gmx_simd4_pr bb_i_S0, bb_i_S1;
    gmx_simd4_pr bb_j_S0, bb_j_S1;

    bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]);
    bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]);
    bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]);
    bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]);

    dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1);
    dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1);

    dm_S  = gmx_simd4_max_pr(dl_S, dh_S);
    dm0_S = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr());

    return gmx_simd4_dotproduct3(dm0_S, dm0_S);

/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
        gmx_simd4_pr dx_0, dy_0, dz_0;               \
        gmx_simd4_pr dx_1, dy_1, dz_1;               \
        gmx_simd4_pr mx, my, mz;                     \
        gmx_simd4_pr m0x, m0y, m0z;                  \
        gmx_simd4_pr d2x, d2y, d2z;                  \
        gmx_simd4_pr d2s, d2t;                       \
        shi = si*NNBSBB_D*DIM;                       \
        xi_l = gmx_simd4_load_bb_pr(bb_i+shi+0*STRIDE_PBB); \
        yi_l = gmx_simd4_load_bb_pr(bb_i+shi+1*STRIDE_PBB); \
        zi_l = gmx_simd4_load_bb_pr(bb_i+shi+2*STRIDE_PBB); \
        xi_h = gmx_simd4_load_bb_pr(bb_i+shi+3*STRIDE_PBB); \
        yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB); \
        zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB); \
        dx_0 = gmx_simd4_sub_pr(xi_l, xj_h);         \
        dy_0 = gmx_simd4_sub_pr(yi_l, yj_h);         \
        dz_0 = gmx_simd4_sub_pr(zi_l, zj_h);         \
        dx_1 = gmx_simd4_sub_pr(xj_l, xi_h);         \
        dy_1 = gmx_simd4_sub_pr(yj_l, yi_h);         \
        dz_1 = gmx_simd4_sub_pr(zj_l, zi_h);         \
        mx   = gmx_simd4_max_pr(dx_0, dx_1);         \
        my   = gmx_simd4_max_pr(dy_0, dy_1);         \
        mz   = gmx_simd4_max_pr(dz_0, dz_1);         \
        m0x  = gmx_simd4_max_pr(mx, zero);           \
        m0y  = gmx_simd4_max_pr(my, zero);           \
        m0z  = gmx_simd4_max_pr(mz, zero);           \
        d2x  = gmx_simd4_mul_pr(m0x, m0x);           \
        d2y  = gmx_simd4_mul_pr(m0y, m0y);           \
        d2z  = gmx_simd4_mul_pr(m0z, m0z);           \
        d2s  = gmx_simd4_add_pr(d2x, d2y);           \
        d2t  = gmx_simd4_add_pr(d2s, d2z);           \
        gmx_simd4_store_pr(d2+si, d2t);              \

/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
                                     int nsi, const float *bb_i,
    gmx_simd4_pr xj_l, yj_l, zj_l;
    gmx_simd4_pr xj_h, yj_h, zj_h;
    gmx_simd4_pr xi_l, yi_l, zi_l;
    gmx_simd4_pr xi_h, yi_h, zi_h;

    zero = gmx_simd4_setzero_pr();

    xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]);
    yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]);
    zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]);
    xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]);
    yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]);
    zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]);

    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
     * But as we know the number of iterations is 1 or 2, we unroll manually.
     */
    SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
    if (STRIDE_PBB < nsi)
        SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);

#endif /* NBNXN_SEARCH_BB_SIMD4 */
, d2
);
2174 #endif /* NBNXN_SEARCH_BB_SIMD4 */
/* Plain C function which determines if any atom pair between two cells
 * is within distance sqrt(rl2).
 */
static gmx_bool subc_in_range_x(int na_c,
                                int si, const real *x_i,
                                int csj, int stride, const real *x_j,
    for (i = 0; i < na_c; i++)
        i0 = (si*na_c + i)*DIM;
        for (j = 0; j < na_c; j++)
            j0 = (csj*na_c + j)*stride;

            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
                 sqr(x_i[i0+1] - x_j[j0+1]) +
                 sqr(x_i[i0+2] - x_j[j0+2]);
+2] - x_j
[j0
+2]);
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
/* When we make separate single/double precision SIMD vector operation
 * include files, this function should be moved there (also using FMA).
 */
static inline gmx_simd4_pr
gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z)
    return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) );

/* 4-wide SIMD function which determines if any atom pair between two cells,
 * both with 8 atoms, is within distance sqrt(rl2).
 * Using 8-wide AVX is not faster on Intel Sandy Bridge.
 */
static gmx_bool subc_in_range_simd4(int na_c,
                                    int si, const real *x_i,
                                    int csj, int stride, const real *x_j,
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
    gmx_simd4_pr ix_S0, iy_S0, iz_S0;
    gmx_simd4_pr ix_S1, iy_S1, iz_S1;

    rc2_S = gmx_simd4_set1_pr(rl2);

    dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
    ix_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
    iy_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
    iz_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
    ix_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
    iy_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
    iz_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);

    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
        gmx_simd4_pr jx0_S, jy0_S, jz0_S;
        gmx_simd4_pr jx1_S, jy1_S, jz1_S;

        gmx_simd4_pr dx_S0, dy_S0, dz_S0;
        gmx_simd4_pr dx_S1, dy_S1, dz_S1;
        gmx_simd4_pr dx_S2, dy_S2, dz_S2;
        gmx_simd4_pr dx_S3, dy_S3, dz_S3;

        gmx_simd4_pr rsq_S0;
        gmx_simd4_pr rsq_S1;
        gmx_simd4_pr rsq_S2;
        gmx_simd4_pr rsq_S3;

        gmx_simd4_pb wco_S0;
        gmx_simd4_pb wco_S1;
        gmx_simd4_pb wco_S2;
        gmx_simd4_pb wco_S3;
        gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S;

        jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]);
        jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]);
        jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]);

        jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]);
        jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]);
        jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]);

        /* Calculate distance */
        dx_S0 = gmx_simd4_sub_pr(ix_S0, jx0_S);
        dy_S0 = gmx_simd4_sub_pr(iy_S0, jy0_S);
        dz_S0 = gmx_simd4_sub_pr(iz_S0, jz0_S);
        dx_S1 = gmx_simd4_sub_pr(ix_S1, jx0_S);
        dy_S1 = gmx_simd4_sub_pr(iy_S1, jy0_S);
        dz_S1 = gmx_simd4_sub_pr(iz_S1, jz0_S);
        dx_S2 = gmx_simd4_sub_pr(ix_S0, jx1_S);
        dy_S2 = gmx_simd4_sub_pr(iy_S0, jy1_S);
        dz_S2 = gmx_simd4_sub_pr(iz_S0, jz1_S);
        dx_S3 = gmx_simd4_sub_pr(ix_S1, jx1_S);
        dy_S3 = gmx_simd4_sub_pr(iy_S1, jy1_S);
        dz_S3 = gmx_simd4_sub_pr(iz_S1, jz1_S);

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_S0 = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
        rsq_S1 = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
        rsq_S2 = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
        rsq_S3 = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3);

        wco_S0 = gmx_simd4_cmplt_pr(rsq_S0, rc2_S);
        wco_S1 = gmx_simd4_cmplt_pr(rsq_S1, rc2_S);
        wco_S2 = gmx_simd4_cmplt_pr(rsq_S2, rc2_S);
        wco_S3 = gmx_simd4_cmplt_pr(rsq_S3, rc2_S);

        wco_any_S01 = gmx_simd4_or_pb(wco_S0, wco_S1);
        wco_any_S23 = gmx_simd4_or_pb(wco_S2, wco_S3);
        wco_any_S   = gmx_simd4_or_pb(wco_any_S01, wco_any_S23);

        if (gmx_simd4_anytrue_pb(wco_any_S))

    gmx_incons("SIMD4 function called without 4-wide SIMD support");
2310 if (gmx_simd4_anytrue_pb(wco_any_S
))
2322 gmx_incons("SIMD4 function called without 4-wide SIMD support");
/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];

/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
].imei
[0].imask
;
/* Ensures there is enough space for extra exclusion masks */
static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
    if (nbl->nexcl+extra > nbl->excl_nalloc)
        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
        nbnxn_realloc_void((void **)&nbl->excl,
                           nbl->nexcl*sizeof(*nbl->excl),
                           nbl->excl_nalloc*sizeof(*nbl->excl),
                           nbl->alloc, nbl->free);
, nbl
->free
);
/* Ensures there is enough space for ncell extra j-cells in the list */
static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
    cj_max = nbl->ncj + ncell;

    if (cj_max > nbl->cj_nalloc)
        nbl->cj_nalloc = over_alloc_small(cj_max);
        nbnxn_realloc_void((void **)&nbl->cj,
                           nbl->ncj*sizeof(*nbl->cj),
                           nbl->cj_nalloc*sizeof(*nbl->cj),
                           nbl->alloc, nbl->free);
, nbl
->free
);
/* Ensures there is enough space for ncell extra j-subcells in the list */
static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
    int ncj4_max, j4, j, w, t;

#define WARP_SIZE 32

    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
    /* We can store 4 j-subcell - i-supercell pairs in one struct.
     * Since we round down, we need one extra entry.
     */
    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);

    if (ncj4_max > nbl->cj4_nalloc)
        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
        nbnxn_realloc_void((void **)&nbl->cj4,
                           nbl->work->cj4_init*sizeof(*nbl->cj4),
                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
                           nbl->alloc, nbl->free);

    if (ncj4_max > nbl->work->cj4_init)
        for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
            /* No i-subcells and no excl's in the list initially */
            for (w = 0; w < NWARP; w++)
                nbl->cj4[j4].imei[w].imask    = 0U;
                nbl->cj4[j4].imei[w].excl_ind = 0;

        nbl->work->cj4_init = ncj4_max;
->cj4_init
= ncj4_max
;
/* Sets all exclusion masks for one GPU warp to no exclusions */
static void set_no_excls(nbnxn_excl_t *excl)
{
    int t;

    for (t = 0; t < WARP_SIZE; t++)
    {
        /* Turn all interaction bits on */
        excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
    }
}
/* Initializes a single nbnxn_pairlist_t data structure */
static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
                                gmx_bool bSimple,
                                nbnxn_alloc_t *alloc,
                                nbnxn_free_t *free)
{
    if (alloc == NULL)
    {
        nbl->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbl->alloc = alloc;
    }
    if (free == NULL)
    {
        nbl->free = nbnxn_free_aligned;
    }
    else
    {
        nbl->free = free;
    }

    nbl->bSimple = bSimple;

    /* We need one element extra in sj, so alloc initially with 1 */
    nbl->cj4_nalloc = 0;

    nbl->excl_nalloc = 0;

    check_excl_space(nbl, 1);

    set_no_excls(&nbl->excl[0]);

    if (bSimple)
    {
        snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN);
    }
    else
    {
#ifdef NBNXN_BBXXXX
        snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN);
#else
        snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
#endif
    }

    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
#endif
    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);

    nbl->work->sort            = NULL;
    nbl->work->sort_nalloc     = 0;
    nbl->work->sci_sort        = NULL;
    nbl->work->sci_sort_nalloc = 0;
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
                             gmx_bool bSimple, gmx_bool bCombined,
                             nbnxn_alloc_t *alloc,
                             nbnxn_free_t *free)
{
    int i;

    nbl_list->bSimple   = bSimple;
    nbl_list->bCombined = bCombined;

    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);

    if (!nbl_list->bCombined &&
        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
    {
        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
    }

    snew(nbl_list->nbl, nbl_list->nnbl);
    /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
    for (i = 0; i < nbl_list->nnbl; i++)
    {
        /* Allocate the nblist data structure locally on each thread
         * to optimize memory access for NUMA architectures.
         */
        snew(nbl_list->nbl[i], 1);

        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
        if (i == 0)
        {
            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
        }
        else
        {
            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
                                           const nbnxn_search_t nbs, real rl)
{
    const nbnxn_grid_t *grid;
    int                 cs[SHIFTS];
    int                 npexcl;
    int                 s, i, j;

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp, "nbl nci %d ncj %d\n",
            nbl->nci, nbl->ncj);
    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
            nbl->ncj/(double)grid->nc*grid->na_sc,
            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));

    fprintf(fp, "nbl average j cell list length %.1f\n",
            0.25*nbl->ncj/(double)nbl->nci);

    npexcl = 0;
    for (s = 0; s < SHIFTS; s++)
    {
        cs[s] = 0;
    }
    for (i = 0; i < nbl->nci; i++)
    {
        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;

        j = nbl->ci[i].cj_ind_start;
        while (j < nbl->ci[i].cj_ind_end &&
               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            npexcl++;
            j++;
        }
    }
    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
    for (s = 0; s < SHIFTS; s++)
    {
        if (cs[s] > 0)
        {
            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
                                             const nbnxn_search_t nbs, real rl)
{
    const nbnxn_grid_t *grid;
    int                 i, j4, j, si, b;
    int                 c[GPU_NSUBCELL+1];

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));

    fprintf(fp, "nbl average j super cell list length %.1f\n",
            0.25*nbl->ncj4/(double)nbl->nsci);
    fprintf(fp, "nbl average i sub cell list length %.1f\n",
            nbl->nci_tot/((double)nbl->ncj4));

    for (si = 0; si <= GPU_NSUBCELL; si++)
    {
        c[si] = 0;
    }
    for (i = 0; i < nbl->nsci; i++)
    {
        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
        {
            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
            {
                b = 0;
                for (si = 0; si < GPU_NSUBCELL; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
                    {
                        b++;
                    }
                }
                c[b]++;
            }
        }
    }
    for (b = 0; b <= GPU_NSUBCELL; b++)
    {
        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
    }
}
/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
                                   int warp, nbnxn_excl_t **excl)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* No exclusions set, make a new list entry */
        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
        nbl->nexcl++;
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
        set_no_excls(*excl);
    }
    else
    {
        /* We already have some exclusions, new ones can be added to the list */
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
    }
}

/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 * allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
                                 int warp, nbnxn_excl_t **excl)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* We need to make a new list entry, check if we have space */
        check_excl_space(nbl, 1);
    }
    low_get_nbl_exclusions(nbl, cj4, warp, excl);
}

/* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps,
 * allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
                                 nbnxn_excl_t **excl_w0,
                                 nbnxn_excl_t **excl_w1)
{
    /* Check for space we might need */
    check_excl_space(nbl, 2);

    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
}
/* Sets the self exclusions i=j and pair exclusions i>j */
static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
                                               int cj4_ind, int sj_offset,
                                               int si)
{
    nbnxn_excl_t *excl[2];
    int           ei, ej, w;

    /* Here we only set the self and double pair exclusions */

    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);

    /* Only minor < major bits set */
    for (ej = 0; ej < nbl->na_ci; ej++)
    {
        w = (ej >> 2);
        for (ei = ej; ei < nbl->na_ci; ei++)
        {
            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
                ~(1U << (sj_offset*GPU_NSUBCELL + si));
        }
    }
}
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

#ifdef GMX_NBNXN_SIMD
#if GMX_SIMD_WIDTH_HERE == 2
#define get_imask_simd_4xn  get_imask_simd_j2
#endif
#if GMX_SIMD_WIDTH_HERE == 4
#define get_imask_simd_4xn  get_imask_simd_j4
#endif
#if GMX_SIMD_WIDTH_HERE == 8
#define get_imask_simd_4xn  get_imask_simd_j8
#define get_imask_simd_2xnn get_imask_simd_j4
#endif
#if GMX_SIMD_WIDTH_HERE == 16
#define get_imask_simd_2xnn get_imask_simd_j8
#endif
#endif
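/* Example of how the cj-size=8 variant above pairs clusters (numbers are
 * only illustrative): with 4-atom i-clusters and 8-atom j-clusters,
 * j-cluster cj covers the same atoms as i-clusters 2*cj and 2*cj+1, so the
 * half-diagonal masks _J8_0 and _J8_1 are selected for ci == cj*2 and
 * ci == cj*2+1, respectively; all other cluster pairs get the full
 * interaction mask.
 */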
/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
                                     nbnxn_pairlist_t *nbl,
                                     int ci, int cjf, int cjl,
                                     gmx_bool remove_sub_diag,
                                     const real *x_j,
                                     real rl2, float rbb2,
                                     int *ndistc)
{
    const nbnxn_list_work_t *work;
    const nbnxn_bb_t        *bb_ci;
    const real              *x_ci;

    gmx_bool                 InRange;
    real                     d2;
    int                      cjf_gl, cjl_gl, cj;
    int                      i, j;

    bb_ci = nbl->work->bb_ci;
    x_ci  = nbl->work->x_ci;

    InRange = FALSE;
    while (!InRange && cjf <= cjl)
    {
        d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */

            cjf_gl = gridj->cell0 + cjf;
            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
            {
                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
                {
                    InRange = InRange ||
                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
                }
            }
            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;

    }

    InRange = FALSE;
    while (!InRange && cjl > cjf)
    {
        d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */

            cjl_gl = gridj->cell0 + cjl;
            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
            {
                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
                {
                    InRange = InRange ||
                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
                }
            }
            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;

    }

    for (cj = cjf; cj <= cjl; cj++)
    {
        /* Store cj and the interaction mask */
        nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
        nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
        nbl->ncj++;
    }
    /* Increase the closing index in i super-cell list */
    nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
}

#ifdef GMX_NBNXN_SIMD_4XN
#include "nbnxn_search_simd_4xn.h"
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
#include "nbnxn_search_simd_2xnn.h"
#endif
/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_supersub(const nbnxn_search_t nbs,
                                       const nbnxn_grid_t *gridi,
                                       const nbnxn_grid_t *gridj,
                                       nbnxn_pairlist_t *nbl,
                                       int sci, int scj,
                                       gmx_bool sci_equals_scj,
                                       int stride, const real *x,
                                       real rl2, float rbb2,
                                       int *ndistc)
{
    int               cjo, ci1, ci, cj, cj_gl;
    int               cj4_ind, cj_offset;
    unsigned          imask;
    int               npair;
    int               w;
    const float      *pbb_ci;
    float            *d2l;
    const nbnxn_bb_t *bb_ci;
    const real       *x_ci;
    int               na_c = gridj->na_c;
    nbnxn_cj4_t      *cj4;

#define PRUNE_LIST_CPU_ONE
#ifdef PRUNE_LIST_CPU_ONE
    int ci_last = -1;
#endif

    d2l = nbl->work->d2;

    pbb_ci = nbl->work->pbb_ci;

    bb_ci = nbl->work->bb_ci;

    x_ci = nbl->work->x_ci;

    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
    {
        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
        cj4       = &nbl->cj4[cj4_ind];

        cj = scj*GPU_NSUBCELL + cjo;

        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;

        /* Initialize this j-subcell i-subcell list */
        cj4->cj[cj_offset] = cj_gl;

        if (sci_equals_scj)
        {
            ci1 = cjo + 1;
        }
        else
        {
            ci1 = gridi->nsubc[sci];
        }

        /* Determine all ci1 bb distances in one call with SIMD4 */
        subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
                                 ci1, pbb_ci, d2l);

        npair = 0;
        imask = 0;
        /* We use a fixed upper-bound instead of ci1 to help optimization */
        for (ci = 0; ci < GPU_NSUBCELL; ci++)
        {
#ifndef NBNXN_BBXXXX
            /* Determine the bb distance between ci and cj */
            d2l[ci] = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
#endif

#ifdef PRUNE_LIST_CPU_ALL
            /* Check if the distance is within the distance where
             * we use only the bounding box distance rbb,
             * or within the cut-off and there is at least one atom pair
             * within the cut-off. This check is very costly.
             */
            *ndistc += na_c*na_c;
            if (d2l[ci] < rbb2 ||
                (d2l[ci] < rl2 &&
#ifdef NBNXN_PBB_SIMD4
                 subc_in_range_simd4
#else
                 subc_in_range_x
#endif
                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
#else
            /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
             */
            if (d2l[ci] < rl2)
#endif
            {
                /* Flag this i-subcell to be taken into account */
                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));

#ifdef PRUNE_LIST_CPU_ONE
                ci_last = ci;
#endif

                npair++;
            }
        }

#ifdef PRUNE_LIST_CPU_ONE
        /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we could get rid of it.
         */
        if (npair == 1 && d2l[ci_last] >= rbb2)
        {
            /* Avoid using function pointers here, as it's slower */
            if (
#ifdef NBNXN_PBB_SIMD4
                !subc_in_range_simd4
#else
                !subc_in_range_x
#endif
                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
            {
                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
                npair--;
            }
        }
#endif

        if (npair > 0)
        {
            /* We have a useful sj entry, close it now */

            /* Set the exclusions for the ci == sj entry.
             * Here we don't bother to check if this entry is actually flagged,
             * as it will nearly always be in the list.
             */
            if (sci_equals_scj)
            {
                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
            }

            /* Copy the cluster interaction mask to the list */
            for (w = 0; w < NWARP; w++)
            {
                cj4->imei[w].imask |= imask;
            }

            nbl->work->cj_ind++;

            /* Keep the count */
            nbl->nci_tot += npair;

            /* Increase the closing index in i super-cell list */
            nbl->sci[nbl->nsci].cj4_ind_end =
                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        }
    }
}
/* Set all atom-pair exclusions from the topology stored in excl
 * as masks in the pair-list for simple list i-entry nbl_ci
 */
static void set_ci_top_excls(const nbnxn_search_t nbs,
                             nbnxn_pairlist_t *nbl,
                             gmx_bool diagRemoved,
                             int na_ci_2log,
                             int na_cj_2log,
                             const nbnxn_ci_t *nbl_ci,
                             const t_blocka *excl)
{
    int           ci;
    int           cj_ind_first, cj_ind_last;
    int           cj_first, cj_last;
    int           ndirect;
    int           i, ai, aj, si, eind, ge, se;
    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
    int           cj_m;
    nbnxn_excl_t *nbl_excl;
    int           inner_i, inner_e;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_first = nbl_ci->cj_ind_start;
    cj_ind_last  = nbl->ncj - 1;

    cj_first = nbl->cj[cj_ind_first].cj;
    cj_last  = nbl->cj[cj_ind_last].cj;

    /* Determine how many contiguous j-cells we have starting
     * from the first i-cell. This number can be used to directly
     * calculate j-cell indices for excluded atoms.
     */
    ndirect = 0;
    if (na_ci_2log == na_cj_2log)
    {
        while (cj_ind_first + ndirect <= cj_ind_last &&
               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
        {
            ndirect++;
        }
    }
#ifdef NBNXN_SEARCH_BB_SIMD4
    else
    {
        while (cj_ind_first + ndirect <= cj_ind_last &&
               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
        {
            ndirect++;
        }
    }
#endif

    /* Loop over the atoms in the i super-cell */
    for (i = 0; i < nbl->na_sc; i++)
    {
        ai = nbs->a[ci*nbl->na_sc+i];

        si = (i>>na_ci_2log);

        /* Loop over the topology-based exclusions for this i-atom */
        for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
        {
            aj = excl->a[eind];

            /* The self exclusions are already set, save some time */
            if (aj == ai)
            {
                continue;
            }

            ge = nbs->cell[aj];

            /* Without shifts we only calculate interactions j>i
             * for one-way pair-lists.
             */
            if (diagRemoved && ge <= ci*nbl->na_sc + i)
            {
                continue;
            }

            se = (ge >> na_cj_2log);

            /* Could the cluster se be in our list? */
            if (se >= cj_first && se <= cj_last)
            {
                if (se < cj_first + ndirect)
                {
                    /* We can calculate cj_ind directly from se */
                    found = cj_ind_first + se - cj_first;
                }
                else
                {
                    /* Search for se using bisection */
                    found    = -1;
                    cj_ind_0 = cj_ind_first + ndirect;
                    cj_ind_1 = cj_ind_last + 1;
                    while (found == -1 && cj_ind_0 < cj_ind_1)
                    {
                        cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;

                        cj_m = nbl->cj[cj_ind_m].cj;

                        if (se == cj_m)
                        {
                            found = cj_ind_m;
                        }
                        else if (se < cj_m)
                        {
                            cj_ind_1 = cj_ind_m;
                        }
                        else
                        {
                            cj_ind_0 = cj_ind_m + 1;
                        }
                    }
                }

                if (found >= 0)
                {
                    inner_i = i  - (si << na_ci_2log);
                    inner_e = ge - (se << na_cj_2log);

                    nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
                    /* The next code line is usually not needed. We do not want to version
                     * away the above line, because there is logic that relies on being
                     * able to detect easily whether any exclusions exist. */
#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
                    nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
#endif
                }
            }
        }
    }
}
/* Set all atom-pair exclusions from the topology stored in excl
 * as masks in the pair-list for i-super-cell entry nbl_sci
 */
static void set_sci_top_excls(const nbnxn_search_t nbs,
                              nbnxn_pairlist_t *nbl,
                              gmx_bool diagRemoved,
                              int na_c_2log,
                              const nbnxn_sci_t *nbl_sci,
                              const t_blocka *excl)
{
    int           na_c;
    int           sci;
    int           cj_ind_first, cj_ind_last;
    int           cj_first, cj_last;
    int           ndirect;
    int           i, ai, aj, si, eind, ge, se;
    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
    int           cj_m;
    nbnxn_excl_t *nbl_excl;
    int           inner_i, inner_e, w;

    na_c = nbl->na_ci;

    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
    {
        /* Empty list */
        return;
    }

    sci = nbl_sci->sci;

    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
    cj_ind_last  = nbl->work->cj_ind - 1;

    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
    cj_last  = nbl_cj(nbl, cj_ind_last);

    /* Determine how many contiguous j-clusters we have starting
     * from the first i-cluster. This number can be used to directly
     * calculate j-cluster indices for excluded atoms.
     */
    ndirect = 0;
    while (cj_ind_first + ndirect <= cj_ind_last &&
           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
    {
        ndirect++;
    }

    /* Loop over the atoms in the i super-cell */
    for (i = 0; i < nbl->na_sc; i++)
    {
        ai = nbs->a[sci*nbl->na_sc+i];

        si = (i>>na_c_2log);

        /* Loop over the topology-based exclusions for this i-atom */
        for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
        {
            aj = excl->a[eind];

            /* The self exclusions are already set, save some time */
            if (aj == ai)
            {
                continue;
            }

            ge = nbs->cell[aj];

            /* Without shifts we only calculate interactions j>i
             * for one-way pair-lists.
             */
            if (diagRemoved && ge <= sci*nbl->na_sc + i)
            {
                continue;
            }

            se = ge >> na_c_2log;

            /* Could the cluster se be in our list? */
            if (se >= cj_first && se <= cj_last)
            {
                if (se < cj_first + ndirect)
                {
                    /* We can calculate cj_ind directly from se */
                    found = cj_ind_first + se - cj_first;
                }
                else
                {
                    /* Search for se using bisection */
                    found    = -1;
                    cj_ind_0 = cj_ind_first + ndirect;
                    cj_ind_1 = cj_ind_last + 1;
                    while (found == -1 && cj_ind_0 < cj_ind_1)
                    {
                        cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;

                        cj_m = nbl_cj(nbl, cj_ind_m);

                        if (se == cj_m)
                        {
                            found = cj_ind_m;
                        }
                        else if (se < cj_m)
                        {
                            cj_ind_1 = cj_ind_m;
                        }
                        else
                        {
                            cj_ind_0 = cj_ind_m + 1;
                        }
                    }
                }

                if (found >= 0)
                {
                    inner_i = i  - si*na_c;
                    inner_e = ge - se*na_c;

/* Macro for getting the index of atom a within a cluster */
#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
/* Macro for converting an atom number to a cluster number */
#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
/* Macro for getting the index of an i-atom within a warp */
#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
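/* Example of the macros above (assuming NBNXN_GPU_JGROUP_SIZE == 4 and
 * NBNXN_GPU_CLUSTER_SIZE == 8; the real values come from nbnxn_consts.h and
 * these numbers are only illustrative): A2CJ4(13) = 3 selects the cj4 group
 * and AMODCJ4(13) = 1 the slot within that group, while AMODWI maps an atom
 * index to its offset within half a cluster (one warp's part of the
 * exclusion mask), e.g. AMODWI(5) = 1.
 */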
                    if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
                    {

                        get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);

                        nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
                            ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
                    }
                }
            }
        }
    }
}
/* Reallocate the simple ci list for at least n entries */
static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
{
    nbl->ci_nalloc = over_alloc_small(n);
    nbnxn_realloc_void((void **)&nbl->ci,
                       nbl->nci*sizeof(*nbl->ci),
                       nbl->ci_nalloc*sizeof(*nbl->ci),
                       nbl->alloc, nbl->free);
}

/* Reallocate the super-cell sci list for at least n entries */
static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
{
    nbl->sci_nalloc = over_alloc_small(n);
    nbnxn_realloc_void((void **)&nbl->sci,
                       nbl->nsci*sizeof(*nbl->sci),
                       nbl->sci_nalloc*sizeof(*nbl->sci),
                       nbl->alloc, nbl->free);
}
/* Make a new ci entry at index nbl->nci */
static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags,
                         nbnxn_list_work_t *work)
{
    if (nbl->nci + 1 > nbl->ci_nalloc)
    {
        nb_realloc_ci(nbl, nbl->nci+1);
    }
    nbl->ci[nbl->nci].ci           = ci;
    nbl->ci[nbl->nci].shift        = shift;
    /* Store the interaction flags along with the shift */
    nbl->ci[nbl->nci].shift       |= flags;
    nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
    nbl->ci[nbl->nci].cj_ind_end   = nbl->ncj;
}

/* Make a new sci entry at index nbl->nsci */
static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift, int flags,
                          nbnxn_list_work_t *work)
{
    if (nbl->nsci + 1 > nbl->sci_nalloc)
    {
        nb_realloc_sci(nbl, nbl->nsci+1);
    }
    nbl->sci[nbl->nsci].sci           = sci;
    nbl->sci[nbl->nsci].shift         = shift;
    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
}
/* Sort the simple j-list cj on exclusions.
 * Entries with exclusions will all be sorted to the beginning of the list.
 */
static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
                         nbnxn_list_work_t *work)
{
    int jnew, j;

    if (ncj > work->cj_nalloc)
    {
        work->cj_nalloc = over_alloc_large(ncj);
        srenew(work->cj, work->cj_nalloc);
    }

    /* Make a list of the j-cells involving exclusions */
    jnew = 0;
    for (j = 0; j < ncj; j++)
    {
        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            work->cj[jnew++] = cj[j];
        }
    }
    /* Check if there are exclusions at all or not just the first entry */
    if (!((jnew == 0) ||
          (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
    {
        for (j = 0; j < ncj; j++)
        {
            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
            {
                work->cj[jnew++] = cj[j];
            }
        }
        for (j = 0; j < ncj; j++)
        {
            cj[j] = work->cj[j];
        }
    }
}
/* Close this simple list i entry */
static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
{
    int jlen;

    /* All content of the new ci entry has already been filled correctly,
     * we only need to increase the count here (for non empty lists).
     */
    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
    if (jlen > 0)
    {
        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);

        /* The counts below are used for non-bonded pair/flop counts
         * and should therefore match the available kernel setups.
         */
        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
        {
            nbl->work->ncj_noq += jlen;
        }
        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
        {
            nbl->work->ncj_hlj += jlen;
        }

        nbl->nci++;
    }
}
/* Split sci entry for load balancing on the GPU.
 * Splitting ensures we have enough lists to fully utilize the whole GPU.
 * With progBal we generate progressively smaller lists, which improves
 * load balancing. As we only know the current count on our own thread,
 * we will need to estimate the current total amount of i-entries.
 * As the lists get concatenated later, this estimate depends
 * both on nthread and our own thread index.
 */
static void split_sci_entry(nbnxn_pairlist_t *nbl,
                            int nsp_max_av, gmx_bool progBal, int nc_bal,
                            int thread, int nthread)
{
    int nsci_est;
    int nsp_max;
    int cj4_start, cj4_end, j4len, cj4;
    int sci;
    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
    int p;

    if (progBal)
    {
        /* Estimate the total numbers of ci's of the nblist combined
         * over all threads using the target number of ci's.
         */
        nsci_est = nc_bal*thread/nthread + nbl->nsci;

        /* The first ci blocks should be larger, to avoid overhead.
         * The last ci blocks should be smaller, to improve load balancing.
         */
        nsp_max = max(1,
                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
    }
    else
    {
        nsp_max = nsp_max_av;
    }

    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
    j4len     = cj4_end - cj4_start;

    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
    {
        /* Remove the last ci entry and process the cj4's again */
        nbl->nsci -= 1;

        sci       = nbl->nsci;
        nsp       = 0;
        nsp_sci   = 0;
        nsp_cj4_e = 0;
        nsp_cj4   = 0;
        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
        {
            nsp_cj4_p = nsp_cj4;
            /* Count the number of cluster pairs in this cj4 group */
            nsp_cj4 = 0;
            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
            {
                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
            }

            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
            {
                /* Split the list at cj4 */
                nbl->sci[sci].cj4_ind_end = cj4;
                /* Create a new sci entry */
                sci++;
                nbl->nsci++;
                if (nbl->nsci+1 > nbl->sci_nalloc)
                {
                    nb_realloc_sci(nbl, nbl->nsci+1);
                }
                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
                nbl->sci[sci].cj4_ind_start = cj4;
                nsp_sci                     = nsp;
                nsp_cj4_e                   = nsp_cj4_p;
                nsp                         = 0;
            }
            nsp += nsp_cj4;
        }

        /* Put the remaining cj4's in the last sci entry */
        nbl->sci[sci].cj4_ind_end = cj4_end;

        /* Possibly balance out the last two sci's
         * by moving the last cj4 of the second last sci.
         */
        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
        {
            nbl->sci[sci-1].cj4_ind_end--;
            nbl->sci[sci].cj4_ind_start--;
        }

        nbl->nsci++;
    }
}
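/* Rough numbers for the progressive limit above (an assumed example, not
 * taken from a specific run): with nsp_max_av = 80 and nc_bal = 100, an
 * entry split early, when nsci_est is close to 1, gets
 * nsp_max ~= 80*100*3/(2*100) = 120, while an entry split when nsci_est
 * approaches nc_bal gets ~= 80*100*3/(2*199) ~= 60, i.e. later lists are
 * allowed to be roughly half the size of early ones.
 */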
/* Close this super/sub list i entry */
static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
                                    int nsp_max_av,
                                    gmx_bool progBal, int nc_bal,
                                    int thread, int nthread)
{
    int j4len;

    /* All content of the new ci entry has already been filled correctly,
     * we only need to increase the count here (for non empty lists).
     */
    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
    if (j4len > 0)
    {
        /* We can only have complete blocks of 4 j-entries in a list,
         * so round the count up before closing.
         */
        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;

        nbl->nsci++;

        if (nsp_max_av > 0)
        {
            /* Measure the size of the new entry and potentially split it */
            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
        }
    }
}
/* Syncs the working array before adding another grid pair to the list */
static void sync_work(nbnxn_pairlist_t *nbl)
{

    nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
    nbl->work->cj4_init = nbl->ncj4;
}

/* Clears an nbnxn_pairlist_t data structure */
static void clear_pairlist(nbnxn_pairlist_t *nbl)
{

    nbl->work->ncj_noq = 0;
    nbl->work->ncj_hlj = 0;
}
/* Sets a simple list i-cell bounding box, including PBC shift */
static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
                                           real shx, real shy, real shz,
                                           nbnxn_bb_t *bb_ci)
{
    bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
    bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
    bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
    bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
    bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
    bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
}

/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
static void set_icell_bbxxxx_supersub(const float *bb, int ci,
                                      real shx, real shy, real shz,
                                      float *bb_ci)
{
    int ia, m, i;

    ia = ci*(GPU_NSUBCELL >> STRIDE_PBB_2LOG)*NNBSBB_XXXX;
    for (m = 0; m < (GPU_NSUBCELL >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
    {
        for (i = 0; i < STRIDE_PBB; i++)
        {
            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
        }
    }
}

/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
                                  real shx, real shy, real shz,
                                  nbnxn_bb_t *bb_ci)
{
    int i;

    for (i = 0; i < GPU_NSUBCELL; i++)
    {
        set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
                            shx, shy, shz,
                            &bb_ci[i]);
    }
}
/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
static void icell_set_x_simple(int ci,
                               real shx, real shy, real shz,
                               int na_c,
                               int stride, const real *x,
                               nbnxn_list_work_t *work)
{
    int ia, i;

    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;

    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
    {
        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
    }
}

/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
static void icell_set_x_supersub(int ci,
                                 real shx, real shy, real shz,
                                 int na_c,
                                 int stride, const real *x,
                                 nbnxn_list_work_t *work)
{
    int   ia, i;
    real *x_ci;

    x_ci = work->x_ci;

    ia = ci*GPU_NSUBCELL*na_c;
    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
    {
        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
    }
}

#ifdef NBNXN_SEARCH_BB_SIMD4
/* Copies PBC shifted super-cell packed atom coordinates to working array */
static void icell_set_x_supersub_simd4(int ci,
                                       real shx, real shy, real shz,
                                       int na_c,
                                       int stride, const real *x,
                                       nbnxn_list_work_t *work)
{
    int   si, io, ia, i, j;
    real *x_ci;

    x_ci = work->x_ci;

    for (si = 0; si < GPU_NSUBCELL; si++)
    {
        for (i = 0; i < na_c; i += STRIDE_PBB)
        {
            io = si*na_c + i;
            ia = ci*GPU_NSUBCELL*na_c + io;
            for (j = 0; j < STRIDE_PBB; j++)
            {
                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
            }
        }
    }
}
#endif
static real nbnxn_rlist_inc_nonloc_fac = 0.6;

/* Due to the cluster size the effective pair-list is longer than
 * that of a simple atom pair-list. This function gives the extra distance.
 */
real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
{
    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
}
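/* Worked example for the formula above (illustrative numbers only): with
 * cluster_size = 4 and an atom density of roughly 100 atoms/nm^3 (about
 * that of water), the increment is approximately
 * (0.5 + 0.6) * (3/4)^2 * (4/100)^(1/3) ~= 0.21 nm,
 * which is the order of magnitude of the effective rlist increase for
 * 4-atom clusters.
 */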
/* Estimates the interaction volume^2 for non-local interactions */
static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
{
    int  z, d;
    real cl, ca, za;
    real vold_est;
    real vol2_est_tot;

    vol2_est_tot = 0;

    /* Here we simply add up the non-home interaction volume^2 of the
     * 1, 2 or 3 1D decomposition zones.
     * As these volumes are not additive, this is an overestimate,
     * but it would only be significant in the limit of small cells,
     * where we anyhow need to split the lists into
     * as small parts as possible.
     */
    for (z = 0; z < zones->n; z++)
    {
        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
        {
            cl = 0;
            ca = 1;
            za = 1;
            for (d = 0; d < DIM; d++)
            {
                if (zones->shift[z][d] == 0)
                {
                    cl += 0.5*ls[d];
                    ca *= ls[d];
                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
                }
            }

            /* 4 octants of a sphere */
            vold_est  = 0.25*M_PI*r*r*r*r;
            /* 4 quarter pie slices on the edges */
            vold_est += 4*cl*M_PI/6.0*r*r*r;
            /* One rectangular volume on a face */
            vold_est += ca*0.5*r*r;

            vol2_est_tot += vold_est*za;
        }
    }

    return vol2_est_tot;
}
/* Estimates the average size of a full j-list for super/sub setup */
static int get_nsubpair_max(const nbnxn_search_t nbs,
                            int iloc,
                            real rlist,
                            int min_ci_balanced)
{
    const nbnxn_grid_t *grid;
    rvec                ls;
    real                xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
    int                 nsubpair_max;

    grid = &nbs->grid[0];

    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);

    /* The average squared length of the diagonal of a sub cell */
    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];

    /* The formulas below are a heuristic estimate of the average nsj per si */
    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);

    if (!nbs->DomDec || nbs->zones->n == 1)
    {
        nsp_est_nl = 0;
    }
    else
    {
        nsp_est_nl =
            sqr(grid->atom_density/grid->na_c)*
            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
    }

    if (LOCAL_I(iloc))
    {
        /* Sub-cell interacts with itself */
        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
        /* 6/2 rectangular volume on the faces */
        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
        /* 12/2 quarter pie slices on the edges */
        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
        /* 4 octants of a sphere */
        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);

        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;

        /* Subtract the non-local pair count */
        nsp_est -= nsp_est_nl;

        if (debug)
        {
            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
                    nsp_est, nsp_est_nl);
        }
    }
    else
    {
        nsp_est = nsp_est_nl;
    }

    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
    {
        /* We don't need to worry */
        nsubpair_max = -1;
    }
    else
    {
        /* Thus the (average) maximum j-list size should be as follows */
        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));

        /* Since the target value is a maximum (this avoids high outliers,
         * which lead to load imbalance), not average, we add half the
         * number of pairs in a cj4 block to get the average about right.
         */
        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
    }

    if (debug)
    {
        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
                nsp_est, nsubpair_max);
    }

    return nsubpair_max;
}
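/* Geometric sketch of the local estimate above: vol_est is the sub-cell's
 * own volume plus half of the surrounding shell of thickness r_eff_sup,
 * built from the face slabs, the quarter-cylinder edge pieces and the
 * sphere octants at the corners. The factor 1/2 on the shell terms avoids
 * counting each sub-cell pair twice; multiplying by the sub-cell count and
 * the atom density per cluster then turns this volume into a pair estimate.
 */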
/* Debug list print function */
static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
{
    int i, j;

    for (i = 0; i < nbl->nci; i++)
    {
        fprintf(fp, "ci %4d shift %2d ncj %3d\n",
                nbl->ci[i].ci, nbl->ci[i].shift,
                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);

        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
        {
            fprintf(fp, " cj %5d imask %x\n",
                    nbl->cj[j].cj, nbl->cj[j].excl);
        }
    }
}

/* Debug list print function */
static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
{
    int i, j4, j, ncp, si;

    for (i = 0; i < nbl->nsci; i++)
    {
        fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
                nbl->sci[i].sci, nbl->sci[i].shift,
                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);

        ncp = 0;
        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
        {
            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
            {
                fprintf(fp, " sj %5d imask %x\n",
                        nbl->cj4[j4].cj[j],
                        nbl->cj4[j4].imei[0].imask);
                for (si = 0; si < GPU_NSUBCELL; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
                    {
                        ncp++;
                    }
                }
            }
        }
        fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
                nbl->sci[i].sci, nbl->sci[i].shift,
                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
                ncp);
    }
}
/* Combine pair lists *nbl generated on multiple threads into nblc */
static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
                            nbnxn_pairlist_t *nblc)
{
    int nsci, ncj4, nexcl;
    int n, i;

    if (nblc->bSimple)
    {
        gmx_incons("combine_nblists does not support simple lists");
    }

    nsci  = nblc->nsci;
    ncj4  = nblc->ncj4;
    nexcl = nblc->nexcl;
    for (i = 0; i < nnbl; i++)
    {
        nsci  += nbl[i]->nsci;
        ncj4  += nbl[i]->ncj4;
        nexcl += nbl[i]->nexcl;
    }

    if (nsci > nblc->sci_nalloc)
    {
        nb_realloc_sci(nblc, nsci);
    }
    if (ncj4 > nblc->cj4_nalloc)
    {
        nblc->cj4_nalloc = over_alloc_small(ncj4);
        nbnxn_realloc_void((void **)&nblc->cj4,
                           nblc->ncj4*sizeof(*nblc->cj4),
                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
                           nblc->alloc, nblc->free);
    }
    if (nexcl > nblc->excl_nalloc)
    {
        nblc->excl_nalloc = over_alloc_small(nexcl);
        nbnxn_realloc_void((void **)&nblc->excl,
                           nblc->nexcl*sizeof(*nblc->excl),
                           nblc->excl_nalloc*sizeof(*nblc->excl),
                           nblc->alloc, nblc->free);
    }

    /* Each thread should copy its own data to the combined arrays,
     * as otherwise data will go back and forth between different caches.
     */
#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
    for (n = 0; n < nnbl; n++)
    {
        int                     sci_offset, cj4_offset, ci_offset, excl_offset;
        int                     i, j4;
        const nbnxn_pairlist_t *nbli;

        /* Determine the offset in the combined data for our thread */
        sci_offset  = nblc->nsci;
        cj4_offset  = nblc->ncj4;
        ci_offset   = nblc->nci_tot;
        excl_offset = nblc->nexcl;

        for (i = 0; i < n; i++)
        {
            sci_offset  += nbl[i]->nsci;
            cj4_offset  += nbl[i]->ncj4;
            ci_offset   += nbl[i]->nci_tot;
            excl_offset += nbl[i]->nexcl;
        }

        nbli = nbl[n];

        for (i = 0; i < nbli->nsci; i++)
        {
            nblc->sci[sci_offset+i]                = nbli->sci[i];
            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
        }

        for (j4 = 0; j4 < nbli->ncj4; j4++)
        {
            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
        }

        for (j4 = 0; j4 < nbli->nexcl; j4++)
        {
            nblc->excl[excl_offset+j4] = nbli->excl[j4];
        }
    }

    for (n = 0; n < nnbl; n++)
    {
        nblc->nsci    += nbl[n]->nsci;
        nblc->ncj4    += nbl[n]->ncj4;
        nblc->nci_tot += nbl[n]->nci_tot;
        nblc->nexcl   += nbl[n]->nexcl;
    }
}
/* Returns the next ci to be processed by our thread */
static gmx_bool next_ci(const nbnxn_grid_t *grid,
                        int conv,
                        int nth, int ci_block,
                        int *ci_x, int *ci_y,
                        int *ci_b, int *ci)
{
    (*ci_b)++;
    (*ci)++;

    if (*ci_b == ci_block)
    {
        /* Jump to the next block assigned to this task */
        *ci   += (nth - 1)*ci_block;
        *ci_b  = 0;
    }

    if (*ci >= grid->nc*conv)
    {
        return FALSE;
    }

    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
    {
        *ci_y += 1;
        if (*ci_y == grid->ncy)
        {
            *ci_x += 1;
            *ci_y  = 0;
        }
    }

    return TRUE;
}
/* Returns the distance^2 for which we put cell pairs in the list
 * without checking atom pair distances. This is usually < rlist^2.
 */
static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
                                        const nbnxn_grid_t *gridj,
                                        real rlist,
                                        gmx_bool simple)
{
    /* If the distance between two sub-cell bounding boxes is less
     * than this distance, do not check the distance between
     * all particle pairs in the sub-cell, since then it is likely
     * that the box pair has atom pairs within the cut-off.
     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
     * Using more than 0.5 gains at most 0.5%.
     * If forces are calculated more than twice, the performance gain
     * in the force calculation outweighs the cost of checking.
     * Note that with subcell lists, the atom-pair distance check
     * is only performed when only 1 out of 8 sub-cells is within range,
     * this is because the GPU is much faster than the CPU.
     */
    real bbx, bby;
    real rbb2;

    bbx = 0.5*(gridi->sx + gridj->sx);
    bby = 0.5*(gridi->sy + gridj->sy);
    if (!simple)
    {
        bbx /= GPU_NSUBCELL_X;
        bby /= GPU_NSUBCELL_Y;
    }

    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));

#ifndef GMX_DOUBLE
    return rbb2;
#else
    return (float)((1+GMX_FLOAT_EPS)*rbb2);
#endif
}
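/* Illustrative numbers for the formula above (not from a specific system):
 * with rlist = 1.0 nm and average cell sizes sx = sy = 0.3 nm, we get
 * bbx = bby = 0.3 and rbb2 ~= (1.0 - 0.5*sqrt(0.18))^2 ~= 0.62 nm^2, so
 * bounding-box pairs closer than about 0.79 nm are accepted without
 * checking individual atom-pair distances.
 */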
static int get_ci_block_size(const nbnxn_grid_t *gridi,
                             gmx_bool bDomDec, int nth)
{
    const int ci_block_enum      = 5;
    const int ci_block_denom     = 11;
    const int ci_block_min_atoms = 16;
    int       ci_block;

    /* Here we decide how to distribute the blocks over the threads.
     * We use prime numbers to try to avoid that the grid size becomes
     * a multiple of the number of threads, which would lead to some
     * threads getting "inner" pairs and others getting boundary pairs,
     * which in turn will lead to load imbalance between threads.
     * Set the block size as 5/11/ntask times the average number of cells
     * in a y,z slab. This should ensure a quite uniform distribution
     * of the grid parts over the different threads along all three grid
     * zone boundaries with 3D domain decomposition. At the same time
     * the blocks will not become too small.
     */
    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);

    /* Ensure the blocks are not too small: avoids cache invalidation */
    if (ci_block*gridi->na_sc < ci_block_min_atoms)
    {
        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
    }

    /* Without domain decomposition
     * or with less than 3 blocks per task, divide in nth blocks.
     */
    if (!bDomDec || ci_block*3*nth > gridi->nc)
    {
        ci_block = (gridi->nc + nth - 1)/nth;
    }

    return ci_block;
}
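/* Worked example for the block-size heuristic above (illustrative numbers
 * only): for a grid with nc = 1000 cells, ncx = 10 and nth = 4 threads,
 * ci_block = (1000*5)/(11*10*4) = 11 cells per block. With na_sc = 4 atoms
 * per cell this is well above ci_block_min_atoms, and 11*3*4 = 132 < 1000,
 * so the block size is kept as is when domain decomposition is used.
 */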
/* Generates the part of pair-list nbl assigned to our thread */
static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                     const nbnxn_grid_t *gridi,
                                     const nbnxn_grid_t *gridj,
                                     nbnxn_search_work_t *work,
                                     const nbnxn_atomdata_t *nbat,
                                     const t_blocka *excl,
                                     real rlist,
                                     int nb_kernel_type,
                                     int ci_block,
                                     gmx_bool bFBufferFlag,
                                     int nsubpair_max,
                                     gmx_bool progBal,
                                     int min_ci_balanced,
                                     int th, int nth,
                                     nbnxn_pairlist_t *nbl)
{
    int               ci_b, ci, ci_x, ci_y, ci_xy, cj;

    int               conv_i, cell0_i;
    const nbnxn_bb_t *bb_i   = NULL;
    const float      *pbb_i  = NULL;
    const float      *bbcz_i, *bbcz_j;

    real              bx0, bx1, by0, by1, bz0, bz1;

    real              d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
    int               cxf, cxl, cyf, cyf_x, cyl;

    int               c0, c1, cs, cf, cl;

    int               gridi_flag_shift = 0, gridj_flag_shift = 0;
    unsigned         *gridj_flag       = NULL;
    int               ncj_old_i, ncj_old_j;

    nbs_cycle_start(&work->cc[enbsCCsearch]);

    if (gridj->bSimple != nbl->bSimple)
    {
        gmx_incons("Grid incompatible with pair-list");
    }

    nbl->na_sc = gridj->na_sc;
    nbl->na_ci = gridj->na_c;
    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
    na_cj_2log = get_2log(nbl->na_cj);

    if (bFBufferFlag)
    {
        /* Determine conversion of clusters to flag blocks */
        gridi_flag_shift = 0;
        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
        {
            gridi_flag_shift++;
        }
        gridj_flag_shift = 0;
        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
        {
            gridj_flag_shift++;
        }

        gridj_flag = work->buffer_flags.flag;
    }

    copy_mat(nbs->box, box);

    rl2 = nbl->rlist*nbl->rlist;

    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);

    if (debug)
    {
        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
    }

    /* Set the shift range */
    for (d = 0; d < DIM; d++)
    {
        /* Check if we need periodicity shifts.
         * Without PBC or with domain decomposition we don't need them.
         */
        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
        {
            shp[d] = 0;
        }
        else
        {
            if (d == XX &&
                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
            {
                shp[d] = 2;
            }
            else
            {
                shp[d] = 1;
            }
        }
    }

    if (nbl->bSimple && !gridi->bSimple)
    {
        conv_i  = gridi->na_sc/gridj->na_sc;
        bb_i    = gridi->bb_simple;
        bbcz_i  = gridi->bbcz_simple;
        flags_i = gridi->flags_simple;
    }
    else
    {
        /* We use the normal bounding box format for both grid types */
        conv_i  = 1;

        bbcz_i  = gridi->bbcz;
        flags_i = gridi->flags;
    }
    cell0_i = gridi->cell0*conv_i;

    bbcz_j = gridj->bbcz;

    if (conv_i != 1)
    {
        /* Blocks of the conversion factor - 1 give a large repeat count
         * combined with a small block size. This should result in good
         * load balancing for both small and large domains.
         */
        ci_block = conv_i - 1;
    }
    if (debug)
    {
        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
    }

    ndistc   = 0;
    ncpcheck = 0;

    /* Initialize ci_b and ci to one position before where we want them to start,
     * as they will both be incremented in next_ci.
     */
    ci_b = -1;
    ci   = th*ci_block - 1;
    ci_x = 0;
    ci_y = 0;
    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
    {
        if (nbl->bSimple && flags_i[ci] == 0)
        {
            continue;
        }

        ncj_old_i = nbl->ncj;

        d2cx = 0;
        if (gridj != gridi && shp[XX] == 0)
        {
            if (nbl->bSimple)
            {
                bx1 = bb_i[ci].upper[BB_X];
            }
            else
            {
                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
            }
            if (bx1 < gridj->c0[XX])
            {
                d2cx = sqr(gridj->c0[XX] - bx1);

                if (d2cx >= rl2)
                {
                    continue;
                }
            }
        }

        ci_xy = ci_x*gridi->ncy + ci_y;

        /* Loop over shift vectors in three dimensions */
        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
        {
            shz = tz*box[ZZ][ZZ];

            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;

            d2z = sqr(bz0 - box[ZZ][ZZ]);

            d2z_cx = d2z + d2cx;

            bz1_frac =
                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));

            /* The check with bz1_frac close to or larger than 1 comes later */

            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
            {
                shy = ty*box[YY][YY] + tz*box[ZZ][YY];

                if (nbl->bSimple)
                {
                    by0 = bb_i[ci].lower[BB_Y] + shy;
                    by1 = bb_i[ci].upper[BB_Y] + shy;
                }
                else
                {
                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
                }

                get_cell_range(by0, by1,
                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
                               d2z_cx, rl2,
                               &cyf, &cyl);

                d2z_cy = d2z;
                if (by1 < gridj->c0[YY])
                {
                    d2z_cy += sqr(gridj->c0[YY] - by1);
                }
                else if (by0 > gridj->c1[YY])
                {
                    d2z_cy += sqr(by0 - gridj->c1[YY]);
                }

                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
                {
                    shift = XYZ2IS(tx, ty, tz);

#ifdef NBNXN_SHIFT_BACKWARD
                    if (gridi == gridj && shift > CENTRAL)
                    {
                        continue;
                    }
#endif

                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];

                    if (nbl->bSimple)
                    {
                        bx0 = bb_i[ci].lower[BB_X] + shx;
                        bx1 = bb_i[ci].upper[BB_X] + shx;
                    }
                    else
                    {
                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
                    }

                    get_cell_range(bx0, bx1,
                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
                                   d2z_cy, rl2,
                                   &cxf, &cxl);

                    if (nbl->bSimple)
                    {
                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
                                     nbl->work);
                    }
                    else
                    {
                        new_sci_entry(nbl, cell0_i+ci, shift, flags_i[ci],
                                      nbl->work);
                    }

#ifndef NBNXN_SHIFT_BACKWARD
                    if (cxf < ci_x)
#else
                    if (shift == CENTRAL && gridi == gridj &&
                        cxf < ci_x)
#endif
                    {
                        /* Leave the pairs with i > j.
                         * x is the major index, so skip half of it.
                         */
                        cxf = ci_x;
                    }

                    if (nbl->bSimple)
                    {
                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
                                            nbl->work->bb_ci);
                    }
                    else
                    {
#ifdef NBNXN_BBXXXX
                        set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
                                                  nbl->work->pbb_ci);
#else
                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
                                              nbl->work->bb_ci);
#endif
                    }

                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
                                     gridi->na_c, nbat->xstride, nbat->x,
                                     nbl->work);

                    for (cx = cxf; cx <= cxl; cx++)
                    {
                        d2zx = d2z;
                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
                        {
                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
                        }
                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
                        {
                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
                        }

#ifndef NBNXN_SHIFT_BACKWARD
                        if (gridi == gridj &&
                            cx == 0 && cyf < ci_y)
#else
                        if (gridi == gridj &&
                            cx == 0 && shift == CENTRAL && cyf < ci_y)
#endif
                        {
                            /* Leave the pairs with i > j.
                             * Skip half of y when i and j have the same x.
                             */
                            cyf_x = ci_y;
                        }
                        else
                        {
                            cyf_x = cyf;
                        }

                        for (cy = cyf_x; cy <= cyl; cy++)
                        {
                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
#ifdef NBNXN_SHIFT_BACKWARD
                            if (gridi == gridj &&
                                shift == CENTRAL && c0 < ci)
                            {
                                c0 = ci;
                            }
#endif

                            d2zxy = d2zx;
                            if (gridj->c0[YY] + cy*gridj->sy > by1)
                            {
                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
                            }
                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
                            {
                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
                            }
                            if (c1 > c0 && d2zxy < rl2)
                            {
                                cs = c0 + (int)(bz1_frac*(c1 - c0));
                                if (cs >= c1)
                                {
                                    cs = c1 - 1;
                                }

                                d2xy = d2zxy - d2z;

                                /* Find the lowest cell that can possibly
                                 * be within range.
                                 */
                                cf = cs;
                                while (cf > c0 &&
                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
                                {
                                    cf--;
                                }

                                /* Find the highest cell that can possibly
                                 * be within range.
                                 */
                                cl = cs;
                                while (cl < c1-1 &&
                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
                                {
                                    cl++;
                                }

#ifdef NBNXN_REFCODE
                                {
                                    /* Simple reference code, for debugging,
                                     * overrides the more complex code above.
                                     */
                                    for (k = c0; k < c1; k++)
                                    {
                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
                                            k < cf)
                                        {
                                            cf = k;
                                        }
                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
                                            k > cl)
                                        {
                                            cl = k;
                                        }
                                    }
                                }
#endif

                                if (gridi == gridj)
                                {
                                    /* We want each atom/cell pair only once,
                                     * only use cj >= ci.
                                     */
#ifndef NBNXN_SHIFT_BACKWARD
                                    cf = max(cf, ci);
#else
                                    if (shift == CENTRAL)
                                    {
                                        cf = max(cf, ci);
                                    }
#endif
                                }

                                if (cf <= cl)
                                {
                                    /* For f buffer flags with simple lists */
                                    ncj_old_j = nbl->ncj;

                                    switch (nb_kernel_type)
                                    {
                                        case nbnxnk4x4_PlainC:
                                            check_subcell_list_space_simple(nbl, cl-cf+1);

                                            make_cluster_list_simple(gridj,
                                                                     nbl, ci, cf, cl,
                                                                     (gridi == gridj && shift == CENTRAL),
                                                                     nbat->x,
                                                                     rl2, rbb2,
                                                                     &ndistc);
                                            break;
#ifdef GMX_NBNXN_SIMD_4XN
                                        case nbnxnk4xN_SIMD_4xN:
                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
                                            make_cluster_list_simd_4xn(gridj,
                                                                       nbl, ci, cf, cl,
                                                                       (gridi == gridj && shift == CENTRAL),
                                                                       nbat->x,
                                                                       rl2, rbb2,
                                                                       &ndistc);
                                            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
                                        case nbnxnk4xN_SIMD_2xNN:
                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
                                            make_cluster_list_simd_2xnn(gridj,
                                                                        nbl, ci, cf, cl,
                                                                        (gridi == gridj && shift == CENTRAL),
                                                                        nbat->x,
                                                                        rl2, rbb2,
                                                                        &ndistc);
                                            break;
#endif
                                        case nbnxnk8x8x8_PlainC:
                                        case nbnxnk8x8x8_CUDA:
                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
                                            for (cj = cf; cj <= cl; cj++)
                                            {
                                                make_cluster_list_supersub(nbs, gridi, gridj,
                                                                           nbl, ci, cj,
                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
                                                                           nbat->xstride, nbat->x,
                                                                           rl2, rbb2,
                                                                           &ndistc);
                                            }
                                            break;
                                    }
                                    ncpcheck += cl - cf + 1;

                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
                                    {
                                        int cbf, cbl, cb;

                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
                                        for (cb = cbf; cb <= cbl; cb++)
                                        {
                                            gridj_flag[cb] = 1U<<th;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    /* Set the exclusions for this ci list */
                    if (nbl->bSimple)
                    {
                        set_ci_top_excls(nbs,
                                         nbl,
                                         shift == CENTRAL && gridi == gridj,
                                         gridj->na_c_2log,
                                         na_cj_2log,
                                         &(nbl->ci[nbl->nci]),
                                         excl);
                    }
                    else
                    {
                        set_sci_top_excls(nbs,
                                          nbl,
                                          shift == CENTRAL && gridi == gridj,
                                          gridj->na_c_2log,
                                          &(nbl->sci[nbl->nsci]),
                                          excl);
                    }

                    /* Close this ci list */
                    if (nbl->bSimple)
                    {
                        close_ci_entry_simple(nbl);
                    }
                    else
                    {
                        close_ci_entry_supersub(nbl,
                                                nsubpair_max,
                                                progBal, min_ci_balanced,
                                                th, nth);
                    }
                }
            }
        }

        if (bFBufferFlag && nbl->ncj > ncj_old_i)
        {
            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
        }
    }

    work->ndistc = ndistc;

    nbs_cycle_stop(&work->cc[enbsCCsearch]);

    if (debug)
    {
        fprintf(debug, "number of distance checks %d\n", ndistc);
        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
                ncpcheck);

        if (nbl->bSimple)
        {
            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
        }
        else
        {
            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
        }
    }
}
static void reduce_buffer_flags(const nbnxn_search_t nbs,
                                int nsrc,
                                const nbnxn_buffer_flags_t *dest)
{
    int             s, b;
    const unsigned *flag;

    for (s = 0; s < nsrc; s++)
    {
        flag = nbs->work[s].buffer_flags.flag;

        for (b = 0; b < dest->nflag; b++)
        {
            dest->flag[b] |= flag[b];
        }
    }
}

static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
{
    int nelem, nkeep, ncopy, nred, b, c, out;

    nelem = 0;
    nkeep = 0;
    ncopy = 0;
    nred  = 0;
    for (b = 0; b < flags->nflag; b++)
    {
        if (flags->flag[b] == 1)
        {
            /* Only flag 0 is set, no copy of reduction required */
            nelem++;
            nkeep++;
        }
        else if (flags->flag[b] > 0)
        {
            c = 0;
            for (out = 0; out < nout; out++)
            {
                if (flags->flag[b] & (1U<<out))
                {
                    c++;
                }
            }
            nelem += c;
            if (c == 1)
            {
                ncopy++;
            }
            else
            {
                nred += c;
            }
        }
    }

    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
            flags->nflag, nout,
            nelem/(double)(flags->nflag),
            nkeep/(double)(flags->nflag),
            ncopy/(double)(flags->nflag),
            nred/(double)(flags->nflag));
}
/* Perform a count (linear) sort to sort the smaller lists to the end.
 * This avoids load imbalance on the GPU, as large lists will be
 * scheduled and executed first and the smaller lists later.
 * Load balancing between multi-processors only happens at the end
 * and there smaller lists lead to more effective load balancing.
 * The sorting is done on the cj4 count, not on the actual pair counts.
 * Not only does this make the sort faster, but it also results in
 * better load balancing than using a list sorted on exact load.
 * This function swaps the pointer in the pair list to avoid a copy operation.
 */
static void sort_sci(nbnxn_pairlist_t *nbl)
{
    nbnxn_list_work_t *work;
    int                m, i, s, s0, s1;
    nbnxn_sci_t       *sci_sort;

    if (nbl->ncj4 <= nbl->nsci)
    {
        /* nsci = 0 or all sci have size 1, sorting won't change the order */
        return;
    }

    work = nbl->work;

    /* We will distinguish differences up to double the average */
    m = (2*nbl->ncj4)/nbl->nsci;

    if (m + 1 > work->sort_nalloc)
    {
        work->sort_nalloc = over_alloc_large(m + 1);
        srenew(work->sort, work->sort_nalloc);
    }

    if (work->sci_sort_nalloc != nbl->sci_nalloc)
    {
        work->sci_sort_nalloc = nbl->sci_nalloc;
        nbnxn_realloc_void((void **)&work->sci_sort,
                           0,
                           work->sci_sort_nalloc*sizeof(*work->sci_sort),
                           nbl->alloc, nbl->free);
    }

    /* Count the entries of each size */
    for (i = 0; i <= m; i++)
    {
        work->sort[i] = 0;
    }
    for (s = 0; s < nbl->nsci; s++)
    {
        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
        work->sort[i]++;
    }
    /* Calculate the offset for each count */
    s0            = work->sort[m];
    work->sort[m] = 0;
    for (i = m - 1; i >= 0; i--)
    {
        s1            = work->sort[i];
        work->sort[i] = work->sort[i + 1] + s0;
        s0            = s1;
    }

    /* Sort entries directly into place */
    sci_sort = work->sci_sort;
    for (s = 0; s < nbl->nsci; s++)
    {
        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
        sci_sort[work->sort[i]++] = nbl->sci[s];
    }

    /* Swap the sci pointers so we use the new, sorted list */
    work->sci_sort = nbl->sci;
    nbl->sci       = sci_sort;
}
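/* Small illustration of the counting sort above (illustrative numbers only):
 * with nsci = 4 entries of cj4-lengths {1, 3, 2, 3} and m = 2*9/4 = 4, the
 * counts per length are gathered first, the offsets are then accumulated
 * from the largest bucket downwards, and each sci entry is finally written
 * directly to its slot, so the longest lists end up first in sci_sort.
 */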
/* Make a local or non-local pair-list, depending on iloc */
void nbnxn_make_pairlist(const nbnxn_search_t nbs,
                         nbnxn_atomdata_t *nbat,
                         const t_blocka *excl,
                         real rlist,
                         int min_ci_balanced,
                         nbnxn_pairlist_set_t *nbl_list,
                         int iloc,
                         int nb_kernel_type,
                         t_nrnb *nrnb)
{
    nbnxn_grid_t      *gridi, *gridj;
    gmx_bool           bGPUCPU;
    int                nzi, zi, zj0, zj1, zj;
    int                nsubpair_max;
    int                th;
    int                nnbl;
    nbnxn_pairlist_t **nbl;
    int                ci_block;
    gmx_bool           CombineNBLists;
    gmx_bool           progBal;
    int                np_tot, np_noq, np_hlj, nap;

    /* Check if we are running hybrid GPU + CPU nbnxn mode */
    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);

    nnbl           = nbl_list->nnbl;
    nbl            = nbl_list->nbl;
    CombineNBLists = nbl_list->bCombined;

    if (debug)
    {
        fprintf(debug, "ns making %d nblists\n", nnbl);
    }

    nbat->bUseBufferFlags = (nbat->nout > 1);
    /* We should re-init the flags before making the first list */
    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
    {
        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
    }

    if (nbl_list->bSimple)
    {
        switch (nb_kernel_type)
        {
#ifdef GMX_NBNXN_SIMD_4XN
            case nbnxnk4xN_SIMD_4xN:
                nbs->icell_set_x = icell_set_x_simd_4xn;
                break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
            case nbnxnk4xN_SIMD_2xNN:
                nbs->icell_set_x = icell_set_x_simd_2xnn;
                break;
#endif
            default:
                nbs->icell_set_x = icell_set_x_simple;
                break;
        }
    }
    else
    {
#ifdef NBNXN_SEARCH_BB_SIMD4
        nbs->icell_set_x = icell_set_x_supersub_simd4;
#else
        nbs->icell_set_x = icell_set_x_supersub;
#endif
    }

    if (LOCAL_I(iloc))
    {
        /* Only zone (grid) 0 vs 0 */
        nzi = 1;
        zj0 = 0;
        zj1 = 1;
    }
    else
    {
        nzi = nbs->zones->nizone;
    }

    if (!nbl_list->bSimple && min_ci_balanced > 0)
    {
        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
    }
    else
    {
        nsubpair_max = 0;
    }

    /* Clear all pair-lists */
    for (th = 0; th < nnbl; th++)
    {
        clear_pairlist(nbl[th]);
    }

    for (zi = 0; zi < nzi; zi++)
    {
        gridi = &nbs->grid[zi];

        if (NONLOCAL_I(iloc))
        {
            zj0 = nbs->zones->izone[zi].j0;
            zj1 = nbs->zones->izone[zi].j1;
        }
        for (zj = zj0; zj < zj1; zj++)
        {
            gridj = &nbs->grid[zj];

            if (debug)
            {
                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
            }

            nbs_cycle_start(&nbs->cc[enbsCCsearch]);

            if (nbl[0]->bSimple && !gridi->bSimple)
            {
                /* Hybrid list, determine blocking later */
                ci_block = 0;
            }
            else
            {
                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
            }

#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (th = 0; th < nnbl; th++)
            {
                /* Re-init the thread-local work flag data before making
                 * the first list (not an elegant conditional).
                 */
                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
                                              (bGPUCPU && zi == 0 && zj == 1)))
                {
                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
                }

                if (CombineNBLists && th > 0)
                {
                    clear_pairlist(nbl[th]);
                }

                /* With GPU: generate progressively smaller lists for
                 * load balancing for local only or non-local with 2 zones.
                 */
                progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);

                /* Divide the i super cells equally over the nblists */
                nbnxn_make_pairlist_part(nbs, gridi, gridj,
                                         &nbs->work[th], nbat, excl,
                                         rlist,
                                         nb_kernel_type,
                                         ci_block,
                                         nbat->bUseBufferFlags,
                                         nsubpair_max,
                                         progBal, min_ci_balanced,
                                         th, nnbl,
                                         nbl[th]);
            }
            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);

            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for (th = 0; th < nnbl; th++)
            {
                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);

                if (nbl_list->bSimple)
                {
                    np_tot += nbl[th]->ncj;
                    np_noq += nbl[th]->work->ncj_noq;
                    np_hlj += nbl[th]->work->ncj_hlj;
                }
                else
                {
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl[th]->nci_tot;
                }
            }
            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
            nbl_list->natpair_lj  = np_noq*nap;
            nbl_list->natpair_q   = np_hlj*nap/2;

            if (CombineNBLists && nnbl > 1)
            {
                nbs_cycle_start(&nbs->cc[enbsCCcombine]);

                combine_nblists(nnbl-1, nbl+1, nbl[0]);

                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
            }
        }
    }

    if (!nbl_list->bSimple)
    {
        /* Sort the entries on size, large ones first */
        if (CombineNBLists || nnbl == 1)
        {
            sort_sci(nbl[0]);
        }
        else
        {
#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (th = 0; th < nnbl; th++)
            {
                sort_sci(nbl[th]);
            }
        }
    }

    if (nbat->bUseBufferFlags)
    {
        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
    }

    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
    if (LOCAL_I(iloc))
    {
        nbs->search_count++;
    }
    if (nbs->print_cycles &&
        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
        nbs->search_count % 100 == 0)
    {
        nbs_cycle_print(stderr, nbs);
    }

    if (debug && (CombineNBLists && nnbl > 1))
    {
        if (nbl[0]->bSimple)
        {
            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
        }
        else
        {
            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
        }
    }

    if (debug)
    {
        if (nbl[0]->bSimple)
        {
            print_nblist_ci_cj(debug, nbl[0]);
        }
        else
        {
            print_nblist_sci_cj(debug, nbl[0]);
        }

        if (nbat->bUseBufferFlags)
        {
            print_reduction_cost(&nbat->buffer_flags, nnbl);
        }
    }
}