/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "nbnxn_atomdata.h"

#include "config.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>

#include <algorithm>

#include "thread_mpi/atomic.h"

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/simd/simd.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
{
    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
}
/* Free function for memory allocated with nbnxn_alloc_aligned */
void nbnxn_free_aligned(void *ptr)
{
    sfree_aligned(ptr);
}
/* Reallocation wrapper function for nbnxn data structures */
void nbnxn_realloc_void(void **ptr,
                        int nbytes_copy, int nbytes_new,
                        nbnxn_alloc_t *ma,
                        nbnxn_free_t  *mf)
{
    void *ptr_new;

    ma(&ptr_new, nbytes_new);

    if (nbytes_new > 0 && ptr_new == nullptr)
    {
        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
    }

    if (nbytes_copy > 0)
    {
        if (nbytes_new < nbytes_copy)
        {
            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
        }
        memcpy(ptr_new, *ptr, nbytes_copy);
    }
    if (*ptr != nullptr)
    {
        mf(*ptr);
    }
    *ptr = ptr_new;
}
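/* Illustrative use of the wrapper above (not part of the original file):
 * growing a buffer of n reals to 2*n while preserving the first n values:
 *
 *   nbnxn_realloc_void((void **)&buf,
 *                      n*sizeof(*buf), 2*n*sizeof(*buf),
 *                      nbnxn_alloc_aligned, nbnxn_free_aligned);
 *
 * Passing 0 for nbytes_copy skips the copy entirely, which is what happens
 * on the first call below when nbat->natoms is still 0.
 */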
/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
{
    int t;

    nbnxn_realloc_void((void **)&nbat->type,
                       nbat->natoms*sizeof(*nbat->type),
                       n*sizeof(*nbat->type),
                       nbat->alloc, nbat->free);
    nbnxn_realloc_void((void **)&nbat->lj_comb,
                       nbat->natoms*2*sizeof(*nbat->lj_comb),
                       n*2*sizeof(*nbat->lj_comb),
                       nbat->alloc, nbat->free);
    if (nbat->XFormat != nbatXYZQ)
    {
        nbnxn_realloc_void((void **)&nbat->q,
                           nbat->natoms*sizeof(*nbat->q),
                           n*sizeof(*nbat->q),
                           nbat->alloc, nbat->free);
    }
    if (nbat->nenergrp > 1)
    {
        nbnxn_realloc_void((void **)&nbat->energrp,
                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
                           n/nbat->na_c*sizeof(*nbat->energrp),
                           nbat->alloc, nbat->free);
    }
    nbnxn_realloc_void((void **)&nbat->x,
                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
                       n*nbat->xstride*sizeof(*nbat->x),
                       nbat->alloc, nbat->free);
    for (t = 0; t < nbat->nout; t++)
    {
        /* Allocate one element extra for possible signaling with GPUs */
        nbnxn_realloc_void((void **)&nbat->out[t].f,
                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
                           n*nbat->fstride*sizeof(*nbat->out[t].f),
                           nbat->alloc, nbat->free);
    }
    nbat->nalloc = n;
}
/* Initializes an nbnxn_atomdata_output_t data structure */
static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
                                       int nb_kernel_type,
                                       int nenergrp, int stride,
                                       nbnxn_alloc_t *ma)
{
    out->f = nullptr;
    ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
    out->nV = nenergrp*nenergrp;
    ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
    ma((void **)&out->Vc, out->nV*sizeof(*out->Vc));

    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
    {
        int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
        ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
        ma((void **)&out->VSc, out->nVS*sizeof(*out->VSc));
    }
    else
    {
        out->nVS = 0;
    }
}
static void copy_int_to_nbat_int(const int *a, int na, int na_round,
                                 const int *in, int fill, int *innb)
{
    int i, j;

    j = 0;
    for (i = 0; i < na; i++)
    {
        innb[j++] = in[a[i]];
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i++)
    {
        innb[j++] = fill;
    }
}
void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
                            const rvec *x, int nbatFormat,
                            real *xnb, int a0)
{
    /* We complete partially filled cells, can only be the last one in each
     * column, with coordinates farAway. The actual coordinate value does
     * not influence the results, since these filler particles do not interact.
     * Clusters with normal atoms + fillers have a bounding box based only
     * on the coordinates of the atoms. Clusters with only fillers have as
     * the bounding box the coordinates of the first filler. Such clusters
     * are not considered as i-entries, but they are considered as j-entries.
     * So for performance it is better to have their bounding boxes far away,
     * such that filler only clusters don't end up in the pair list.
     */
    const real farAway = -1000000;

    int        i, j, c;

    switch (nbatFormat)
    {
        case nbatXYZ:
            j = a0*STRIDE_XYZ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
            }
            break;
        case nbatXYZQ:
            j = a0*STRIDE_XYZQ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
                j++;
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                j++;
            }
            break;
        case nbatX4:
            j = atom_to_x_index<c_packX4>(a0);
            c = a0 & (c_packX4-1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX4] = x[a[i]][XX];
                xnb[j+YY*c_packX4] = x[a[i]][YY];
                xnb[j+ZZ*c_packX4] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX4] = farAway;
                xnb[j+YY*c_packX4] = farAway;
                xnb[j+ZZ*c_packX4] = farAway;
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            break;
        case nbatX8:
            j = atom_to_x_index<c_packX8>(a0);
            c = a0 & (c_packX8 - 1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX8] = x[a[i]][XX];
                xnb[j+YY*c_packX8] = x[a[i]][YY];
                xnb[j+ZZ*c_packX8] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX8] = farAway;
                xnb[j+YY*c_packX8] = farAway;
                xnb[j+ZZ*c_packX8] = farAway;
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
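/* Memory layout sketch for the packed formats above (illustrative, assuming
 * the usual packed index helper). With nbatX4 (c_packX4 = 4), coordinates
 * are stored in groups of 4 atoms:
 *
 *   x0 x1 x2 x3  y0 y1 y2 y3  z0 z1 z2 z3  x4 x5 x6 x7  y4 ...
 *
 * atom_to_x_index<4>(a) = 3*(a & ~3) + (a & 3) locates the x of atom a;
 * its y and z then sit at fixed offsets YY*c_packX4 and ZZ*c_packX4 from
 * the group base, which is what the j+XX/YY/ZZ*c_packX4 indexing exploits.
 */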
/* Stores the LJ parameter data in a format convenient for different kernels */
static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
{
    real c6, c12;

    int  nt = nbat->ntype;

    if (bSIMD)
    {
#if GMX_SIMD
        /* nbfp_aligned stores two parameters using the stride most suitable
         * for the present SIMD architecture, as specified by the constant
         * c_simdBestPairAlignment from the SIMD header.
         * There's a slight inefficiency in allocating and initializing nbfp_aligned
         * when it might not be used, but introducing the conditional code is not
         * really worth it.
         */
        nbat->alloc((void **)&nbat->nbfp_aligned,
                    nt*nt*c_simdBestPairAlignment*sizeof(*nbat->nbfp_aligned));
        for (int i = 0; i < nt; i++)
        {
            for (int j = 0; j < nt; j++)
            {
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+0] = nbat->nbfp[(i*nt+j)*2+0];
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+1] = nbat->nbfp[(i*nt+j)*2+1];
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+2] = 0;
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+3] = 0;
            }
        }
#endif
    }

    /* We use combination rule data for SIMD combination rule kernels
     * and with LJ-PME kernels. We then only need parameters per atom type,
     * not per pair of atom types.
     */
    switch (nbat->comb_rule)
    {
        case ljcrGEOM:
            nbat->comb_rule = ljcrGEOM;

            for (int i = 0; i < nt; i++)
            {
                /* Store the sqrt of the diagonal from the nbfp matrix */
                nbat->nbfp_comb[i*2  ] = std::sqrt(nbat->nbfp[(i*nt+i)*2  ]);
                nbat->nbfp_comb[i*2+1] = std::sqrt(nbat->nbfp[(i*nt+i)*2+1]);
            }
            break;
        case ljcrLB:
            for (int i = 0; i < nt; i++)
            {
                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
                c6  = nbat->nbfp[(i*nt+i)*2  ];
                c12 = nbat->nbfp[(i*nt+i)*2+1];
                if (c6 > 0 && c12 > 0)
                {
                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
                     * so we get 6*C6 and 12*C12 after combining.
                     */
                    nbat->nbfp_comb[i*2  ] = 0.5*gmx::sixthroot(c12/c6);
                    nbat->nbfp_comb[i*2+1] = std::sqrt(c6*c6/c12);
                }
                else
                {
                    nbat->nbfp_comb[i*2  ] = 0;
                    nbat->nbfp_comb[i*2+1] = 0;
                }
            }
            break;
        case ljcrNONE:
            /* We always store the full matrix (see code above) */
            break;
        default:
            gmx_incons("Unknown combination rule");
    }
}
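/* Worked example for the ljcrLB storage above (derivation, not code from the
 * original file). With C6 = 4*eps*sigma^6 and C12 = 4*eps*sigma^12 the stored
 * per-type values are
 *   s_i = 0.5*2^(1/6)*sigma_i     ( = 0.5*sixthroot(c12/c6) )
 *   e_i = sqrt(12*eps_i)          ( = sqrt(c6*c6/c12) )
 * Combining per atom pair in the kernel gives
 *   s_i + s_j = 2^(1/6)*sigma_ij  (Lorentz rule: sigma_ij = (sigma_i+sigma_j)/2)
 *   e_i * e_j = 12*eps_ij         (Berthelot rule: eps_ij = sqrt(eps_i*eps_j))
 * and hence
 *   e_i*e_j*(s_i+s_j)^6  = 12*eps_ij * 2*sigma_ij^6  = 6*C6_ij
 *   e_i*e_j*(s_i+s_j)^12 = 12*eps_ij * 4*sigma_ij^12 = 12*C12_ij
 * which are exactly the prefactored parameters the kernels expect.
 */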
#if GMX_SIMD
static void
nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
{
    const int simd_width = GMX_SIMD_REAL_WIDTH;
    int       simd_excl_size;
    /* Set the diagonal cluster pair exclusion mask setup data.
     * In the kernel we check 0 < j - i to generate the masks.
     * Here we store j - i for generating the mask for the first i,
     * we subtract 0.5 to avoid rounding issues.
     * In the kernel we can subtract 1 to generate the subsequent mask.
     */
    int        simd_4xn_diag_size;

    simd_4xn_diag_size = std::max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
    snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_4xn_diag_size; j++)
    {
        nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
    }

    snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_width/2; j++)
    {
        /* The j-cluster size is half the SIMD width */
        nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
        /* The next half of the SIMD width is for i + 1 */
        nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
    }

    /* We use up to 32 bits for exclusion masking.
     * The same masks are used for the 4xN and 2x(N+N) kernels.
     * The masks are read either into integer SIMD registers or into
     * real SIMD registers (together with a cast).
     * In single precision this means the real and integer SIMD registers
     * are of equal size.
     */
    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
    snew_aligned(nbat->simd_exclusion_filter64, simd_excl_size, NBNXN_MEM_ALIGN);
#else
    snew_aligned(nbat->simd_exclusion_filter, simd_excl_size, NBNXN_MEM_ALIGN);
#endif

    for (int j = 0; j < simd_excl_size; j++)
    {
        /* Set the consecutive bits for masking pair exclusions */
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
        nbat->simd_exclusion_filter64[j] = (1U << j);
#else
        nbat->simd_exclusion_filter[j]   = (1U << j);
#endif
    }

#if !GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL
    // If the SIMD implementation has no bitwise logical operation support
    // whatsoever we cannot use the normal masking. Instead,
    // we generate a vector of all 2^4 possible ways an i atom
    // interacts with its 4 j atoms. Each array entry contains
    // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load.
    // Since there is no logical value representation in this case, we use
    // any nonzero value to indicate 'true', while zero means 'false'.
    // This can then be converted to a SIMD boolean internally in the SIMD
    // module by comparing to zero.
    // Each array entry encodes how this i atom will interact with the 4 j atoms.
    // Matching code exists in set_ci_top_excls() to generate indices into this array.
    // Those indices are used in the kernels.

    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
    const real simdFalse = 0.0;
    const real simdTrue  = 1.0;
    real      *simd_interaction_array;

    snew_aligned(simd_interaction_array, simd_excl_size * GMX_SIMD_REAL_WIDTH, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_excl_size; j++)
    {
        int index = j * GMX_SIMD_REAL_WIDTH;
        for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
        {
            simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
        }
    }
    nbat->simd_interaction_array = simd_interaction_array;
#endif
}
#endif
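/* Example of the exclusion filter above (illustrative, assuming 4x4 clusters
 * and GMX_SIMD_REAL_WIDTH == 4): simd_exclusion_filter[j] holds the single
 * bit 1<<j for j = 0..15, i.e. one unique bit per (i,j) atom pair in a
 * cluster-pair tile. A pair-list interaction mask with that bit cleared then
 * zeroes exactly that one pair when ANDed against the filter in the kernel.
 */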
/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(const gmx::MDLogger &mdlog,
                         nbnxn_atomdata_t *nbat,
                         int nb_kernel_type,
                         int enbnxninitcombrule,
                         int ntype, const real *nbfp,
                         int n_energygroups,
                         int nout,
                         nbnxn_alloc_t *alloc,
                         nbnxn_free_t  *free)
{
    int      nth;
    real     c6, c12, tol;
    char    *ptr;
    gmx_bool simple, bCombGeom, bCombLB, bSIMD;

    if (alloc == nullptr)
    {
        nbat->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbat->alloc = alloc;
    }
    if (free == nullptr)
    {
        nbat->free = nbnxn_free_aligned;
    }
    else
    {
        nbat->free = free;
    }

    if (debug)
    {
        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
    }
    nbat->ntype = ntype + 1;
    nbat->alloc((void **)&nbat->nbfp,
                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
    nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));

    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
     * force-field floating point parameters.
     */
    tol = 1e-5;
    ptr = getenv("GMX_LJCOMB_TOL");
    if (ptr != nullptr)
    {
        double dbl;

        sscanf(ptr, "%lf", &dbl);
        tol = dbl;
    }
    bCombGeom = TRUE;
    bCombLB   = TRUE;

    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
     * to check for the LB rule.
     */
    for (int i = 0; i < ntype; i++)
    {
        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
        if (c6 > 0 && c12 > 0)
        {
            nbat->nbfp_comb[i*2  ] = gmx::sixthroot(c12/c6);
            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
        }
        else if (c6 == 0 && c12 == 0)
        {
            nbat->nbfp_comb[i*2  ] = 0;
            nbat->nbfp_comb[i*2+1] = 0;
        }
        else
        {
            /* Cannot use LB rule with only dispersion or repulsion */
            bCombLB = FALSE;
        }
    }

    for (int i = 0; i < nbat->ntype; i++)
    {
        for (int j = 0; j < nbat->ntype; j++)
        {
            if (i < ntype && j < ntype)
            {
                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
                 */
                c6  = nbfp[(i*ntype+j)*2  ];
                c12 = nbfp[(i*ntype+j)*2+1];
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;

                /* Compare 6*C6 and 12*C12 for the geometric combination rule */
                bCombGeom = bCombGeom &&
                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);

                /* Compare C6 and C12 for the Lorentz-Berthelot combination rule */
                c6     /= 6.0;
                c12    /= 12.0;
                bCombLB = bCombLB &&
                    ((c6 == 0 && c12 == 0 &&
                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
                     (c6 > 0 && c12 > 0 &&
                      gmx_within_tol(gmx::sixthroot(c12/c6),
                                     0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
                      gmx_within_tol(0.25*c6*c6/c12, std::sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
            }
            else
            {
                /* Add zero parameters for the additional dummy atom type */
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
                bCombGeom, bCombLB);
    }

    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    switch (enbnxninitcombrule)
    {
        case enbnxninitcombruleDETECT:
            /* We prefer the geometric combination rule,
             * as that gives a slightly faster kernel than the LB rule.
             */
            if (bCombGeom)
            {
                nbat->comb_rule = ljcrGEOM;
            }
            else if (bCombLB)
            {
                nbat->comb_rule = ljcrLB;
            }
            else
            {
                nbat->comb_rule = ljcrNONE;

                nbat->free(nbat->nbfp_comb);
            }

            {
                std::string mesg;
                if (nbat->comb_rule == ljcrNONE)
                {
                    mesg = "Using full Lennard-Jones parameter combination matrix";
                }
                else
                {
                    mesg = gmx::formatString("Using %s Lennard-Jones combination rule",
                                             nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
                }
                GMX_LOG(mdlog.info).asParagraph().appendText(mesg);
            }
            break;
        case enbnxninitcombruleGEOM:
            nbat->comb_rule = ljcrGEOM;
            break;
        case enbnxninitcombruleLB:
            nbat->comb_rule = ljcrLB;
            break;
        case enbnxninitcombruleNONE:
            nbat->comb_rule = ljcrNONE;

            nbat->free(nbat->nbfp_comb);
            break;
        default:
            gmx_incons("Unknown enbnxninitcombrule");
    }

    bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
             nb_kernel_type == nbnxnk4xN_SIMD_2xNN);

    set_lj_parameter_data(nbat, bSIMD);

    nbat->natoms  = 0;
    nbat->type    = nullptr;
    nbat->lj_comb = nullptr;
    if (simple)
    {
        int pack_x;

        if (bSIMD)
        {
            pack_x = std::max(NBNXN_CPU_CLUSTER_I_SIZE,
                              nbnxn_kernel_to_cluster_j_size(nb_kernel_type));
            switch (pack_x)
            {
                case 4:
                    nbat->XFormat = nbatX4;
                    break;
                case 8:
                    nbat->XFormat = nbatX8;
                    break;
                default:
                    gmx_incons("Unsupported packing width");
            }
        }
        else
        {
            nbat->XFormat = nbatXYZ;
        }

        nbat->FFormat = nbat->XFormat;
    }
    else
    {
        nbat->XFormat = nbatXYZQ;
        nbat->FFormat = nbatXYZ;
    }
    nbat->q        = nullptr;
    nbat->nenergrp = n_energygroups;
    if (!simple)
    {
        // We now check for energy groups already when starting mdrun
        GMX_RELEASE_ASSERT(n_energygroups == 1, "GPU kernels do not support energy groups");
    }
    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
    if (nbat->nenergrp > 64)
    {
        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
    }
    nbat->neg_2log = 1;
    while (nbat->nenergrp > (1<<nbat->neg_2log))
    {
        nbat->neg_2log++;
    }
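    /* Example: nenergrp = 5 gives neg_2log = 3 (2^3 = 8 >= 5), so each
     * energy-group index then occupies 3 bits in the packed energrp entries.
     */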
    nbat->energrp = nullptr;
    nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->x       = nullptr;

#if GMX_SIMD
    if (simple)
    {
        nbnxn_atomdata_init_simple_exclusion_masks(nbat);
    }
#endif

    /* Initialize the output data structures */
    nbat->nout    = nout;
    snew(nbat->out, nbat->nout);
    nbat->nalloc  = 0;
    for (int i = 0; i < nbat->nout; i++)
    {
        nbnxn_atomdata_output_init(&nbat->out[i],
                                   nb_kernel_type,
                                   nbat->nenergrp, 1<<nbat->neg_2log,
                                   nbat->alloc);
    }
    nbat->buffer_flags.flag        = nullptr;
    nbat->buffer_flags.flag_nalloc = 0;

    nth = gmx_omp_nthreads_get(emntNonbonded);

    ptr = getenv("GMX_USE_TREEREDUCE");
    if (ptr != nullptr)
    {
        nbat->bUseTreeReduce = strtol(ptr, nullptr, 10);
    }
#if defined __MIC__
    else if (nth > 8) /* on the CPU we currently don't benefit even at 32 threads */
    {
        nbat->bUseTreeReduce = 1;
    }
#endif
    else
    {
        nbat->bUseTreeReduce = 0;
    }
    if (nbat->bUseTreeReduce)
    {
        GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");

        snew(nbat->syncStep, nth);
    }
}
template<int packSize>
static void copy_lj_to_nbat_lj_comb(const real *ljparam_type,
                                    const int *type, int na,
                                    real *ljparam_at)
{
    /* The LJ params follow the combination rule:
     * copy the params for the type array to the atom array.
     */
    for (int is = 0; is < na; is += packSize)
    {
        for (int k = 0; k < packSize; k++)
        {
            int i = is + k;
            ljparam_at[is*2            + k] = ljparam_type[type[i]*2    ];
            ljparam_at[is*2 + packSize + k] = ljparam_type[type[i]*2 + 1];
        }
    }
}
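/* Example (illustrative): for packSize = 4 the output layout per group of
 * four atoms is
 *
 *   c6(0) c6(1) c6(2) c6(3)  c12(0) c12(1) c12(2) c12(3)  c6(4) ...
 *
 * matching the coordinate packing of nbatX4, so the kernels can load the
 * c6 and c12 parameters of a whole cluster with aligned SIMD loads.
 */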
/* Sets the atom type in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
                                         const nbnxn_search_t nbs,
                                         const int           *type)
{
    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid.ncx*grid.ncy; i++)
        {
            int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
            int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

            copy_int_to_nbat_int(nbs->a+ash, grid.cxy_na[i], ncz*grid.na_sc,
                                 type, nbat->ntype-1, nbat->type+ash);
        }
    }
}
/* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search_t nbs)
{
    if (nbat->comb_rule != ljcrNONE)
    {
        for (const nbnxn_grid_t &grid : nbs->grid)
        {
            /* Loop over all columns and copy and fill */
            for (int i = 0; i < grid.ncx*grid.ncy; i++)
            {
                int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
                int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

                if (nbat->XFormat == nbatX4)
                {
                    copy_lj_to_nbat_lj_comb<c_packX4>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid.na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatX8)
                {
                    copy_lj_to_nbat_lj_comb<c_packX8>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid.na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatXYZQ)
                {
                    copy_lj_to_nbat_lj_comb<1>(nbat->nbfp_comb,
                                               nbat->type + ash,
                                               ncz*grid.na_sc,
                                               nbat->lj_comb + ash*2);
                }
            }
        }
    }
}
/* Sets the charges in nbnxn_atomdata_t *nbat */
static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
                                       const nbnxn_search_t nbs,
                                       const real          *charge)
{
    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int cxy = 0; cxy < grid.ncx*grid.ncy; cxy++)
        {
            int ash      = (grid.cell0 + grid.cxy_ind[cxy])*grid.na_sc;
            int na       = grid.cxy_na[cxy];
            int na_round = (grid.cxy_ind[cxy+1] - grid.cxy_ind[cxy])*grid.na_sc;

            if (nbat->XFormat == nbatXYZQ)
            {
                real *q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
                int   i;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q += STRIDE_XYZQ;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q += STRIDE_XYZQ;
                }
            }
            else
            {
                real *q = nbat->q + ash;
                int   i;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q++;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q++;
                }
            }
        }
    }
}
/* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
 * This is to automatically remove the RF/PME self term in the nbnxn kernels.
 * Part of the zero interactions are still calculated in the normal kernels.
 * All perturbed interactions are calculated in the free energy kernel,
 * using the original charge and LJ data, not nbnxn_atomdata_t.
 */
static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
                                    const nbnxn_search_t nbs)
{
    real *q;
    int   stride_q, nsubc;

    if (nbat->XFormat == nbatXYZQ)
    {
        q        = nbat->x + ZZ + 1;
        stride_q = STRIDE_XYZQ;
    }
    else
    {
        q        = nbat->q;
        stride_q = 1;
    }

    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        if (grid.bSimple)
        {
            nsubc = 1;
        }
        else
        {
            nsubc = c_gpuNumClusterPerCell;
        }

        int c_offset = grid.cell0*grid.na_sc;

        /* Loop over all columns and copy and fill */
        for (int c = 0; c < grid.nc*nsubc; c++)
        {
            /* Does this cluster contain perturbed particles? */
            if (grid.fep[c] != 0)
            {
                for (int i = 0; i < grid.na_c; i++)
                {
                    /* Is this a perturbed particle? */
                    if (grid.fep[c] & (1 << i))
                    {
                        int ind = c_offset + c*grid.na_c + i;
                        /* Set atom type and charge to non-interacting */
                        nbat->type[ind] = nbat->ntype - 1;
                        q[ind*stride_q] = 0;
                    }
                }
            }
        }
    }
}
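/* The fep mask is a per-cluster bitmask with one bit per atom: e.g.
 * grid.fep[c] == 0x5 (binary 0101) marks atoms 0 and 2 of cluster c as
 * perturbed; only those get the non-interacting dummy type and zero charge.
 */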
/* Copies the energy group indices to a reordered and packed array */
static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
                                  int na_c, int bit_shift,
                                  const int *in, int *innb)
{
    int i;
    int j = 0;

    for (i = 0; i < na; i += na_c)
    {
        /* Store na_c energy group numbers into one int */
        int comb = 0;
        for (int sa = 0; sa < na_c; sa++)
        {
            int at = a[i+sa];
            if (at >= 0)
            {
                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
            }
        }
        innb[j++] = comb;
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i += na_c)
    {
        innb[j++] = 0;
    }
}
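/* Packing example (illustrative): with na_c = 4 and bit_shift = 2
 * (up to 4 energy groups), the group indices g0..g3 of one cluster are
 * packed as
 *
 *   comb = g0 | (g1 << 2) | (g2 << 4) | (g3 << 6)
 *
 * so a single int per cluster holds all four 2-bit group indices.
 */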
/* Set the energy group indices for atoms in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search_t nbs,
                                            const int           *atinfo)
{
    if (nbat->nenergrp == 1)
    {
        return;
    }

    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid.ncx*grid.ncy; i++)
        {
            int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
            int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

            copy_egp_to_nbat_egps(nbs->a+ash, grid.cxy_na[i], ncz*grid.na_sc,
                                  nbat->na_c, nbat->neg_2log,
                                  atinfo, nbat->energrp+(ash>>grid.na_c_2log));
        }
    }
}
/* Sets all required atom parameter data in nbnxn_atomdata_t */
void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
                        const nbnxn_search_t nbs,
                        const t_mdatoms     *mdatoms,
                        const int           *atinfo)
{
    nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA);

    nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA);

    if (nbs->bFEP)
    {
        nbnxn_atomdata_mask_fep(nbat, nbs);
    }

    /* This must be done after masking types for FEP */
    nbnxn_atomdata_set_ljcombparams(nbat, nbs);

    nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo);
}
/* Copies the shift vector array to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
                                  rvec             *shift_vec,
                                  nbnxn_atomdata_t *nbat)
{
    int i;

    nbat->bDynamicBox = bDynamicBox;
    for (i = 0; i < SHIFTS; i++)
    {
        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
    }
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
                                     int                  locality,
                                     gmx_bool             FillLocal,
                                     rvec                *x,
                                     nbnxn_atomdata_t    *nbat,
                                     gmx_wallcycle       *wcycle)
{
    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
    wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);

    int g0 = 0, g1 = 0;
    int nth, th;

    switch (locality)
    {
        case eatAll:
            g0 = 0;
            g1 = nbs->grid.size();
            break;
        case eatLocal:
            g0 = 0;
            g1 = 1;
            break;
        case eatNonlocal:
            g0 = 1;
            g1 = nbs->grid.size();
            break;
    }

    if (FillLocal)
    {
        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
    }

    nth = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (th = 0; th < nth; th++)
    {
        try
        {
            for (int g = g0; g < g1; g++)
            {
                const nbnxn_grid_t *grid;
                int                 cxy0, cxy1;

                grid = &nbs->grid[g];

                cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth;
                cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;

                for (int cxy = cxy0; cxy < cxy1; cxy++)
                {
                    int na, ash, na_fill;

                    na  = grid->cxy_na[cxy];
                    ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

                    if (g == 0 && FillLocal)
                    {
                        na_fill =
                            (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
                    }
                    else
                    {
                        /* We fill only the real particle locations.
                         * We assume the filling entries at the end have been
                         * properly set before during pair-list generation.
                         */
                        na_fill = na;
                    }
                    copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
                                           nbat->XFormat, nbat->x, ash);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
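/* Work division in nbnxn_atomdata_copy_x_to_nbat_x above (illustrative):
 * the ceiling-style split
 *   cxy0 = (ncolumns* th    + nth - 1)/nth
 *   cxy1 = (ncolumns*(th+1) + nth - 1)/nth
 * gives, for ncolumns = 10 and nth = 4, the column ranges
 *   [0,3) [3,5) [5,8) [8,10)
 * i.e. contiguous, non-overlapping ranges covering all columns.
 */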
static void
nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
                           int i0, int i1)
{
    for (int i = i0; i < i1; i++)
    {
        dest[i] = 0;
    }
}
gmx_unused static void
nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
                            gmx_bool bDestSet,
                            real ** gmx_restrict src,
                            int nsrc,
                            int i0, int i1)
{
    if (bDestSet)
    {
        /* The destination buffer contains data, add to it */
        for (int i = i0; i < i1; i++)
        {
            for (int s = 0; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
    else
    {
        /* The destination buffer is uninitialized, set it first */
        for (int i = i0; i < i1; i++)
        {
            dest[i] = src[0][i];
            for (int s = 1; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
}
gmx_unused static void
nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest,
                                 gmx_bool gmx_unused bDestSet,
                                 real gmx_unused ** gmx_restrict src,
                                 int gmx_unused nsrc,
                                 int gmx_unused i0, int gmx_unused i1)
{
#if GMX_SIMD
    /* The SIMD width here is actually independent of that in the kernels,
     * but we use the same width for simplicity (usually optimal anyhow).
     */
    SimdReal dest_SSE, src_SSE;

    if (bDestSet)
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(dest+i);
            for (int s = 0; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
    else
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(src[0]+i);
            for (int s = 1; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
#endif
}
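/* Note (assumption, not stated in the original): the reduction ranges passed
 * in are whole buffer-flag blocks, i0/i1 = block*NBNXN_BUFFERFLAG_SIZE*fstride,
 * so with the usual power-of-two block size they are multiples of
 * GMX_SIMD_REAL_WIDTH and the SIMD loops above need no scalar tail.
 */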
/* Add part of the force array(s) from nbnxn_atomdata_t to f */
static void
nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
                                    const nbnxn_atomdata_t *nbat,
                                    nbnxn_atomdata_output_t *out,
                                    int nfa,
                                    int a0, int a1,
                                    rvec *f)
{
    const int *cell = nbs->cell;

    /* Loop over all columns and copy and fill */
    switch (nbat->FFormat)
    {
        case nbatXYZ:
        case nbatXYZQ:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    f[a][XX] += fnb[i];
                    f[a][YY] += fnb[i+1];
                    f[a][ZZ] += fnb[i+2];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i];
                        f[a][YY] += out[fa].f[i+1];
                        f[a][ZZ] += out[fa].f[i+2];
                    }
                }
            }
            break;
        case nbatX4:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX4];
                    f[a][YY] += fnb[i+YY*c_packX4];
                    f[a][ZZ] += fnb[i+ZZ*c_packX4];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX4];
                        f[a][YY] += out[fa].f[i+YY*c_packX4];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX4];
                    }
                }
            }
            break;
        case nbatX8:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX8];
                    f[a][YY] += fnb[i+YY*c_packX8];
                    f[a][ZZ] += fnb[i+ZZ*c_packX8];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX8];
                        f[a][YY] += out[fa].f[i+YY*c_packX8];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX8];
                    }
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
static inline unsigned char reverse_bits(unsigned char b)
{
    /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
    return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
}
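/* Example: reverse_bits(0x01) == 0x80 and reverse_bits(0xD0) == 0x0B.
 * The multiply spreads the byte into five shifted copies, the mask picks
 * one bit from each copy in reversed order, and the mod-1023 (2^10 - 1)
 * folds the selected bits back into a single byte.
 */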
static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
                                                      int nth)
{
    const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;

    int next_pow2 = 1<<(gmx::log2I(nth-1)+1);

    assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */

    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);

#pragma omp parallel num_threads(nth)
    {
        try
        {
            int b0, b1, b;
            int i0, i1;
            int group_size, th;

            th = gmx_omp_get_thread_num();

            for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
            {
                int index[2], group_pos, partner_pos, wu;
                int partner_th = th ^ (group_size/2);

                if (group_size > 2)
                {
#ifdef TMPI_ATOMICS
                    /* wait on partner thread - replaces full barrier */
                    int sync_th, sync_group_size;

                    tMPI_Atomic_memory_barrier();                         /* guarantee data is saved before marking work as done */
                    tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */

                    /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
                    for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
                    {
                        sync_th &= ~(sync_group_size/4);
                    }
                    if (sync_th < nth) /* otherwise nothing to sync; index[1] will be >= nout */
                    {
                        /* wait on the thread which computed input data in previous step */
                        while (tMPI_Atomic_get((volatile tMPI_Atomic_t *)&(nbat->syncStep[sync_th])) < group_size/2)
                        {
                            gmx_pause();
                        }
                        /* guarantee that no later load happens before wait loop is finished */
                        tMPI_Atomic_memory_barrier();
                    }
#else           /* TMPI_ATOMICS */
#pragma omp barrier
#endif
                }

                /* Calculate buffers to sum (result goes into first buffer) */
                group_pos = th % group_size;
                index[0]  = th - group_pos;
                index[1]  = index[0] + group_size/2;

                /* If no second buffer, nothing to do */
                if (index[1] >= nbat->nout && group_size > 2)
                {
                    continue;
                }

#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
#error reverse_bits assumes max 256 threads
#endif
                /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
                   This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
                   The permutation which allows this corresponds to reversing the bits of the group position.
                 */
                group_pos = reverse_bits(group_pos)/(256/group_size);

                partner_pos = group_pos ^ 1;

                /* loop over two work-units (own and partner) */
                for (wu = 0; wu < 2; wu++)
                {
                    if (wu == 1)
                    {
                        if (partner_th < nth)
                        {
                            break; /* partner exists, so we don't have to do its work */
                        }
                        else
                        {
                            group_pos = partner_pos;
                        }
                    }

                    /* Calculate the cell-block range for our thread */
                    b0 = (flags->nflag* group_pos   )/group_size;
                    b1 = (flags->nflag*(group_pos+1))/group_size;

                    for (b = b0; b < b1; b++)
                    {
                        i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                        i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                        if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2)
                        {
#if GMX_SIMD
                            nbnxn_atomdata_reduce_reals_simd
#else
                            nbnxn_atomdata_reduce_reals
#endif
                                (nbat->out[index[0]].f,
                                bitmask_is_set(flags->flag[b], index[0]) || group_size > 2,
                                &(nbat->out[index[1]].f), 1, i0, i1);
                        }
                        else if (!bitmask_is_set(flags->flag[b], index[0]))
                        {
                            nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
                                                       i0, i1);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
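/* Reduction pattern example (illustrative): for nth = nout = 4 the tree has
 * two levels,
 *   group_size 2:  out[1] -> out[0],  out[3] -> out[2]
 *   group_size 4:  out[2] -> out[0]
 * so all output buffers are summed into buffer 0 in log2(nth) steps, each
 * step waiting on a single partner thread instead of a full barrier.
 */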
static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
                                                     int nth)
{
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            const nbnxn_buffer_flags_t *flags;
            int   nfptr;
            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];

            flags = &nbat->buffer_flags;

            /* Calculate the cell-block range for our thread */
            int b0 = (flags->nflag* th   )/nth;
            int b1 = (flags->nflag*(th+1))/nth;

            for (int b = b0; b < b1; b++)
            {
                int i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                int i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                nfptr = 0;
                for (int out = 1; out < nbat->nout; out++)
                {
                    if (bitmask_is_set(flags->flag[b], out))
                    {
                        fptr[nfptr++] = nbat->out[out].f;
                    }
                }
                if (nfptr > 0)
                {
#if GMX_SIMD
                    nbnxn_atomdata_reduce_reals_simd
#else
                    nbnxn_atomdata_reduce_reals
#endif
                        (nbat->out[0].f,
                        bitmask_is_set(flags->flag[b], 0),
                        fptr, nfptr,
                        i0, i1);
                }
                else if (!bitmask_is_set(flags->flag[b], 0))
                {
                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
                                               i0, i1);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
/* Add the force array(s) from nbnxn_atomdata_t to f */
void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
                                    int                     locality,
                                    const nbnxn_atomdata_t *nbat,
                                    rvec                   *f,
                                    gmx_wallcycle          *wcycle)
{
    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);

    int a0 = 0, na = 0;

    nbs_cycle_start(&nbs->cc[enbsCCreducef]);

    switch (locality)
    {
        case eatAll:
            a0 = 0;
            na = nbs->natoms_nonlocal;
            break;
        case eatLocal:
            a0 = 0;
            na = nbs->natoms_local;
            break;
        case eatNonlocal:
            a0 = nbs->natoms_local;
            na = nbs->natoms_nonlocal - nbs->natoms_local;
            break;
    }

    int nth = gmx_omp_nthreads_get(emntNonbonded);

    if (nbat->nout > 1)
    {
        if (locality != eatAll)
        {
            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
        }

        /* Reduce the force thread output buffers into buffer 0, before adding
         * them to the, differently ordered, "real" force buffer.
         */
        if (nbat->bUseTreeReduce)
        {
            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
        }
        else
        {
            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
        }
    }
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
                                                nbat->out,
                                                1,
                                                a0+((th+0)*na)/nth,
                                                a0+((th+1)*na)/nth,
                                                f);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);

    wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
/* Adds the shift forces from nbnxn_atomdata_t to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                              rvec                   *fshift)
{
    const nbnxn_atomdata_output_t *out = nbat->out;

    for (int s = 0; s < SHIFTS; s++)
    {
        rvec sum;

        clear_rvec(sum);
        for (int th = 0; th < nbat->nout; th++)
        {
            sum[XX] += out[th].fshift[s*DIM+XX];
            sum[YY] += out[th].fshift[s*DIM+YY];
            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
        }
        rvec_inc(fshift[s], sum);
    }
}
);