/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_atomdata.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <algorithm>

#include "thread_mpi/atomic.h"

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
{
    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
}
/* Free function for memory allocated with nbnxn_alloc_aligned */
void nbnxn_free_aligned(void *ptr)
{
    save_free_aligned("ptr", __FILE__, __LINE__, ptr);
}
/* Reallocation wrapper function for nbnxn data structures */
void nbnxn_realloc_void(void **ptr,
                        int nbytes_copy, int nbytes_new,
                        nbnxn_alloc_t *ma,
                        nbnxn_free_t  *mf)
{
    void *ptr_new;

    ma(&ptr_new, nbytes_new);

    if (nbytes_new > 0 && ptr_new == nullptr)
    {
        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
    }

    if (nbytes_copy > 0)
    {
        if (nbytes_new < nbytes_copy)
        {
            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
        }
        memcpy(ptr_new, *ptr, nbytes_copy);
    }
    if (*ptr != nullptr)
    {
        mf(*ptr);
    }
    *ptr = ptr_new;
}
/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
{
    int t;

    nbnxn_realloc_void((void **)&nbat->type,
                       nbat->natoms*sizeof(*nbat->type),
                       n*sizeof(*nbat->type),
                       nbat->alloc, nbat->free);
    nbnxn_realloc_void((void **)&nbat->lj_comb,
                       nbat->natoms*2*sizeof(*nbat->lj_comb),
                       n*2*sizeof(*nbat->lj_comb),
                       nbat->alloc, nbat->free);
    if (nbat->XFormat != nbatXYZQ)
    {
        nbnxn_realloc_void((void **)&nbat->q,
                           nbat->natoms*sizeof(*nbat->q),
                           n*sizeof(*nbat->q),
                           nbat->alloc, nbat->free);
    }
    if (nbat->nenergrp > 1)
    {
        nbnxn_realloc_void((void **)&nbat->energrp,
                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
                           n/nbat->na_c*sizeof(*nbat->energrp),
                           nbat->alloc, nbat->free);
    }
    nbnxn_realloc_void((void **)&nbat->x,
                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
                       n*nbat->xstride*sizeof(*nbat->x),
                       nbat->alloc, nbat->free);
    for (t = 0; t < nbat->nout; t++)
    {
        /* Allocate one element extra for possible signaling with GPUs */
        nbnxn_realloc_void((void **)&nbat->out[t].f,
                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
                           n*nbat->fstride*sizeof(*nbat->out[t].f),
                           nbat->alloc, nbat->free);
    }
    nbat->nalloc = n;
}
/* Initializes an nbnxn_atomdata_output_t data structure */
static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
                                       int nb_kernel_type,
                                       int nenergrp, int stride,
                                       nbnxn_alloc_t *ma)
{
    out->f = nullptr;
    ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
    out->nV = nenergrp*nenergrp;
    ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
    ma((void **)&out->Vc, out->nV*sizeof(*out->Vc));

    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
    {
        int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
        ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
        ma((void **)&out->VSc, out->nVS*sizeof(*out->VSc));
    }
    else
    {
        out->nVS = 0;
    }
}
static void copy_int_to_nbat_int(const int *a, int na, int na_round,
                                 const int *in, int fill, int *innb)
{
    int i, j;

    j = 0;
    for (i = 0; i < na; i++)
    {
        innb[j++] = in[a[i]];
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i++)
    {
        innb[j++] = fill;
    }
}
void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
                            const rvec *x, int nbatFormat,
                            real *xnb, int a0)
{
    /* We complete partially filled cells, can only be the last one in each
     * column, with coordinates farAway. The actual coordinate value does
     * not influence the results, since these filler particles do not interact.
     * Clusters with normal atoms + fillers have a bounding box based only
     * on the coordinates of the atoms. Clusters with only fillers have as
     * the bounding box the coordinates of the first filler. Such clusters
     * are not considered as i-entries, but they are considered as j-entries.
     * So for performance it is better to have their bounding boxes far away,
     * such that filler only clusters don't end up in the pair list.
     */
    const real farAway = -1000000;

    int        i, j, c;

    switch (nbatFormat)
    {
        case nbatXYZ:
            j = a0*STRIDE_XYZ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
            }
            break;
        case nbatXYZQ:
            j = a0*STRIDE_XYZQ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
                j++;
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                j++;
            }
            break;
        case nbatX4:
            j = atom_to_x_index<c_packX4>(a0);
            c = a0 & (c_packX4 - 1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX4] = x[a[i]][XX];
                xnb[j+YY*c_packX4] = x[a[i]][YY];
                xnb[j+ZZ*c_packX4] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX4] = farAway;
                xnb[j+YY*c_packX4] = farAway;
                xnb[j+ZZ*c_packX4] = farAway;
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            break;
        case nbatX8:
            j = atom_to_x_index<c_packX8>(a0);
            c = a0 & (c_packX8 - 1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX8] = x[a[i]][XX];
                xnb[j+YY*c_packX8] = x[a[i]][YY];
                xnb[j+ZZ*c_packX8] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX8] = farAway;
                xnb[j+YY*c_packX8] = farAway;
                xnb[j+ZZ*c_packX8] = farAway;
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
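/* Illustrative sketch, not part of the original code: in the packed nbatX4
 * and nbatX8 layouts used above, each group of packSize atoms stores all x
 * values first, then all y, then all z, which is what the per-dimension
 * offsets of XX/YY/ZZ times the pack size express. The helper below is a
 * hypothetical stand-in for atom_to_x_index<>() plus those offsets, assuming
 * packSize is a power of two.
 */
static inline int packedCoordinateIndexSketch(int atom, int dim, int packSize)
{
    return (atom/packSize)*(3*packSize) + dim*packSize + (atom & (packSize - 1));
}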
/* Stores the LJ parameter data in a format convenient for different kernels */
static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
{
    real c6, c12;

    int  nt = nbat->ntype;

    if (bSIMD)
    {
#if GMX_SIMD
        /* nbfp_aligned stores two parameters using the stride most suitable
         * for the present SIMD architecture, as specified by the constant
         * c_simdBestPairAlignment from the SIMD header.
         * There's a slight inefficiency in allocating and initializing nbfp_aligned
         * when it might not be used, but introducing the conditional code is not
         * really worth it.
         */
        nbat->alloc((void **)&nbat->nbfp_aligned,
                    nt*nt*c_simdBestPairAlignment*sizeof(*nbat->nbfp_aligned));
        for (int i = 0; i < nt; i++)
        {
            for (int j = 0; j < nt; j++)
            {
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+0] = nbat->nbfp[(i*nt+j)*2+0];
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+1] = nbat->nbfp[(i*nt+j)*2+1];
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+2] = 0;
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+3] = 0;
            }
        }
#endif
    }

    /* We use combination rule data for SIMD combination rule kernels
     * and with LJ-PME kernels. We then only need parameters per atom type,
     * not per pair of atom types.
     */
    switch (nbat->comb_rule)
    {
        case ljcrGEOM:
            nbat->comb_rule = ljcrGEOM;

            for (int i = 0; i < nt; i++)
            {
                /* Store the sqrt of the diagonal from the nbfp matrix */
                nbat->nbfp_comb[i*2  ] = std::sqrt(nbat->nbfp[(i*nt+i)*2  ]);
                nbat->nbfp_comb[i*2+1] = std::sqrt(nbat->nbfp[(i*nt+i)*2+1]);
            }
            break;
        case ljcrLB:
            for (int i = 0; i < nt; i++)
            {
                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
                c6  = nbat->nbfp[(i*nt+i)*2  ];
                c12 = nbat->nbfp[(i*nt+i)*2+1];
                if (c6 > 0 && c12 > 0)
                {
                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
                     * so we get 6*C6 and 12*C12 after combining.
                     */
                    nbat->nbfp_comb[i*2  ] = 0.5*gmx::sixthroot(c12/c6);
                    nbat->nbfp_comb[i*2+1] = std::sqrt(c6*c6/c12);
                }
                else
                {
                    nbat->nbfp_comb[i*2  ] = 0;
                    nbat->nbfp_comb[i*2+1] = 0;
                }
            }
            break;
        case ljcrNONE:
            /* We always store the full matrix (see code above) */
            break;
        default:
            gmx_incons("Unknown combination rule");
    }
}
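/* Illustrative sketch, not part of the original code: with the Lorentz-Berthelot
 * per-type values stored above, sigS = 0.5*2^(1/6)*sigma and epsS = sqrt(4*3*eps),
 * a kernel can recover 6*C6 and 12*C12 for a type pair as shown below, where
 * C6 = 4*eps*sigma^6 and C12 = 4*eps*sigma^12 use the combined sigma and eps.
 * The function and parameter names here are hypothetical.
 */
gmx_unused static void combineLBParametersSketch(double sigS_i, double epsS_i,
                                                 double sigS_j, double epsS_j,
                                                 double *sixC6, double *twelveC12)
{
    const double sig  = sigS_i + sigS_j; /* = 2^(1/6) * sigma_ij */
    const double sig2 = sig*sig;
    const double sig6 = sig2*sig2*sig2;  /* = 2 * sigma_ij^6     */
    const double eps  = epsS_i*epsS_j;   /* = 12 * eps_ij        */

    *sixC6     = eps*sig6;               /* = 24*eps_ij*sigma_ij^6  = 6*C6   */
    *twelveC12 = *sixC6*sig6;            /* = 48*eps_ij*sigma_ij^12 = 12*C12 */
}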
#if GMX_SIMD
static void
nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
{
    const int simd_width = GMX_SIMD_REAL_WIDTH;
    int       simd_excl_size;
    /* Set the diagonal cluster pair exclusion mask setup data.
     * In the kernel we check 0 < j - i to generate the masks.
     * Here we store j - i for generating the mask for the first i,
     * we subtract 0.5 to avoid rounding issues.
     * In the kernel we can subtract 1 to generate the subsequent mask.
     */
    int       simd_4xn_diag_size;

    simd_4xn_diag_size = std::max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
    snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_4xn_diag_size; j++)
    {
        nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
    }

    snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_width/2; j++)
    {
        /* The j-cluster size is half the SIMD width */
        nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
        /* The next half of the SIMD width is for i + 1 */
        nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
    }

    /* We use up to 32 bits for exclusion masking.
     * The same masks are used for the 4xN and 2x(N+N) kernels.
     * The masks are read either into integer SIMD registers or into
     * real SIMD registers (together with a cast).
     * In single precision this means the real and integer SIMD registers
     * are of equal size.
     */
    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
    snew_aligned(nbat->simd_exclusion_filter64, simd_excl_size, NBNXN_MEM_ALIGN);
#else
    snew_aligned(nbat->simd_exclusion_filter, simd_excl_size, NBNXN_MEM_ALIGN);
#endif

    for (int j = 0; j < simd_excl_size; j++)
    {
        /* Set the consecutive bits for masking pair exclusions */
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
        nbat->simd_exclusion_filter64[j] = (1U << j);
#else
        nbat->simd_exclusion_filter[j]   = (1U << j);
#endif
    }

#if !GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL
    // If the SIMD implementation has no bitwise logical operation support
    // whatsoever we cannot use the normal masking. Instead,
    // we generate a vector of all 2^4 possible ways an i atom
    // interacts with its 4 j atoms. Each array entry contains
    // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load.
    // Since there is no logical value representation in this case, we use
    // any nonzero value to indicate 'true', while zero means 'false'.
    // This can then be converted to a SIMD boolean internally in the SIMD
    // module by comparing to zero.
    // Each array entry encodes how this i atom will interact with the 4 j atoms.
    // Matching code exists in set_ci_top_excls() to generate indices into this array.
    // Those indices are used in the kernels.

    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
    const real simdFalse = 0.0;
    const real simdTrue  = 1.0;
    real      *simd_interaction_array;

    snew_aligned(simd_interaction_array, simd_excl_size * GMX_SIMD_REAL_WIDTH, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_excl_size; j++)
    {
        int index = j * GMX_SIMD_REAL_WIDTH;
        for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
        {
            simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
        }
    }
    nbat->simd_interaction_array = simd_interaction_array;
#endif
}
#endif
/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(const gmx::MDLogger &mdlog,
                         nbnxn_atomdata_t *nbat,
                         int nb_kernel_type,
                         int enbnxninitcombrule,
                         int ntype, const real *nbfp,
                         int n_energygroups,
                         int nout,
                         nbnxn_alloc_t *alloc,
                         nbnxn_free_t  *free)
{
    int      nth;
    real     c6, c12, tol;
    char    *ptr;
    gmx_bool simple, bCombGeom, bCombLB, bSIMD;

    if (alloc == nullptr)
    {
        nbat->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbat->alloc = alloc;
    }
    if (free == nullptr)
    {
        nbat->free = nbnxn_free_aligned;
    }
    else
    {
        nbat->free = free;
    }

    if (debug)
    {
        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
    }
    nbat->ntype = ntype + 1;
    nbat->alloc((void **)&nbat->nbfp,
                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
    nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));

    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
     * force-field floating point parameters.
     */
    tol = 1e-5;
    ptr = getenv("GMX_LJCOMB_TOL");
    if (ptr != nullptr)
    {
        double dbl;

        sscanf(ptr, "%lf", &dbl);
        tol = dbl;
    }
    bCombGeom = TRUE;
    bCombLB   = TRUE;

    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
     * to check for the LB rule.
     */
    for (int i = 0; i < ntype; i++)
    {
        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
        if (c6 > 0 && c12 > 0)
        {
            nbat->nbfp_comb[i*2  ] = gmx::sixthroot(c12/c6);
            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
        }
        else if (c6 == 0 && c12 == 0)
        {
            nbat->nbfp_comb[i*2  ] = 0;
            nbat->nbfp_comb[i*2+1] = 0;
        }
        else
        {
            /* Can not use LB rule with only dispersion or repulsion */
            bCombLB = FALSE;
        }
    }

    for (int i = 0; i < nbat->ntype; i++)
    {
        for (int j = 0; j < nbat->ntype; j++)
        {
            if (i < ntype && j < ntype)
            {
                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
                 */
                c6  = nbfp[(i*ntype+j)*2  ];
                c12 = nbfp[(i*ntype+j)*2+1];
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;

                /* Compare 6*C6 and 12*C12 for geometric combination rule */
                bCombGeom = bCombGeom &&
                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);

                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
                bCombLB = bCombLB &&
                    ((c6 == 0 && c12 == 0 &&
                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
                     (c6 > 0 && c12 > 0 &&
                      gmx_within_tol(gmx::sixthroot(c12/c6),
                                     0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
                      gmx_within_tol(0.25*c6*c6/c12, std::sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
            }
            else
            {
                /* Add zero parameters for the additional dummy atom type */
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
                bCombGeom, bCombLB);
    }

    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    switch (enbnxninitcombrule)
    {
        case enbnxninitcombruleDETECT:
            /* We prefer the geometic combination rule,
             * as that gives a slightly faster kernel than the LB rule.
             */
            if (bCombGeom)
            {
                nbat->comb_rule = ljcrGEOM;
            }
            else if (bCombLB)
            {
                nbat->comb_rule = ljcrLB;
            }
            else
            {
                nbat->comb_rule = ljcrNONE;

                nbat->free(nbat->nbfp_comb);
            }

            {
                std::string mesg;
                if (nbat->comb_rule == ljcrNONE)
                {
                    mesg = "Using full Lennard-Jones parameter combination matrix";
                }
                else
                {
                    mesg = gmx::formatString("Using %s Lennard-Jones combination rule",
                                             nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
                }
                GMX_LOG(mdlog.info).asParagraph().appendText(mesg);
            }
            break;
        case enbnxninitcombruleGEOM:
            nbat->comb_rule = ljcrGEOM;
            break;
        case enbnxninitcombruleLB:
            nbat->comb_rule = ljcrLB;
            break;
        case enbnxninitcombruleNONE:
            nbat->comb_rule = ljcrNONE;

            nbat->free(nbat->nbfp_comb);
            break;
        default:
            gmx_incons("Unknown enbnxninitcombrule");
    }

    bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
             nb_kernel_type == nbnxnk4xN_SIMD_2xNN);

    set_lj_parameter_data(nbat, bSIMD);

    nbat->natoms  = 0;
    nbat->type    = nullptr;
    nbat->lj_comb = nullptr;
    if (simple)
    {
        int pack_x;

        if (bSIMD)
        {
            pack_x = std::max(NBNXN_CPU_CLUSTER_I_SIZE,
                              nbnxn_kernel_to_cluster_j_size(nb_kernel_type));
            switch (pack_x)
            {
                case 4:
                    nbat->XFormat = nbatX4;
                    break;
                case 8:
                    nbat->XFormat = nbatX8;
                    break;
                default:
                    gmx_incons("Unsupported packing width");
            }
        }
        else
        {
            nbat->XFormat = nbatXYZ;
        }

        nbat->FFormat = nbat->XFormat;
    }
    else
    {
        nbat->XFormat = nbatXYZQ;
        nbat->FFormat = nbatXYZ;
    }
    nbat->q        = nullptr;
    nbat->nenergrp = n_energygroups;
    if (!simple)
    {
        // We now check for energy groups already when starting mdrun
        GMX_RELEASE_ASSERT(n_energygroups == 1, "GPU kernels do not support energy groups");
    }
    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
    if (nbat->nenergrp > 64)
    {
        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
    }
    nbat->neg_2log = 1;
    while (nbat->nenergrp > (1<<nbat->neg_2log))
    {
        nbat->neg_2log++;
    }
    nbat->energrp = nullptr;
    nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->x       = nullptr;

#if GMX_SIMD
    if (simple)
    {
        nbnxn_atomdata_init_simple_exclusion_masks(nbat);
    }
#endif

    /* Initialize the output data structures */
    nbat->nout   = nout;
    snew(nbat->out, nbat->nout);
    nbat->nalloc = 0;
    for (int i = 0; i < nbat->nout; i++)
    {
        nbnxn_atomdata_output_init(&nbat->out[i],
                                   nb_kernel_type,
                                   nbat->nenergrp, 1<<nbat->neg_2log,
                                   nbat->alloc);
    }
    nbat->buffer_flags.flag        = nullptr;
    nbat->buffer_flags.flag_nalloc = 0;

    nth = gmx_omp_nthreads_get(emntNonbonded);

    ptr = getenv("GMX_USE_TREEREDUCE");
    if (ptr != nullptr)
    {
        nbat->bUseTreeReduce = strtol(ptr, nullptr, 10);
    }
    else if (nth > 8) /* on the CPU we currently don't benefit even at 32 */
    {
        nbat->bUseTreeReduce = 1;
    }
    else
    {
        nbat->bUseTreeReduce = 0;
    }
    if (nbat->bUseTreeReduce)
    {
        GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");

        snew(nbat->syncStep, nth);
    }
}
>
772 static void copy_lj_to_nbat_lj_comb(const real
*ljparam_type
,
773 const int *type
, int na
,
776 /* The LJ params follow the combination rule:
777 * copy the params for the type array to the atom array.
779 for (int is
= 0; is
< na
; is
+= packSize
)
781 for (int k
= 0; k
< packSize
; k
++)
784 ljparam_at
[is
*2 + k
] = ljparam_type
[type
[i
]*2 ];
785 ljparam_at
[is
*2 + packSize
+ k
] = ljparam_type
[type
[i
]*2 + 1];
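/* Illustrative note, not part of the original code: for packSize = 4 the loop
 * above produces, per group of four atoms, the layout
 *   c6(a0) c6(a1) c6(a2) c6(a3) c12(a0) c12(a1) c12(a2) c12(a3)
 * so a SIMD kernel can fetch four c6 and four c12 values with two contiguous
 * loads; packSize = 1 degenerates to interleaved c6,c12 pairs, matching the
 * nbatXYZQ case below.
 */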
/* Sets the atom type in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
                                         const nbnxn_search_t nbs,
                                         const int           *type)
{
    for (int g = 0; g < nbs->ngrid; g++)
    {
        const nbnxn_grid_t * grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid->ncx*grid->ncy; i++)
        {
            int ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
            int ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

            copy_int_to_nbat_int(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
                                 type, nbat->ntype-1, nbat->type+ash);
        }
    }
}
/* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search_t nbs)
{
    if (nbat->comb_rule != ljcrNONE)
    {
        for (int g = 0; g < nbs->ngrid; g++)
        {
            const nbnxn_grid_t * grid = &nbs->grid[g];

            /* Loop over all columns and copy and fill */
            for (int i = 0; i < grid->ncx*grid->ncy; i++)
            {
                int ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
                int ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

                if (nbat->XFormat == nbatX4)
                {
                    copy_lj_to_nbat_lj_comb<c_packX4>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid->na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatX8)
                {
                    copy_lj_to_nbat_lj_comb<c_packX8>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid->na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatXYZQ)
                {
                    copy_lj_to_nbat_lj_comb<1>(nbat->nbfp_comb,
                                               nbat->type + ash,
                                               ncz*grid->na_sc,
                                               nbat->lj_comb + ash*2);
                }
            }
        }
    }
}
/* Sets the charges in nbnxn_atomdata_t *nbat */
static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
                                       const nbnxn_search_t nbs,
                                       const real          *charge)
{
    for (int g = 0; g < nbs->ngrid; g++)
    {
        const nbnxn_grid_t * grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (int cxy = 0; cxy < grid->ncx*grid->ncy; cxy++)
        {
            int   ash      = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
            int   na       = grid->cxy_na[cxy];
            int   na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
            int   i;
            real *q;

            if (nbat->XFormat == nbatXYZQ)
            {
                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q += STRIDE_XYZQ;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q += STRIDE_XYZQ;
                }
            }
            else
            {
                q = nbat->q + ash;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q++;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q++;
                }
            }
        }
    }
}
/* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
 * This is to automatically remove the RF/PME self term in the nbnxn kernels.
 * Part of the zero interactions are still calculated in the normal kernels.
 * All perturbed interactions are calculated in the free energy kernel,
 * using the original charge and LJ data, not nbnxn_atomdata_t.
 */
static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
                                    const nbnxn_search_t nbs)
{
    real *q;
    int   stride_q, nsubc;

    if (nbat->XFormat == nbatXYZQ)
    {
        q        = nbat->x + ZZ + 1;
        stride_q = STRIDE_XYZQ;
    }
    else
    {
        q        = nbat->q;
        stride_q = 1;
    }

    for (int g = 0; g < nbs->ngrid; g++)
    {
        const nbnxn_grid_t * grid = &nbs->grid[g];

        if (grid->bSimple)
        {
            nsubc = 1;
        }
        else
        {
            nsubc = c_gpuNumClusterPerCell;
        }

        int c_offset = grid->cell0*grid->na_sc;

        /* Loop over all columns and copy and fill */
        for (int c = 0; c < grid->nc*nsubc; c++)
        {
            /* Does this cluster contain perturbed particles? */
            if (grid->fep[c] != 0)
            {
                for (int i = 0; i < grid->na_c; i++)
                {
                    /* Is this a perturbed particle? */
                    if (grid->fep[c] & (1 << i))
                    {
                        int ind = c_offset + c*grid->na_c + i;
                        /* Set atom type and charge to non-interacting */
                        nbat->type[ind] = nbat->ntype - 1;
                        q[ind*stride_q] = 0;
                    }
                }
            }
        }
    }
}
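/* Note: nbat->ntype - 1 used above is the extra dummy type for which
 * nbnxn_atomdata_init() stored zero C6/C12 parameters, so a masked perturbed
 * atom loses both its LJ interactions and its charge in the regular kernels.
 */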
/* Copies the energy group indices to a reordered and packed array */
static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
                                  int na_c, int bit_shift,
                                  const int *in, int *innb)
{
    int i;
    int j = 0;

    for (i = 0; i < na; i += na_c)
    {
        /* Store na_c energy group numbers into one int */
        int comb = 0;
        for (int sa = 0; sa < na_c; sa++)
        {
            int at = a[i+sa];
            if (at >= 0)
            {
                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
            }
        }
        innb[j++] = comb;
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i += na_c)
    {
        innb[j++] = 0;
    }
}
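/* Illustrative note, not part of the original code: with na_c = 4 and
 * bit_shift = 2, the group ids (3, 1, 0, 2) of one cluster pack into the
 * single integer 3 | (1 << 2) | (0 << 4) | (2 << 6) = 135, from which each
 * id can be recovered by shifting and masking with (1 << bit_shift) - 1.
 */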
/* Set the energy group indices for atoms in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search_t nbs,
                                            const int           *atinfo)
{
    if (nbat->nenergrp == 1)
    {
        return;
    }

    for (int g = 0; g < nbs->ngrid; g++)
    {
        const nbnxn_grid_t * grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid->ncx*grid->ncy; i++)
        {
            int ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
            int ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

            copy_egp_to_nbat_egps(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
                                  nbat->na_c, nbat->neg_2log,
                                  atinfo, nbat->energrp+(ash>>grid->na_c_2log));
        }
    }
}
/* Sets all required atom parameter data in nbnxn_atomdata_t */
void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
                        const nbnxn_search_t nbs,
                        const t_mdatoms     *mdatoms,
                        const int           *atinfo)
{
    nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA);

    nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA);

    if (nbs->bFEP)
    {
        nbnxn_atomdata_mask_fep(nbat, nbs);
    }

    /* This must be done after masking types for FEP */
    nbnxn_atomdata_set_ljcombparams(nbat, nbs);

    nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo);
}
/* Copies the shift vector array to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
                                  rvec             *shift_vec,
                                  nbnxn_atomdata_t *nbat)
{
    int i;

    nbat->bDynamicBox = bDynamicBox;
    for (i = 0; i < SHIFTS; i++)
    {
        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
    }
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
                                     int                  locality,
                                     gmx_bool             FillLocal,
                                     rvec                *x,
                                     nbnxn_atomdata_t    *nbat)
{
    int g0 = 0, g1 = 0;
    int nth, th;

    switch (locality)
    {
        case eatAll:
            g0 = 0;
            g1 = nbs->ngrid;
            break;
        case eatLocal:
            g0 = 0;
            g1 = 1;
            break;
        case eatNonlocal:
            g0 = 1;
            g1 = nbs->ngrid;
            break;
    }

    if (FillLocal)
    {
        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
    }

    nth = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (th = 0; th < nth; th++)
    {
        try
        {
            for (int g = g0; g < g1; g++)
            {
                const nbnxn_grid_t *grid;
                int                 cxy0, cxy1;

                grid = &nbs->grid[g];

                cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth;
                cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;

                for (int cxy = cxy0; cxy < cxy1; cxy++)
                {
                    int na, ash, na_fill;

                    na  = grid->cxy_na[cxy];
                    ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

                    if (g == 0 && FillLocal)
                    {
                        na_fill =
                            (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
                    }
                    else
                    {
                        /* We fill only the real particle locations.
                         * We assume the filling entries at the end have been
                         * properly set before during pair-list generation.
                         */
                        na_fill = na;
                    }
                    copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
                                           nbat->XFormat, nbat->x, ash);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
static void
nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
                           int i0, int i1)
{
    for (int i = i0; i < i1; i++)
    {
        dest[i] = 0;
    }
}
gmx_unused static void
nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
                            gmx_bool bDestSet,
                            real ** gmx_restrict src,
                            int nsrc,
                            int i0, int i1)
{
    if (bDestSet)
    {
        /* The destination buffer contains data, add to it */
        for (int i = i0; i < i1; i++)
        {
            for (int s = 0; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
    else
    {
        /* The destination buffer is uninitialized, set it first */
        for (int i = i0; i < i1; i++)
        {
            dest[i] = src[0][i];
            for (int s = 1; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
}
gmx_unused static void
nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest,
                                 gmx_bool gmx_unused bDestSet,
                                 real gmx_unused ** gmx_restrict src,
                                 int gmx_unused nsrc,
                                 int gmx_unused i0, int gmx_unused i1)
{
#if GMX_SIMD
    /* The SIMD width here is actually independent of that in the kernels,
     * but we use the same width for simplicity (usually optimal anyhow).
     */
    SimdReal dest_SSE, src_SSE;

    if (bDestSet)
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(dest+i);
            for (int s = 0; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
    else
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(src[0]+i);
            for (int s = 1; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
#endif
}
/* Add part of the force array(s) from nbnxn_atomdata_t to f */
static void
nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
                                    const nbnxn_atomdata_t *nbat,
                                    nbnxn_atomdata_output_t *out,
                                    int nfa,
                                    int a0, int a1,
                                    rvec *f)
{
    const int  *cell;
    const real *fnb;

    cell = nbs->cell;

    /* Loop over all columns and copy and fill */
    switch (nbat->FFormat)
    {
        case nbatXYZ:
        case nbatXYZQ:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    f[a][XX] += fnb[i];
                    f[a][YY] += fnb[i+1];
                    f[a][ZZ] += fnb[i+2];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i];
                        f[a][YY] += out[fa].f[i+1];
                        f[a][ZZ] += out[fa].f[i+2];
                    }
                }
            }
            break;
        case nbatX4:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX4];
                    f[a][YY] += fnb[i+YY*c_packX4];
                    f[a][ZZ] += fnb[i+ZZ*c_packX4];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX4];
                        f[a][YY] += out[fa].f[i+YY*c_packX4];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX4];
                    }
                }
            }
            break;
        case nbatX8:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX8];
                    f[a][YY] += fnb[i+YY*c_packX8];
                    f[a][ZZ] += fnb[i+ZZ*c_packX8];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX8];
                        f[a][YY] += out[fa].f[i+YY*c_packX8];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX8];
                    }
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
static inline unsigned char reverse_bits(unsigned char b)
{
    /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
    return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
}
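/* Illustrative note, not part of the original code: the 64-bit multiply
 * spreads copies of the byte so that the masked bits, taken modulo 1023,
 * fold back together in reversed order; for example reverse_bits(0x01)
 * gives 0x80 and reverse_bits(0xB4) gives 0x2D.
 */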
static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
                                                      int                     nth)
{
    const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;

    int next_pow2 = 1<<(gmx::log2I(nth-1)+1);

    assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */

    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);

#pragma omp parallel num_threads(nth)
    {
        try
        {
            int b0, b1, b;
            int i0, i1;
            int group_size, th;

            th = gmx_omp_get_thread_num();

            for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
            {
                int index[2], group_pos, partner_pos, wu;
                int partner_th = th ^ (group_size/2);

                if (group_size > 2)
                {
#ifdef TMPI_ATOMICS
                    /* wait on partner thread - replaces full barrier */
                    int sync_th, sync_group_size;

                    tMPI_Atomic_memory_barrier();                         /* guarantee data is saved before marking work as done */
                    tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */

                    /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
                    for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
                    {
                        sync_th &= ~(sync_group_size/4);
                    }
                    if (sync_th < nth) /* otherwise nothing to sync index[1] will be >=nout */
                    {
                        /* wait on the thread which computed input data in previous step */
                        while (tMPI_Atomic_get((volatile tMPI_Atomic_t *)&(nbat->syncStep[sync_th])) < group_size/2)
                        {
                        }
                        /* guarantee that no later load happens before wait loop is finished */
                        tMPI_Atomic_memory_barrier();
                    }
#else       /* TMPI_ATOMICS */
#pragma omp barrier
#endif
                }

                /* Calculate buffers to sum (result goes into first buffer) */
                group_pos = th % group_size;
                index[0]  = th - group_pos;
                index[1]  = index[0] + group_size/2;

                /* If no second buffer, nothing to do */
                if (index[1] >= nbat->nout && group_size > 2)
                {
                    continue;
                }

#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
#error reverse_bits assumes max 256 threads
#endif
                /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
                   This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
                   The permutation which allows this corresponds to reversing the bits of the group position.
                 */
                group_pos = reverse_bits(group_pos)/(256/group_size);

                partner_pos = group_pos ^ 1;

                /* loop over two work-units (own and partner) */
                for (wu = 0; wu < 2; wu++)
                {
                    if (wu == 1)
                    {
                        if (partner_th < nth)
                        {
                            break; /* partner exists, so we don't have to do its work */
                        }
                        else
                        {
                            group_pos = partner_pos;
                        }
                    }

                    /* Calculate the cell-block range for our thread */
                    b0 = (flags->nflag* group_pos   )/group_size;
                    b1 = (flags->nflag*(group_pos+1))/group_size;

                    for (b = b0; b < b1; b++)
                    {
                        i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                        i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                        if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2)
                        {
#if GMX_SIMD
                            nbnxn_atomdata_reduce_reals_simd
#else
                            nbnxn_atomdata_reduce_reals
#endif
                                (nbat->out[index[0]].f,
                                bitmask_is_set(flags->flag[b], index[0]) || group_size > 2,
                                &(nbat->out[index[1]].f), 1, i0, i1);
                        }
                        else if (!bitmask_is_set(flags->flag[b], index[0]))
                        {
                            nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
                                                       i0, i1);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
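/* Illustrative sketch, not part of the original code: a serial model of the
 * tree reduction above. Each level halves the number of live buffers by
 * pairwise adding the upper half of each group into its lower half, so after
 * log2(nout) levels all contributions end up in buffer 0 (out[0].f in the
 * real code). The flat double array, n and the function name are hypothetical
 * stand-ins; the real code additionally permutes positions and skips blocks
 * whose buffer flags are not set.
 */
gmx_unused static void treeReduceSketch(double *buf, int nout, int n)
{
    for (int groupSize = 2; groupSize <= nout; groupSize *= 2)
    {
        for (int first = 0; first < nout; first += groupSize)
        {
            const int partner = first + groupSize/2;
            for (int i = 0; i < n; i++)
            {
                buf[first*n + i] += buf[partner*n + i]; /* pairwise add into the lower buffer */
            }
        }
    }
}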
static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
                                                     int                     nth)
{
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            const nbnxn_buffer_flags_t *flags;
            int   nfptr;
            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];

            flags = &nbat->buffer_flags;

            /* Calculate the cell-block range for our thread */
            int b0 = (flags->nflag* th   )/nth;
            int b1 = (flags->nflag*(th+1))/nth;

            for (int b = b0; b < b1; b++)
            {
                int i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                int i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                nfptr = 0;
                for (int out = 1; out < nbat->nout; out++)
                {
                    if (bitmask_is_set(flags->flag[b], out))
                    {
                        fptr[nfptr++] = nbat->out[out].f;
                    }
                }
                if (nfptr > 0)
                {
#if GMX_SIMD
                    nbnxn_atomdata_reduce_reals_simd
#else
                    nbnxn_atomdata_reduce_reals
#endif
                        (nbat->out[0].f,
                        bitmask_is_set(flags->flag[b], 0),
                        fptr, nfptr,
                        i0, i1);
                }
                else if (!bitmask_is_set(flags->flag[b], 0))
                {
                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
                                               i0, i1);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
/* Add the force array(s) from nbnxn_atomdata_t to f */
void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
                                    int                     locality,
                                    const nbnxn_atomdata_t *nbat,
                                    rvec                   *f)
{
    int a0 = 0, na = 0;

    nbs_cycle_start(&nbs->cc[enbsCCreducef]);

    switch (locality)
    {
        case eatAll:
            a0 = 0;
            na = nbs->natoms_nonlocal;
            break;
        case eatLocal:
            a0 = 0;
            na = nbs->natoms_local;
            break;
        case eatNonlocal:
            a0 = nbs->natoms_local;
            na = nbs->natoms_nonlocal - nbs->natoms_local;
            break;
    }

    int nth = gmx_omp_nthreads_get(emntNonbonded);

    if (nbat->nout > 1)
    {
        if (locality != eatAll)
        {
            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
        }

        /* Reduce the force thread output buffers into buffer 0, before adding
         * them to the, differently ordered, "real" force buffer.
         */
        if (nbat->bUseTreeReduce)
        {
            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
        }
        else
        {
            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
        }
    }
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
                                                nbat->out,
                                                1,
                                                a0+((th+0)*na)/nth,
                                                a0+((th+1)*na)/nth,
                                                f);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
}
/* Adds the shift forces from nbnxn_atomdata_t to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                              rvec                   *fshift)
{
    const nbnxn_atomdata_output_t * out = nbat->out;

    for (int s = 0; s < SHIFTS; s++)
    {
        rvec sum;
        clear_rvec(sum);
        for (int th = 0; th < nbat->nout; th++)
        {
            sum[XX] += out[th].fshift[s*DIM+XX];
            sum[YY] += out[th].fshift[s*DIM+YY];
            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
        }
        rvec_inc(fshift[s], sum);
    }
}