/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "nbnxn_atomdata.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "thread_mpi/atomic.h"

#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/macros.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/smalloc.h"
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
{
    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
}
/* Free function for memory allocated with nbnxn_alloc_aligned */
void nbnxn_free_aligned(void *ptr)
{
    sfree_aligned(ptr);
}
/* Reallocation wrapper function for nbnxn data structures */
void nbnxn_realloc_void(void **ptr,
                        int nbytes_copy, int nbytes_new,
                        nbnxn_alloc_t *ma,
                        nbnxn_free_t  *mf)
{
    void *ptr_new;

    ma(&ptr_new, nbytes_new);

    if (nbytes_new > 0 && ptr_new == NULL)
    {
        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
    }

    if (nbytes_copy > 0)
    {
        if (nbytes_new < nbytes_copy)
        {
            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
        }
        memcpy(ptr_new, *ptr, nbytes_copy);
    }
    if (*ptr != NULL)
    {
        mf(*ptr);
    }
    *ptr = ptr_new;
}
/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
{
    int t;

    nbnxn_realloc_void((void **)&nbat->type,
                       nbat->natoms*sizeof(*nbat->type),
                       n*sizeof(*nbat->type),
                       nbat->alloc, nbat->free);
    nbnxn_realloc_void((void **)&nbat->lj_comb,
                       nbat->natoms*2*sizeof(*nbat->lj_comb),
                       n*2*sizeof(*nbat->lj_comb),
                       nbat->alloc, nbat->free);
    if (nbat->XFormat != nbatXYZQ)
    {
        nbnxn_realloc_void((void **)&nbat->q,
                           nbat->natoms*sizeof(*nbat->q),
                           n*sizeof(*nbat->q),
                           nbat->alloc, nbat->free);
    }
    if (nbat->nenergrp > 1)
    {
        nbnxn_realloc_void((void **)&nbat->energrp,
                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
                           n/nbat->na_c*sizeof(*nbat->energrp),
                           nbat->alloc, nbat->free);
    }
    nbnxn_realloc_void((void **)&nbat->x,
                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
                       n*nbat->xstride*sizeof(*nbat->x),
                       nbat->alloc, nbat->free);
    for (t = 0; t < nbat->nout; t++)
    {
        /* Allocate one element extra for possible signaling with GPUs */
        nbnxn_realloc_void((void **)&nbat->out[t].f,
                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
                           n*nbat->fstride*sizeof(*nbat->out[t].f),
                           nbat->alloc, nbat->free);
    }
    nbat->nalloc = n;
}
/* Initializes an nbnxn_atomdata_output_t data structure */
static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
                                       int nb_kernel_type,
                                       int nenergrp, int stride,
                                       nbnxn_alloc_t *ma)
{
    int cj_size;

    out->f = NULL;
    ma((void **)&out->fshift, SHIFTS*DIM*sizeof(*out->fshift));
    out->nV = nenergrp*nenergrp;
    ma((void **)&out->Vvdw, out->nV*sizeof(*out->Vvdw));
    ma((void **)&out->Vc,   out->nV*sizeof(*out->Vc));

    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
    {
        cj_size  = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
        ma((void **)&out->VSvdw, out->nVS*sizeof(*out->VSvdw));
        ma((void **)&out->VSc,   out->nVS*sizeof(*out->VSc));
    }
    else
    {
        out->nVS = 0;
    }
}
static void copy_int_to_nbat_int(const int *a, int na, int na_round,
                                 const int *in, int fill, int *innb)
{
    int i, j;

    j = 0;
    for (i = 0; i < na; i++)
    {
        innb[j++] = in[a[i]];
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i++)
    {
        innb[j++] = fill;
    }
}
static void clear_nbat_real(int na, int nbatFormat, real *xnb, int a0)
{
    int a, d, j, c;

    switch (nbatFormat)
    {
        case nbatXYZ:
            for (a = 0; a < na; a++)
            {
                for (d = 0; d < DIM; d++)
                {
                    xnb[(a0+a)*STRIDE_XYZ+d] = 0;
                }
            }
            break;
        case nbatXYZQ:
            for (a = 0; a < na; a++)
            {
                for (d = 0; d < DIM; d++)
                {
                    xnb[(a0+a)*STRIDE_XYZQ+d] = 0;
                }
            }
            break;
        case nbatX4:
            j = X4_IND_A(a0);
            c = a0 & (PACK_X4-1);
            for (a = 0; a < na; a++)
            {
                xnb[j+XX*PACK_X4] = 0;
                xnb[j+YY*PACK_X4] = 0;
                xnb[j+ZZ*PACK_X4] = 0;
                j++;
                c++;
                if (c == PACK_X4)
                {
                    j += (DIM-1)*PACK_X4;
                    c  = 0;
                }
            }
            break;
        case nbatX8:
            j = X8_IND_A(a0);
            c = a0 & (PACK_X8-1);
            for (a = 0; a < na; a++)
            {
                xnb[j+XX*PACK_X8] = 0;
                xnb[j+YY*PACK_X8] = 0;
                xnb[j+ZZ*PACK_X8] = 0;
                j++;
                c++;
                if (c == PACK_X8)
                {
                    j += (DIM-1)*PACK_X8;
                    c  = 0;
                }
            }
            break;
    }
}
/* Copies the coordinates to the nbnxn atom data format, adding filler particles where needed */
void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
                            rvec *x, int nbatFormat, real *xnb, int a0,
                            int cx, int cy, int cz)
{
    int i, j, c;

/* We might need to place filler particles to fill up the cell to na_round.
 * The coefficients (LJ and q) for such particles are zero.
 * But we might still get NaN as 0*NaN when distances are too small.
 * We hope that -107 nm is far enough away from zero
 * to avoid accidental short distances to particles shifted down for pbc.
 */
#define NBAT_FAR_AWAY 107

    switch (nbatFormat)
    {
        case nbatXYZ:
            j = a0*STRIDE_XYZ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
            }
            /* Complete the partially filled last cell with particles far apart.
             * This avoids numerical issues with filler atoms that would
             * otherwise be coincidentally close to real atoms.
             */
            for (; i < na_round; i++)
            {
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
            }
            break;
        case nbatXYZQ:
            j = a0*STRIDE_XYZQ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
                j++;
            }
            /* Complete the partially filled last cell with particles far apart */
            for (; i < na_round; i++)
            {
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
                xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
                j++;
            }
            break;
        case nbatX4:
            j = X4_IND_A(a0);
            c = a0 & (PACK_X4-1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*PACK_X4] = x[a[i]][XX];
                xnb[j+YY*PACK_X4] = x[a[i]][YY];
                xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == PACK_X4)
                {
                    j += (DIM-1)*PACK_X4;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with particles far apart */
            for (; i < na_round; i++)
            {
                xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
                xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
                xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
                j++;
                c++;
                if (c == PACK_X4)
                {
                    j += (DIM-1)*PACK_X4;
                    c  = 0;
                }
            }
            break;
        case nbatX8:
            j = X8_IND_A(a0);
            c = a0 & (PACK_X8 - 1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*PACK_X8] = x[a[i]][XX];
                xnb[j+YY*PACK_X8] = x[a[i]][YY];
                xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == PACK_X8)
                {
                    j += (DIM-1)*PACK_X8;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with particles far apart */
            for (; i < na_round; i++)
            {
                xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
                xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
                xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
                j++;
                c++;
                if (c == PACK_X8)
                {
                    j += (DIM-1)*PACK_X8;
                    c  = 0;
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
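/* Illustrative note (added for clarity, not from the original source):
 * with NBAT_FAR_AWAY = 107, filler slot i in the cell column at
 * (cx, cy, cz) = (1, 2, 0) gets
 *   x = -107*(1 + 1) = -214,  y = -107*(1 + 2) = -321,  z = -107*(1 + 0 + i),
 * so fillers in different columns, and different filler slots within one
 * column, never coincide and never come close to real coordinates.
 */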
/* Stores the LJ parameter data in a format convenient for different kernels */
static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
{
    int  nt, i, j;
    real c6, c12;

    nt = nbat->ntype;

    if (bSIMD)
    {
        /* nbfp_s4 stores two parameters using a stride of 4,
         * because this would suit x86 SIMD single-precision
         * quad-load intrinsics. There's a slight inefficiency in
         * allocating and initializing nbfp_s4 when it might not
         * be used, but introducing the conditional code is not
         * really worth it. */
        nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
        for (i = 0; i < nt; i++)
        {
            for (j = 0; j < nt; j++)
            {
                nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
                nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
                nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
                nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
            }
        }
    }

    /* We use combination rule data for SIMD combination rule kernels
     * and with LJ-PME kernels. We then only need parameters per atom type,
     * not per pair of atom types.
     */
    switch (nbat->comb_rule)
    {
        case ljcrGEOM:
            nbat->comb_rule = ljcrGEOM;

            for (i = 0; i < nt; i++)
            {
                /* Store the sqrt of the diagonal from the nbfp matrix */
                nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
                nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
            }
            break;
        case ljcrLB:
            for (i = 0; i < nt; i++)
            {
                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
                c6  = nbat->nbfp[(i*nt+i)*2  ];
                c12 = nbat->nbfp[(i*nt+i)*2+1];
                if (c6 > 0 && c12 > 0)
                {
                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
                     * so we get 6*C6 and 12*C12 after combining.
                     */
                    nbat->nbfp_comb[i*2  ] = 0.5*pow(c12/c6, 1.0/6.0);
                    nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
                }
                else
                {
                    nbat->nbfp_comb[i*2  ] = 0;
                    nbat->nbfp_comb[i*2+1] = 0;
                }
            }
            break;
        case ljcrNONE:
            /* We always store the full matrix (see code above) */
            break;
        default:
            gmx_incons("Unknown combination rule");
    }
}
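/* Worked algebra for the ljcrLB storage above (added for clarity, not part of
 * the original source). The nbfp diagonal holds c6 = 6*C6 = 24*eps*sigma^6 and
 * c12 = 12*C12 = 48*eps*sigma^12, so
 *   0.5*(c12/c6)^(1/6) = 0.5*2^(1/6)*sigma   and   sqrt(c6*c6/c12) = sqrt(12*eps).
 * For a pair i,j the kernel can then form
 *   s = A_i + A_j = 2^(1/6)*(sigma_i+sigma_j)/2 = 2^(1/6)*sigma_ij
 *   e = B_i * B_j = 12*sqrt(eps_i*eps_j)        = 12*eps_ij
 * (A and B being the two stored values) and gets e*s^6 = 24*eps_ij*sigma_ij^6
 * = 6*C6_ij and e*(s^6)^2 = 12*C12_ij, i.e. exactly the 6*C6/12*C12 convention
 * used in nbat->nbfp.
 */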
#ifdef GMX_NBNXN_SIMD
static void
nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
{
    int       i, j;
    const int simd_width = GMX_SIMD_REAL_WIDTH;
    int       simd_excl_size;
    /* Set the diagonal cluster pair exclusion mask setup data.
     * In the kernel we check 0 < j - i to generate the masks.
     * Here we store j - i for generating the mask for the first i,
     * we subtract 0.5 to avoid rounding issues.
     * In the kernel we can subtract 1 to generate the subsequent mask.
     */
    int        simd_4xn_diag_size;
    const real simdFalse = -1, simdTrue = 1;
    real      *simd_interaction_array;

    simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
    snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
    for (j = 0; j < simd_4xn_diag_size; j++)
    {
        nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
    }

    snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
    for (j = 0; j < simd_width/2; j++)
    {
        /* The j-cluster size is half the SIMD width */
        nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
        /* The next half of the SIMD width is for i + 1 */
        nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
    }

    /* We use up to 32 bits for exclusion masking.
     * The same masks are used for the 4xN and 2x(N+N) kernels.
     * The masks are read either into epi32 SIMD registers or into
     * real SIMD registers (together with a cast).
     * In single precision this means the real and epi32 SIMD registers
     * are of equal size.
     * In double precision the epi32 registers can be smaller than
     * the real registers, so depending on the architecture, we might
     * need to use two, identical, 32-bit masks per real.
     */
    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
    snew_aligned(nbat->simd_exclusion_filter1, simd_excl_size,   NBNXN_MEM_ALIGN);
    snew_aligned(nbat->simd_exclusion_filter2, simd_excl_size*2, NBNXN_MEM_ALIGN);

    for (j = 0; j < simd_excl_size; j++)
    {
        /* Set the consecutive bits for masking pair exclusions */
        nbat->simd_exclusion_filter1[j]       = (1U << j);
        nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
        nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
    }

#if (defined GMX_SIMD_IBM_QPX)
    /* The QPX kernels shouldn't do the bit masking that is done on
     * x86, because the SIMD units lack bit-wise operations. Instead,
     * we generate a vector of all 2^4 possible ways an i atom
     * interacts with its 4 j atoms. Each array entry contains
     * simd_width signed ints that are read in a single SIMD
     * load. These ints must contain values that will be interpreted
     * as true and false when loaded in the SIMD floating-point
     * registers, i.e. any positive or any negative value,
     * respectively. Each array entry encodes how this i atom will
     * interact with the 4 j atoms. Matching code exists in
     * set_ci_top_excls() to generate indices into this array. Those
     * indices are used in the kernels. */

    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
    const int qpx_simd_width = GMX_SIMD_REAL_WIDTH;
    snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN);
    for (j = 0; j < simd_excl_size; j++)
    {
        int index = j * qpx_simd_width;
        for (i = 0; i < qpx_simd_width; i++)
        {
            simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
        }
    }
    nbat->simd_interaction_array = simd_interaction_array;
#endif
}
#endif
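/* Illustrative example (added for clarity, not from the original source),
 * assuming GMX_SIMD_REAL_WIDTH = 4:
 *   simd_4xn_diagonal_j_minus_i = { -0.5, 0.5, 1.5, 2.5 },
 * so the kernel test "0 < j - i" masks out j <= i for the first i-atom and,
 * after subtracting 1 per i, for the following i-atoms as well. The exclusion
 * filters hold one bit per j-atom: simd_exclusion_filter1 = { 1, 2, 4, 8, ... }
 * = (1U << j).
 */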
/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(FILE *fp,
                         nbnxn_atomdata_t *nbat,
                         int nb_kernel_type,
                         int enbnxninitcombrule,
                         int ntype, const real *nbfp,
                         int n_energygroups,
                         int nout,
                         nbnxn_alloc_t *alloc,
                         nbnxn_free_t  *free)
{
    int      i, j, nth;
    real     c6, c12, tol;
    char    *ptr;
    gmx_bool simple, bCombGeom, bCombLB, bSIMD;

    if (alloc == NULL)
    {
        nbat->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbat->alloc = alloc;
    }
    if (free == NULL)
    {
        nbat->free = nbnxn_free_aligned;
    }
    else
    {
        nbat->free = free;
    }

    if (debug)
    {
        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
    }
    nbat->ntype = ntype + 1;
    nbat->alloc((void **)&nbat->nbfp,
                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
    nbat->alloc((void **)&nbat->nbfp_comb, nbat->ntype*2*sizeof(*nbat->nbfp_comb));

    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
     * force-field floating point parameters.
     */
    tol = 1e-5;
    ptr = getenv("GMX_LJCOMB_TOL");
    if (ptr != NULL)
    {
        double dbl;

        sscanf(ptr, "%lf", &dbl);
        tol = dbl;
    }
    bCombGeom = TRUE;
    bCombLB   = TRUE;

    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
     * to check for the LB rule.
     */
    for (i = 0; i < ntype; i++)
    {
        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
        if (c6 > 0 && c12 > 0)
        {
            nbat->nbfp_comb[i*2  ] = pow(c12/c6, 1.0/6.0);
            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
        }
        else if (c6 == 0 && c12 == 0)
        {
            nbat->nbfp_comb[i*2  ] = 0;
            nbat->nbfp_comb[i*2+1] = 0;
        }
        else
        {
            /* Cannot use LB rule with only dispersion or repulsion */
            bCombLB = FALSE;
        }
    }

    for (i = 0; i < nbat->ntype; i++)
    {
        for (j = 0; j < nbat->ntype; j++)
        {
            if (i < ntype && j < ntype)
            {
                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
                 */
                c6  = nbfp[(i*ntype+j)*2  ];
                c12 = nbfp[(i*ntype+j)*2+1];
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;

                /* Compare 6*C6 and 12*C12 for geometric combination rule */
                bCombGeom = bCombGeom &&
                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);

                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
                c6     /= 6.0;
                c12    /= 12.0;
                bCombLB = bCombLB &&
                    ((c6 == 0 && c12 == 0 &&
                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
                     (c6 > 0 && c12 > 0 &&
                      gmx_within_tol(pow(c12/c6, 1.0/6.0), 0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
                      gmx_within_tol(0.25*c6*c6/c12, sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
            }
            else
            {
                /* Add zero parameters for the additional dummy atom type */
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
                bCombGeom, bCombLB);
    }

    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    switch (enbnxninitcombrule)
    {
        case enbnxninitcombruleDETECT:
            /* We prefer the geometric combination rule,
             * as that gives a slightly faster kernel than the LB rule.
             */
            if (bCombGeom)
            {
                nbat->comb_rule = ljcrGEOM;
            }
            else if (bCombLB)
            {
                nbat->comb_rule = ljcrLB;
            }
            else
            {
                nbat->comb_rule = ljcrNONE;

                nbat->free(nbat->nbfp_comb);
            }

            if (fp)
            {
                if (nbat->comb_rule == ljcrNONE)
                {
                    fprintf(fp, "Using full Lennard-Jones parameter combination matrix\n\n");
                }
                else
                {
                    fprintf(fp, "Using %s Lennard-Jones combination rule\n\n",
                            nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
                }
            }
            break;
        case enbnxninitcombruleGEOM:
            nbat->comb_rule = ljcrGEOM;
            break;
        case enbnxninitcombruleLB:
            nbat->comb_rule = ljcrLB;
            break;
        case enbnxninitcombruleNONE:
            nbat->comb_rule = ljcrNONE;

            nbat->free(nbat->nbfp_comb);
            break;
        default:
            gmx_incons("Unknown enbnxninitcombrule");
    }

    bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
             nb_kernel_type == nbnxnk4xN_SIMD_2xNN);

    set_lj_parameter_data(nbat, bSIMD);

    nbat->natoms  = 0;
    nbat->type    = NULL;
    nbat->lj_comb = NULL;
    if (simple)
    {
        int pack_x;

        if (bSIMD)
        {
            pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
                         nbnxn_kernel_to_cluster_j_size(nb_kernel_type));
            switch (pack_x)
            {
                case 4:
                    nbat->XFormat = nbatX4;
                    break;
                case 8:
                    nbat->XFormat = nbatX8;
                    break;
                default:
                    gmx_incons("Unsupported packing width");
            }
        }
        else
        {
            nbat->XFormat = nbatXYZ;
        }

        nbat->FFormat = nbat->XFormat;
    }
    else
    {
        nbat->XFormat = nbatXYZQ;
        nbat->FFormat = nbatXYZ;
    }
    nbat->q        = NULL;
    nbat->nenergrp = n_energygroups;
    if (!simple)
    {
        /* Energy groups not supported yet for super-sub lists */
        if (n_energygroups > 1 && fp != NULL)
        {
            fprintf(fp, "\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
        }
        nbat->nenergrp = 1;
    }
    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
    if (nbat->nenergrp > 64)
    {
        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
    }
    nbat->neg_2log = 1;
    while (nbat->nenergrp > (1<<nbat->neg_2log))
    {
        nbat->neg_2log++;
    }
    nbat->energrp = NULL;
    nbat->alloc((void **)&nbat->shift_vec, SHIFTS*sizeof(*nbat->shift_vec));
    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->x       = NULL;

#ifdef GMX_NBNXN_SIMD
    if (simple)
    {
        nbnxn_atomdata_init_simple_exclusion_masks(nbat);
    }
#endif

    /* Initialize the output data structures */
    nbat->nout = nout;
    snew(nbat->out, nbat->nout);
    nbat->nalloc = 0;
    for (i = 0; i < nbat->nout; i++)
    {
        nbnxn_atomdata_output_init(&nbat->out[i],
                                   nb_kernel_type,
                                   nbat->nenergrp, 1<<nbat->neg_2log,
                                   nbat->alloc);
    }
    nbat->buffer_flags.flag        = NULL;
    nbat->buffer_flags.flag_nalloc = 0;

    nth = gmx_omp_nthreads_get(emntNonbonded);

    ptr = getenv("GMX_USE_TREEREDUCE");
    if (ptr != NULL)
    {
        nbat->bUseTreeReduce = strtol(ptr, 0, 10);
    }
#if defined __MIC__
    else if (nth > 8) /*on the CPU we currently don't benefit even at 32*/
    {
        nbat->bUseTreeReduce = 1;
    }
#endif
    else
    {
        nbat->bUseTreeReduce = 0;
    }
    if (nbat->bUseTreeReduce)
    {
        if (fp)
        {
            fprintf(fp, "Using tree force reduction\n\n");
        }
        snew(nbat->syncStep, nth);
    }
}
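/* Worked example (added for clarity, not from the original source) for the
 * LB-rule detection in nbnxn_atomdata_init above: nbfp stores 6*C6 and 12*C12,
 * so c6 = nbfp[..]/6.0 = C6 and c12 = nbfp[..]/12.0 = C12. With
 * C6 = 4*eps*sigma^6 and C12 = 4*eps*sigma^12 this gives
 *   sigma = (c12/c6)^(1/6)   and   eps = 0.25*c6*c6/c12,
 * which are exactly the values placed temporarily in nbat->nbfp_comb and then
 * compared, pair by pair, against 0.5*(sigma_i+sigma_j) and sqrt(eps_i*eps_j).
 */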
static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
                                       const int *type, int na,
                                       real *ljparam_at)
{
    int is, k, i;

    /* The LJ params follow the combination rule:
     * copy the params for the type array to the atom array.
     */
    for (is = 0; is < na; is += PACK_X4)
    {
        for (k = 0; k < PACK_X4; k++)
        {
            i = is + k;
            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
            ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
        }
    }
}

static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
                                       const int *type, int na,
                                       real *ljparam_at)
{
    int is, k, i;

    /* The LJ params follow the combination rule:
     * copy the params for the type array to the atom array.
     */
    for (is = 0; is < na; is += PACK_X8)
    {
        for (k = 0; k < PACK_X8; k++)
        {
            i = is + k;
            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
            ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
        }
    }
}
/* Sets the atom type in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
                                         int                  ngrid,
                                         const nbnxn_search_t nbs,
                                         const int           *type)
{
    int                 g, i, ncz, ash;
    const nbnxn_grid_t *grid;

    for (g = 0; g < ngrid; g++)
    {
        grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (i = 0; i < grid->ncx*grid->ncy; i++)
        {
            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

            copy_int_to_nbat_int(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
                                 type, nbat->ntype-1, nbat->type+ash);
        }
    }
}
/* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
                                            int                  ngrid,
                                            const nbnxn_search_t nbs)
{
    int                 g, i, ncz, ash;
    const nbnxn_grid_t *grid;

    if (nbat->comb_rule != ljcrNONE)
    {
        for (g = 0; g < ngrid; g++)
        {
            grid = &nbs->grid[g];

            /* Loop over all columns and copy and fill */
            for (i = 0; i < grid->ncx*grid->ncy; i++)
            {
                ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
                ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

                if (nbat->XFormat == nbatX4)
                {
                    copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
                                               nbat->type+ash, ncz*grid->na_sc,
                                               nbat->lj_comb+ash*2);
                }
                else if (nbat->XFormat == nbatX8)
                {
                    copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
                                               nbat->type+ash, ncz*grid->na_sc,
                                               nbat->lj_comb+ash*2);
                }
            }
        }
    }
}
/* Sets the charges in nbnxn_atomdata_t *nbat */
static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
                                       int                  ngrid,
                                       const nbnxn_search_t nbs,
                                       const real          *charge)
{
    int                 g, cxy, ncz, ash, na, na_round, i, j;
    real               *q;
    const nbnxn_grid_t *grid;

    for (g = 0; g < ngrid; g++)
    {
        grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (cxy = 0; cxy < grid->ncx*grid->ncy; cxy++)
        {
            ash      = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
            na       = grid->cxy_na[cxy];
            na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;

            if (nbat->XFormat == nbatXYZQ)
            {
                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q += STRIDE_XYZQ;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q += STRIDE_XYZQ;
                }
            }
            else
            {
                q = nbat->q + ash;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q++;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q++;
                }
            }
        }
    }
}
/* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
 * This is to automatically remove the RF/PME self term in the nbnxn kernels.
 * Part of the zero interactions are still calculated in the normal kernels.
 * All perturbed interactions are calculated in the free energy kernel,
 * using the original charge and LJ data, not nbnxn_atomdata_t.
 */
static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
                                    int                  ngrid,
                                    const nbnxn_search_t nbs)
{
    real               *q;
    int                 stride_q, g, nsubc, c_offset, c, subc, i, ind;
    const nbnxn_grid_t *grid;

    if (nbat->XFormat == nbatXYZQ)
    {
        q        = nbat->x + ZZ + 1;
        stride_q = STRIDE_XYZQ;
    }
    else
    {
        q        = nbat->q;
        stride_q = 1;
    }

    for (g = 0; g < ngrid; g++)
    {
        grid = &nbs->grid[g];
        if (grid->bSimple)
        {
            nsubc = 1;
        }
        else
        {
            nsubc = GPU_NSUBCELL;
        }

        c_offset = grid->cell0*grid->na_sc;

        /* Loop over all clusters and zero the perturbed entries */
        for (c = 0; c < grid->nc*nsubc; c++)
        {
            /* Does this cluster contain perturbed particles? */
            if (grid->fep[c] != 0)
            {
                for (i = 0; i < grid->na_c; i++)
                {
                    /* Is this a perturbed particle? */
                    if (grid->fep[c] & (1 << i))
                    {
                        ind = c_offset + c*grid->na_c + i;
                        /* Set atom type and charge to non-interacting */
                        nbat->type[ind] = nbat->ntype - 1;
                        q[ind*stride_q] = 0;
                    }
                }
            }
        }
    }
}
1043 static void copy_egp_to_nbat_egps(const int *a
, int na
, int na_round
,
1044 int na_c
, int bit_shift
,
1045 const int *in
, int *innb
)
1051 for (i
= 0; i
< na
; i
+= na_c
)
1053 /* Store na_c energy group numbers into one int */
1055 for (sa
= 0; sa
< na_c
; sa
++)
1060 comb
|= (GET_CGINFO_GID(in
[at
]) << (sa
*bit_shift
));
1065 /* Complete the partially filled last cell with fill */
1066 for (; i
< na_round
; i
+= na_c
)
/* Set the energy group indices for atoms in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
                                            int                  ngrid,
                                            const nbnxn_search_t nbs,
                                            const int           *atinfo)
{
    int                 g, i, ncz, ash;
    const nbnxn_grid_t *grid;

    if (nbat->nenergrp == 1)
    {
        return;
    }

    for (g = 0; g < ngrid; g++)
    {
        grid = &nbs->grid[g];

        /* Loop over all columns and copy and fill */
        for (i = 0; i < grid->ncx*grid->ncy; i++)
        {
            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;

            copy_egp_to_nbat_egps(nbs->a+ash, grid->cxy_na[i], ncz*grid->na_sc,
                                  nbat->na_c, nbat->neg_2log,
                                  atinfo, nbat->energrp+(ash>>grid->na_c_2log));
        }
    }
}
/* Sets all required atom parameter data in nbnxn_atomdata_t */
void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
                        int                  locality,
                        const nbnxn_search_t nbs,
                        const t_mdatoms     *mdatoms,
                        const int           *atinfo)
{
    int ngrid;

    if (locality == eatLocal)
    {
        ngrid = 1;
    }
    else
    {
        ngrid = nbs->ngrid;
    }

    nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA);

    nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA);

    if (nbs->bFEP)
    {
        nbnxn_atomdata_mask_fep(nbat, ngrid, nbs);
    }

    /* This must be done after masking types for FEP */
    nbnxn_atomdata_set_ljcombparams(nbat, ngrid, nbs);

    nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo);
}
/* Copies the shift vector array to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
                                  rvec             *shift_vec,
                                  nbnxn_atomdata_t *nbat)
{
    int i;

    nbat->bDynamicBox = bDynamicBox;
    for (i = 0; i < SHIFTS; i++)
    {
        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
    }
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
                                     int                  locality,
                                     gmx_bool             FillLocal,
                                     rvec                *x,
                                     nbnxn_atomdata_t    *nbat)
{
    int g0 = 0, g1 = 0;
    int nth, th;

    switch (locality)
    {
        case eatAll:
            g0 = 0;
            g1 = nbs->ngrid;
            break;
        case eatLocal:
            g0 = 0;
            g1 = 1;
            break;
        case eatNonlocal:
            g0 = 1;
            g1 = nbs->ngrid;
            break;
    }

    if (FillLocal)
    {
        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
    }

    nth = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (th = 0; th < nth; th++)
    {
        int g;

        for (g = g0; g < g1; g++)
        {
            const nbnxn_grid_t *grid;
            int                 cxy0, cxy1, cxy;

            grid = &nbs->grid[g];

            cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth;
            cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth;

            for (cxy = cxy0; cxy < cxy1; cxy++)
            {
                int na, ash, na_fill;

                na  = grid->cxy_na[cxy];
                ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

                if (g == 0 && FillLocal)
                {
                    na_fill =
                        (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
                }
                else
                {
                    /* We fill only the real particle locations.
                     * We assume the filling entries at the end have been
                     * properly set before during ns.
                     */
                    na_fill = na;
                }
                copy_rvec_to_nbat_real(nbs->a+ash, na, na_fill, x,
                                       nbat->XFormat, nbat->x, ash,
                                       0, 0, 0);
            }
        }
    }
}
static void
nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
                           int i0, int i1)
{
    int i;

    for (i = i0; i < i1; i++)
    {
        dest[i] = 0;
    }
}
static void
nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
                            gmx_bool bDestSet,
                            real ** gmx_restrict src,
                            int nsrc,
                            int i0, int i1)
{
    int i, s;

    if (bDestSet)
    {
        /* The destination buffer contains data, add to it */
        for (i = i0; i < i1; i++)
        {
            for (s = 0; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
    else
    {
        /* The destination buffer is uninitialized, set it first */
        for (i = i0; i < i1; i++)
        {
            dest[i] = src[0][i];
            for (s = 1; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
}
static void
nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest,
                                 gmx_bool gmx_unused bDestSet,
                                 real gmx_unused ** gmx_restrict src,
                                 int gmx_unused nsrc,
                                 int gmx_unused i0, int gmx_unused i1)
{
#ifdef GMX_NBNXN_SIMD
    /* The SIMD width here is actually independent of that in the kernels,
     * but we use the same width for simplicity (usually optimal anyhow).
     */
    int             i, s;
    gmx_simd_real_t dest_SSE, src_SSE;

    if (bDestSet)
    {
        for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = gmx_simd_load_r(dest+i);
            for (s = 0; s < nsrc; s++)
            {
                src_SSE  = gmx_simd_load_r(src[s]+i);
                dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE);
            }
            gmx_simd_store_r(dest+i, dest_SSE);
        }
    }
    else
    {
        for (i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = gmx_simd_load_r(src[0]+i);
            for (s = 1; s < nsrc; s++)
            {
                src_SSE  = gmx_simd_load_r(src[s]+i);
                dest_SSE = gmx_simd_add_r(dest_SSE, src_SSE);
            }
            gmx_simd_store_r(dest+i, dest_SSE);
        }
    }
#endif
}
/* Add part of the force array(s) from nbnxn_atomdata_t to f */
static void
nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
                                    const nbnxn_atomdata_t *nbat,
                                    nbnxn_atomdata_output_t *out,
                                    int nfa,
                                    int a0, int a1,
                                    rvec *f)
{
    int         a, i, fa;
    const int  *cell;
    const real *fnb;

    cell = nbs->cell;

    /* Loop over all atoms in our range and add the reordered forces */
    switch (nbat->FFormat)
    {
        case nbatXYZ:
        case nbatXYZQ:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (a = a0; a < a1; a++)
                {
                    i = cell[a]*nbat->fstride;

                    f[a][XX] += fnb[i];
                    f[a][YY] += fnb[i+1];
                    f[a][ZZ] += fnb[i+2];
                }
            }
            else
            {
                for (a = a0; a < a1; a++)
                {
                    i = cell[a]*nbat->fstride;

                    for (fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i];
                        f[a][YY] += out[fa].f[i+1];
                        f[a][ZZ] += out[fa].f[i+2];
                    }
                }
            }
            break;
        case nbatX4:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (a = a0; a < a1; a++)
                {
                    i = X4_IND_A(cell[a]);

                    f[a][XX] += fnb[i+XX*PACK_X4];
                    f[a][YY] += fnb[i+YY*PACK_X4];
                    f[a][ZZ] += fnb[i+ZZ*PACK_X4];
                }
            }
            else
            {
                for (a = a0; a < a1; a++)
                {
                    i = X4_IND_A(cell[a]);

                    for (fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*PACK_X4];
                        f[a][YY] += out[fa].f[i+YY*PACK_X4];
                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
                    }
                }
            }
            break;
        case nbatX8:
            if (nfa == 1)
            {
                fnb = out[0].f;

                for (a = a0; a < a1; a++)
                {
                    i = X8_IND_A(cell[a]);

                    f[a][XX] += fnb[i+XX*PACK_X8];
                    f[a][YY] += fnb[i+YY*PACK_X8];
                    f[a][ZZ] += fnb[i+ZZ*PACK_X8];
                }
            }
            else
            {
                for (a = a0; a < a1; a++)
                {
                    i = X8_IND_A(cell[a]);

                    for (fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*PACK_X8];
                        f[a][YY] += out[fa].f[i+YY*PACK_X8];
                        f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
                    }
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}
static gmx_inline unsigned char reverse_bits(unsigned char b)
{
    /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
    return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
}
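/* Worked example (added for clarity, not from the original source):
 * reverse_bits(0x01) = 0x80 and reverse_bits(0xB4 = 10110100b) = 00101101b = 0x2D.
 * In the tree reduction below, group_pos = reverse_bits(group_pos)/(256/group_size);
 * for group_size = 4 this permutes positions {0,1,2,3} to {0,2,1,3}, so one of
 * the two buffers summed in a step was produced by the same thread in the
 * previous step.
 */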
static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
                                                      int                     nth)
{
    const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;

    int next_pow2 = 1<<(gmx_log2i(nth-1)+1);

    assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */

    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);

#pragma omp parallel num_threads(nth)
    {
        int b0, b1, b;
        int i0, i1;
        int group_size, th;

        th = gmx_omp_get_thread_num();

        for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
        {
            int index[2], group_pos, partner_pos, wu;
            int partner_th = th ^ (group_size/2);

            if (group_size > 2)
            {
#ifdef TMPI_ATOMICS
                /* wait on partner thread - replaces full barrier */
                int sync_th, sync_group_size;

                tMPI_Atomic_memory_barrier();                         /* guarantee data is saved before marking work as done */
                tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */

                /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
                for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
                {
                    sync_th &= ~(sync_group_size/4);
                }
                if (sync_th < nth) /* otherwise there is nothing to sync; index[1] will be >= nout */
                {
                    /* wait on the thread which computed input data in previous step */
                    while (tMPI_Atomic_get((volatile tMPI_Atomic_t *)&(nbat->syncStep[sync_th])) < group_size/2)
                    {
                        /* busy wait */
                    }
                    /* guarantee that no later load happens before wait loop is finished */
                    tMPI_Atomic_memory_barrier();
                }
#else           /* TMPI_ATOMICS */
#pragma omp barrier
#endif
            }

            /* Calculate buffers to sum (result goes into first buffer) */
            group_pos = th % group_size;
            index[0]  = th - group_pos;
            index[1]  = index[0] + group_size/2;

            /* If no second buffer, nothing to do */
            if (index[1] >= nbat->nout && group_size > 2)
            {
                continue;
            }

#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
#error reverse_bits assumes max 256 threads
#endif
            /* Position is permuted so that one of the 2 vectors being added was
             * computed on the same thread in the previous step.
             * This improves locality and makes it possible to sync with just a
             * single thread between steps (=the levels in the btree).
             * The permutation which allows this corresponds to reversing the
             * bits of the group position.
             */
            group_pos = reverse_bits(group_pos)/(256/group_size);

            partner_pos = group_pos ^ 1;

            /* loop over two work-units (own and partner) */
            for (wu = 0; wu < 2; wu++)
            {
                if (wu == 1)
                {
                    if (partner_th < nth)
                    {
                        break; /* partner exists, so we don't have to do its work */
                    }
                    else
                    {
                        group_pos = partner_pos;
                    }
                }

                /* Calculate the cell-block range for our thread */
                b0 = (flags->nflag* group_pos   )/group_size;
                b1 = (flags->nflag*(group_pos+1))/group_size;

                for (b = b0; b < b1; b++)
                {
                    i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                    i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                    if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2)
                    {
#ifdef GMX_NBNXN_SIMD
                        nbnxn_atomdata_reduce_reals_simd
#else
                        nbnxn_atomdata_reduce_reals
#endif
                            (nbat->out[index[0]].f,
                            bitmask_is_set(flags->flag[b], index[0]) || group_size > 2,
                            &(nbat->out[index[1]].f), 1, i0, i1);
                    }
                    else if (!bitmask_is_set(flags->flag[b], index[0]))
                    {
                        nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
                                                   i0, i1);
                    }
                }
            }
        }
    }
}
static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
                                                     int                     nth)
{
    int th;

#pragma omp parallel for num_threads(nth) schedule(static)
    for (th = 0; th < nth; th++)
    {
        const nbnxn_buffer_flags_t *flags;
        int   b0, b1, b;
        int   i0, i1;
        int   nfptr;
        int   out;
        real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];

        flags = &nbat->buffer_flags;

        /* Calculate the cell-block range for our thread */
        b0 = (flags->nflag* th   )/nth;
        b1 = (flags->nflag*(th+1))/nth;

        for (b = b0; b < b1; b++)
        {
            i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
            i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

            nfptr = 0;
            for (out = 1; out < nbat->nout; out++)
            {
                if (bitmask_is_set(flags->flag[b], out))
                {
                    fptr[nfptr++] = nbat->out[out].f;
                }
            }
            if (nfptr > 0)
            {
#ifdef GMX_NBNXN_SIMD
                nbnxn_atomdata_reduce_reals_simd
#else
                nbnxn_atomdata_reduce_reals
#endif
                    (nbat->out[0].f,
                     bitmask_is_set(flags->flag[b], 0),
                     fptr, nfptr,
                     i0, i1);
            }
            else if (!bitmask_is_set(flags->flag[b], 0))
            {
                nbnxn_atomdata_clear_reals(nbat->out[0].f,
                                           i0, i1);
            }
        }
    }
}
1610 void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs
,
1612 const nbnxn_atomdata_t
*nbat
,
1618 nbs_cycle_start(&nbs
->cc
[enbsCCreducef
]);
1624 na
= nbs
->natoms_nonlocal
;
1628 na
= nbs
->natoms_local
;
1631 a0
= nbs
->natoms_local
;
1632 na
= nbs
->natoms_nonlocal
- nbs
->natoms_local
;
1636 nth
= gmx_omp_nthreads_get(emntNonbonded
);
1640 if (locality
!= eatAll
)
1642 gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
1645 /* Reduce the force thread output buffers into buffer 0, before adding
1646 * them to the, differently ordered, "real" force buffer.
1648 if (nbat
->bUseTreeReduce
)
1650 nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat
, nth
);
1654 nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat
, nth
);
1657 #pragma omp parallel for num_threads(nth) schedule(static)
1658 for (th
= 0; th
< nth
; th
++)
1660 nbnxn_atomdata_add_nbat_f_to_f_part(nbs
, nbat
,
1668 nbs_cycle_stop(&nbs
->cc
[enbsCCreducef
]);
/* Adds the shift forces from nbnxn_atomdata_t to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                              rvec                   *fshift)
{
    const nbnxn_atomdata_output_t *out;
    int  th;
    int  s;
    rvec sum;

    out = nbat->out;

    for (s = 0; s < SHIFTS; s++)
    {
        clear_rvec(sum);
        for (th = 0; th < nbat->nout; th++)
        {
            sum[XX] += out[th].fshift[s*DIM+XX];
            sum[YY] += out[th].fshift[s*DIM+YY];
            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
        }
        rvec_inc(fshift[s], sum);
    }
}