2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "kernel_gpu_ref.h"
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/utilities.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/mdlib/ppforceworkload.h"
47 #include "gromacs/mdtypes/md_enums.h"
48 #include "gromacs/nbnxm/atomdata.h"
49 #include "gromacs/nbnxm/nbnxm.h"
50 #include "gromacs/nbnxm/pairlist.h"
51 #include "gromacs/pbcutil/ishift.h"
52 #include "gromacs/utility/fatalerror.h"
54 static const int c_numClPerSupercl
= c_nbnxnGpuNumClusterPerSupercluster
;
55 static const int c_clSize
= c_nbnxnGpuClusterSize
;
58 nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu
*nbl
,
59 const nbnxn_atomdata_t
*nbat
,
60 const interaction_const_t
*iconst
,
62 const gmx::ForceFlags
&forceFlags
,
64 gmx::ArrayRef
<real
> f
,
70 const real
*Ftab
= nullptr;
71 real rcut2
, rvdw2
, rlist2
;
76 int cj4_ind0
, cj4_ind1
, cj4_ind
;
78 int ic
, jc
, ia
, ja
, is
, ifs
, js
, jfs
, im
, jm
;
82 real fscal
, tx
, ty
, tz
;
85 real qq
, vcoul
= 0, krsq
, vctot
;
91 real Vvdw_rep
, Vvdw_disp
;
92 real ix
, iy
, iz
, fix
, fiy
, fiz
;
94 real dx
, dy
, dz
, rsq
, rinv
;
98 const nbnxn_excl_t
*excl
[2];
100 int npair_tot
, npair
;
101 int nhwu
, nhwu_pruned
;
103 if (nbl
->na_ci
!= c_clSize
)
105 gmx_fatal(FARGS
, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl
->na_ci
, c_clSize
);
108 if (clearF
== enbvClearFYes
)
116 bEwald
= EEL_FULL(iconst
->eeltype
);
119 Ftab
= iconst
->coulombEwaldTables
->tableF
.data();
122 rcut2
= iconst
->rcoulomb
*iconst
->rcoulomb
;
123 rvdw2
= iconst
->rvdw
*iconst
->rvdw
;
125 rlist2
= nbl
->rlist
*nbl
->rlist
;
127 const int *type
= nbat
->params().type
.data();
128 facel
= iconst
->epsfac
;
129 const real
*shiftvec
= shift_vec
[0];
130 const real
*vdwparam
= nbat
->params().nbfp
.data();
131 ntype
= nbat
->params().numTypes
;
133 const real
*x
= nbat
->x().data();
139 for (const nbnxn_sci_t
&nbln
: nbl
->sci
)
142 shX
= shiftvec
[ish3
];
143 shY
= shiftvec
[ish3
+1];
144 shZ
= shiftvec
[ish3
+2];
145 cj4_ind0
= nbln
.cj4_ind_start
;
146 cj4_ind1
= nbln
.cj4_ind_end
;
151 if (nbln
.shift
== CENTRAL
&&
152 nbl
->cj4
[cj4_ind0
].cj
[0] == sci
*c_numClPerSupercl
)
154 /* we have the diagonal:
155 * add the charge self interaction energy term
157 for (im
= 0; im
< c_numClPerSupercl
; im
++)
159 ci
= sci
*c_numClPerSupercl
+ im
;
160 for (ic
= 0; ic
< c_clSize
; ic
++)
162 ia
= ci
*c_clSize
+ ic
;
163 iq
= x
[ia
*nbat
->xstride
+3];
169 vctot
*= -facel
*0.5*iconst
->c_rf
;
173 /* last factor 1/sqrt(pi) */
174 vctot
*= -facel
*iconst
->ewaldcoeff_q
*M_1_SQRTPI
;
178 for (cj4_ind
= cj4_ind0
; (cj4_ind
< cj4_ind1
); cj4_ind
++)
180 excl
[0] = &nbl
->excl
[nbl
->cj4
[cj4_ind
].imei
[0].excl_ind
];
181 excl
[1] = &nbl
->excl
[nbl
->cj4
[cj4_ind
].imei
[1].excl_ind
];
183 for (jm
= 0; jm
< c_nbnxnGpuJgroupSize
; jm
++)
185 cj
= nbl
->cj4
[cj4_ind
].cj
[jm
];
187 for (im
= 0; im
< c_numClPerSupercl
; im
++)
189 /* We're only using the first imask,
190 * but here imei[1].imask is identical.
192 if ((nbl
->cj4
[cj4_ind
].imei
[0].imask
>> (jm
*c_numClPerSupercl
+ im
)) & 1)
194 gmx_bool within_rlist
;
196 ci
= sci
*c_numClPerSupercl
+ im
;
198 within_rlist
= FALSE
;
200 for (ic
= 0; ic
< c_clSize
; ic
++)
202 ia
= ci
*c_clSize
+ ic
;
204 is
= ia
*nbat
->xstride
;
205 ifs
= ia
*nbat
->fstride
;
210 nti
= ntype
*2*type
[ia
];
216 for (jc
= 0; jc
< c_clSize
; jc
++)
218 ja
= cj
*c_clSize
+ jc
;
220 if (nbln
.shift
== CENTRAL
&&
221 ci
== cj
&& ja
<= ia
)
226 constexpr int clusterPerSplit
= c_nbnxnGpuClusterSize
/c_nbnxnGpuClusterpairSplit
;
227 int_bit
= static_cast<real
>((excl
[jc
/clusterPerSplit
]->pair
[(jc
& (clusterPerSplit
- 1))*c_clSize
+ ic
]
228 >> (jm
*c_numClPerSupercl
+ im
)) & 1);
230 js
= ja
*nbat
->xstride
;
231 jfs
= ja
*nbat
->fstride
;
238 rsq
= dx
*dx
+ dy
*dy
+ dz
*dz
;
248 if (type
[ia
] != ntype
-1 && type
[ja
] != ntype
-1)
253 // Ensure distance do not become so small that r^-12 overflows
254 rsq
= std::max(rsq
, NBNXN_MIN_RSQ
);
256 rinv
= gmx::invsqrt(rsq
);
263 krsq
= iconst
->k_rf
*rsq
;
264 fscal
= qq
*(int_bit
*rinv
- 2*krsq
)*rinvsq
;
265 if (forceFlags
.computeEnergy
)
267 vcoul
= qq
*(int_bit
*rinv
+ krsq
- iconst
->c_rf
);
273 rt
= r
*iconst
->coulombEwaldTables
->scale
;
274 n0
= static_cast<int>(rt
);
275 eps
= rt
- static_cast<real
>(n0
);
277 fexcl
= (1 - eps
)*Ftab
[n0
] + eps
*Ftab
[n0
+1];
279 fscal
= qq
*(int_bit
*rinvsq
- fexcl
)*rinv
;
281 if (forceFlags
.computeEnergy
)
283 vcoul
= qq
*((int_bit
- std::erf(iconst
->ewaldcoeff_q
*r
))*rinv
- int_bit
*iconst
->sh_ewald
);
289 tj
= nti
+ 2*type
[ja
];
291 /* Vanilla Lennard-Jones cutoff */
293 c12
= vdwparam
[tj
+1];
295 rinvsix
= int_bit
*rinvsq
*rinvsq
*rinvsq
;
296 Vvdw_disp
= c6
*rinvsix
;
297 Vvdw_rep
= c12
*rinvsix
*rinvsix
;
298 fscal
+= (Vvdw_rep
- Vvdw_disp
)*rinvsq
;
300 if (forceFlags
.computeEnergy
)
305 (Vvdw_rep
+ int_bit
*c12
*iconst
->repulsion_shift
.cpot
)/12 -
306 (Vvdw_disp
+ int_bit
*c6
*iconst
->dispersion_shift
.cpot
)/6;
324 fshift
[ish3
] = fshift
[ish3
] + fix
;
325 fshift
[ish3
+1] = fshift
[ish3
+1] + fiy
;
326 fshift
[ish3
+2] = fshift
[ish3
+2] + fiz
;
328 /* Count in half work-units.
329 * In CUDA one work-unit is 2 warps.
331 if ((ic
+1) % (c_clSize
/c_nbnxnGpuClusterpairSplit
) == 0)
341 within_rlist
= FALSE
;
350 if (forceFlags
.computeEnergy
)
353 Vc
[ggid
] = Vc
[ggid
] + vctot
;
354 Vvdw
[ggid
] = Vvdw
[ggid
] + Vvdwtot
;
360 fprintf(debug
, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
361 nbl
->na_ci
, nbl
->na_ci
,
362 nhwu
, nhwu_pruned
, nhwu_pruned
/static_cast<double>(nhwu
));
363 fprintf(debug
, "generic kernel pair interactions: %d\n",
364 nhwu
*nbl
->na_ci
/2*nbl
->na_ci
);
365 fprintf(debug
, "generic kernel post-prune pair interactions: %d\n",
366 nhwu_pruned
*nbl
->na_ci
/2*nbl
->na_ci
);
367 fprintf(debug
, "generic kernel non-zero pair interactions: %d\n",
369 fprintf(debug
, "ratio non-zero/post-prune pair interactions: %4.2f\n",
370 npair_tot
/static_cast<double>(nhwu_pruned
*gmx::exactDiv(nbl
->na_ci
, 2)*nbl
->na_ci
));