2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * Utility constant and function declaration for the CUDA non-bonded kernels.
39 * This header should be included once at the top level, just before the
40 * kernels are included (has to be preceded by nbnxn_cuda_types.h).
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \ingroup module_mdlib
47 /* Note that floating-point constants in CUDA code should be suffixed
48 * with f (e.g. 0.5f), to stop the compiler producing intermediate
49 * code that is in double precision.
52 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
53 #include "gromacs/gpu_utils/vectype_ops.cuh"
55 #include "nbnxn_cuda_types.h"
57 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
58 #define NBNXN_CUDA_KERNEL_UTILS_CUH
60 /* Use texture objects if supported by the target hardware. */
61 #if GMX_PTX_ARCH >= 300
62 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
66 #define WARP_SIZE_POW2_EXPONENT (5)
67 #define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
68 #define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
69 #define FBUF_STRIDE (CL_SIZE_SQ)
71 #define ONE_SIXTH_F 0.16666667f
72 #define ONE_TWELVETH_F 0.08333333f
74 /* With multiple compilation units this ensures that texture refs are available
75 in the the kernels' compilation units. */
76 #if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
77 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
78 extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
80 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
81 extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
83 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
84 extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
85 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
87 /*! Apply force switch, force + energy version. */
88 static inline __device__
89 void calculate_force_switch_F(const cu_nbparam_t nbparam,
98 /* force switch constants */
99 float disp_shift_V2 = nbparam.dispersion_shift.c2;
100 float disp_shift_V3 = nbparam.dispersion_shift.c3;
101 float repu_shift_V2 = nbparam.repulsion_shift.c2;
102 float repu_shift_V3 = nbparam.repulsion_shift.c3;
105 r_switch = r - nbparam.rvdw_switch;
106 r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
109 -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
110 c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
113 /*! Apply force switch, force-only version. */
114 static inline __device__
115 void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
125 /* force switch constants */
126 float disp_shift_V2 = nbparam.dispersion_shift.c2;
127 float disp_shift_V3 = nbparam.dispersion_shift.c3;
128 float repu_shift_V2 = nbparam.repulsion_shift.c2;
129 float repu_shift_V3 = nbparam.repulsion_shift.c3;
131 float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
132 float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
133 float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
134 float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
137 r_switch = r - nbparam.rvdw_switch;
138 r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
141 -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
142 c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
144 c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
145 c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
148 /*! Apply potential switch, force-only version. */
149 static inline __device__
150 void calculate_potential_switch_F(const cu_nbparam_t nbparam,
161 /* potential switch constants */
162 float switch_V3 = nbparam.vdw_switch.c3;
163 float switch_V4 = nbparam.vdw_switch.c4;
164 float switch_V5 = nbparam.vdw_switch.c5;
165 float switch_F2 = 3*nbparam.vdw_switch.c3;
166 float switch_F3 = 4*nbparam.vdw_switch.c4;
167 float switch_F4 = 5*nbparam.vdw_switch.c5;
170 r_switch = r - nbparam.rvdw_switch;
172 /* Unlike in the F+E kernel, conditional is faster here */
175 sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
176 dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
178 *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
182 /*! Apply potential switch, force + energy version. */
183 static inline __device__
184 void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
195 /* potential switch constants */
196 float switch_V3 = nbparam.vdw_switch.c3;
197 float switch_V4 = nbparam.vdw_switch.c4;
198 float switch_V5 = nbparam.vdw_switch.c5;
199 float switch_F2 = 3*nbparam.vdw_switch.c3;
200 float switch_F3 = 4*nbparam.vdw_switch.c4;
201 float switch_F4 = 5*nbparam.vdw_switch.c5;
204 r_switch = r - nbparam.rvdw_switch;
205 r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
207 /* Unlike in the F-only kernel, masking is faster here */
208 sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
209 dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
211 *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
215 /*! Calculate LJ-PME grid force contribution with
216 * geometric combination rule.
218 static inline __device__
219 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
228 float c6grid, inv_r6_nm, cr2, expmcr2, poly;
231 c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
233 c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
234 #endif /* USE_TEXOBJ */
236 /* Recalculate inv_r6 without exclusion mask */
237 inv_r6_nm = inv_r2*inv_r2*inv_r2;
239 expmcr2 = expf(-cr2);
240 poly = 1.0f + cr2 + 0.5f*cr2*cr2;
242 /* Subtract the grid force from the total LJ force */
243 *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
246 /*! Calculate LJ-PME grid force + energy contribution with
247 * geometric combination rule.
249 static inline __device__
250 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
261 float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
264 c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
266 c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
267 #endif /* USE_TEXOBJ */
269 /* Recalculate inv_r6 without exclusion mask */
270 inv_r6_nm = inv_r2*inv_r2*inv_r2;
272 expmcr2 = expf(-cr2);
273 poly = 1.0f + cr2 + 0.5f*cr2*cr2;
275 /* Subtract the grid force from the total LJ force */
276 *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
278 /* Shift should be applied only to real LJ pairs */
279 sh_mask = nbparam.sh_lj_ewald*int_bit;
280 *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
283 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
284 * Lorentz-Berthelot combination rule.
285 * We use a single F+E kernel with conditional because the performance impact
286 * of this is pretty small and LB on the CPU is anyway very slow.
288 static inline __device__
289 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
300 float c6grid, inv_r6_nm, cr2, expmcr2, poly;
301 float sigma, sigma2, epsilon;
303 /* sigma and epsilon are scaled to give 6*C6 */
305 sigma = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej );
306 epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
308 sigma = tex1Dfetch(nbfp_comb_texref, 2*typei ) + tex1Dfetch(nbfp_comb_texref, 2*typej );
309 epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
310 #endif /* USE_TEXOBJ */
311 sigma2 = sigma*sigma;
312 c6grid = epsilon*sigma2*sigma2*sigma2;
314 /* Recalculate inv_r6 without exclusion mask */
315 inv_r6_nm = inv_r2*inv_r2*inv_r2;
317 expmcr2 = expf(-cr2);
318 poly = 1.0f + cr2 + 0.5f*cr2*cr2;
320 /* Subtract the grid force from the total LJ force */
321 *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
327 /* Shift should be applied only to real LJ pairs */
328 sh_mask = nbparam.sh_lj_ewald*int_bit;
329 *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
333 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
334 * Original idea: from the OpenMM project
336 static inline __device__
337 float interpolate_coulomb_force_r(float r, float scale)
339 float normalized = scale * r;
340 int index = (int) normalized;
341 float fract2 = normalized - index;
342 float fract1 = 1.0f - fract2;
344 return fract1 * tex1Dfetch(coulomb_tab_texref, index)
345 + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
348 static inline __device__
349 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
350 float r, float scale)
352 float normalized = scale * r;
353 int index = (int) normalized;
354 float fract2 = normalized - index;
355 float fract1 = 1.0f - fract2;
357 return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
358 fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
361 /*! Calculate analytical Ewald correction term. */
362 static inline __device__
363 float pmecorrF(float z2)
365 const float FN6 = -1.7357322914161492954e-8f;
366 const float FN5 = 1.4703624142580877519e-6f;
367 const float FN4 = -0.000053401640219807709149f;
368 const float FN3 = 0.0010054721316683106153f;
369 const float FN2 = -0.019278317264888380590f;
370 const float FN1 = 0.069670166153766424023f;
371 const float FN0 = -0.75225204789749321333f;
373 const float FD4 = 0.0011193462567257629232f;
374 const float FD3 = 0.014866955030185295499f;
375 const float FD2 = 0.11583842382862377919f;
376 const float FD1 = 0.50736591960530292870f;
377 const float FD0 = 1.0f;
380 float polyFN0, polyFN1, polyFD0, polyFD1;
384 polyFD0 = FD4*z4 + FD2;
385 polyFD1 = FD3*z4 + FD1;
386 polyFD0 = polyFD0*z4 + FD0;
387 polyFD0 = polyFD1*z2 + polyFD0;
389 polyFD0 = 1.0f/polyFD0;
391 polyFN0 = FN6*z4 + FN4;
392 polyFN1 = FN5*z4 + FN3;
393 polyFN0 = polyFN0*z4 + FN2;
394 polyFN1 = polyFN1*z4 + FN1;
395 polyFN0 = polyFN0*z4 + FN0;
396 polyFN0 = polyFN1*z2 + polyFN0;
398 return polyFN0*polyFD0;
401 /*! Final j-force reduction; this generic implementation works with
402 * arbitrary array sizes.
404 static inline __device__
405 void reduce_force_j_generic(float *f_buf, float3 *fout,
406 int tidxi, int tidxj, int aidx)
411 for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
413 f += f_buf[FBUF_STRIDE * tidxi + j];
416 atomicAdd((&fout[aidx].x)+tidxi, f);
420 /*! Final j-force reduction; this implementation only with power of two
421 * array sizes and with sm >= 3.0
423 #if GMX_PTX_ARCH >= 300
424 static inline __device__
425 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
428 f.x += __shfl_down(f.x, 1);
429 f.y += __shfl_up (f.y, 1);
430 f.z += __shfl_down(f.z, 1);
437 f.x += __shfl_down(f.x, 2);
438 f.z += __shfl_up (f.z, 2);
445 f.x += __shfl_down(f.x, 4);
449 atomicAdd((&fout[aidx].x) + tidxi, f.x);
454 /*! Final i-force reduction; this generic implementation works with
455 * arbitrary array sizes.
456 * TODO: add the tidxi < 3 trick
458 static inline __device__
459 void reduce_force_i_generic(float *f_buf, float3 *fout,
460 float *fshift_buf, bool bCalcFshift,
461 int tidxi, int tidxj, int aidx)
466 for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
468 f += f_buf[tidxj * FBUF_STRIDE + j];
471 atomicAdd(&fout[aidx].x + tidxj, f);
480 /*! Final i-force reduction; this implementation works only with power of two
483 static inline __device__
484 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
485 float *fshift_buf, bool bCalcFshift,
486 int tidxi, int tidxj, int aidx)
491 /* Reduce the initial CL_SIZE values for each i atom to half
492 * every step by using CL_SIZE * i threads.
493 * Can't just use i as loop variable because than nvcc refuses to unroll.
497 for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
502 f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
503 f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
504 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
509 /* i == 1, last reduction step, writing to global mem */
512 /* tidxj*FBUF_STRIDE selects x, y or z */
513 f = f_buf[tidxj * FBUF_STRIDE + tidxi] +
514 f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
516 atomicAdd(&(fout[aidx].x) + tidxj, f);
526 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
527 * on whether the size of the array to be reduced is power of two or not.
529 static inline __device__
530 void reduce_force_i(float *f_buf, float3 *f,
531 float *fshift_buf, bool bCalcFshift,
532 int tidxi, int tidxj, int ai)
534 if ((CL_SIZE & (CL_SIZE - 1)))
536 reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
540 reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
544 /*! Final i-force reduction; this implementation works only with power of two
545 * array sizes and with sm >= 3.0
547 #if GMX_PTX_ARCH >= 300
548 static inline __device__
549 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
550 float *fshift_buf, bool bCalcFshift,
553 fin.x += __shfl_down(fin.x, CL_SIZE);
554 fin.y += __shfl_up (fin.y, CL_SIZE);
555 fin.z += __shfl_down(fin.z, CL_SIZE);
562 fin.x += __shfl_down(fin.x, 2*CL_SIZE);
563 fin.z += __shfl_up (fin.z, 2*CL_SIZE);
570 /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
573 atomicAdd(&fout[aidx].x + (tidxj & ~4), fin.x);
577 *fshift_buf += fin.x;
583 /*! Energy reduction; this implementation works only with power of two
586 static inline __device__
587 void reduce_energy_pow2(volatile float *buf,
588 float *e_lj, float *e_el,
596 /* Can't just use i as loop variable because than nvcc refuses to unroll. */
598 for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
602 buf[ tidx] += buf[ tidx + i];
603 buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
608 // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
609 // memory locations to make sense
610 /* last reduction step, writing to global mem */
613 e1 = buf[ tidx] + buf[ tidx + i];
614 e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
621 /*! Energy reduction; this implementation works only with power of two
622 * array sizes and with sm >= 3.0
624 #if GMX_PTX_ARCH >= 300
625 static inline __device__
626 void reduce_energy_warp_shfl(float E_lj, float E_el,
627 float *e_lj, float *e_el,
634 for (i = 0; i < 5; i++)
636 E_lj += __shfl_down(E_lj, sh);
637 E_el += __shfl_down(E_el, sh);
641 /* The first thread in the warp writes the reduced energies */
642 if (tidx == 0 || tidx == WARP_SIZE)
644 atomicAdd(e_lj, E_lj);
645 atomicAdd(e_el, E_el);
648 #endif /* GMX_PTX_ARCH */
652 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */