From d1a07e21a9e92e374ebd40da2c39ff223a9138e1 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Thu, 25 Feb 2016 20:27:36 +0100 Subject: [PATCH] Remove epsfac from GPU kernel inner-loops Multiplying the i-charge instead of the j-charge with epsfac in the GPU kernels removes a flop form the inner-loop. On Maxwell with CUDA the gain is 2-3% in the force kernels and 3-4% in the energy kernels (which is more than a flop, probably a register less is used). There is a single additional division in the energy kernel, but the gain more than compensates for this. Change-Id: I6924b32f3f61d3b7bbe532f95361b7fefb577609 --- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 20 ++++++++++++-------- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh | 19 +++++++++++-------- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh | 19 +++++++++++-------- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh | 19 +++++++++++-------- 4 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index c223e14622..b482e529ef 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -265,7 +265,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) /* Pre-load i-atom x and q into shared memory */ ci = sci * c_numClPerSupercl + tidxj; ai = ci * c_clSize + tidxi; - xqib[tidxj * c_clSize + tidxi] = xq[ai] + shift_vec[nb_sci.shift]; + + xqbuf = xq[ai] + shift_vec[nb_sci.shift]; + xqbuf.w *= nbparam.epsfac; + xqib[tidxj * c_clSize + tidxi] = xqbuf; #ifdef IATYPE_SHMEM /* Pre-load the i-atom types into shared memory */ atib[tidxj * c_clSize + tidxi] = atom_types[ai]; @@ -317,17 +320,18 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif /* LJ_EWALD */ #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - E_el /= c_clSize*NTHREAD_Z; + /* Correct for epsfac^2 due to adding qi^2 */ + E_el /= nbparam.epsfac*c_clSize*NTHREAD_Z; #if defined EL_RF || defined EL_CUTOFF - E_el *= -nbparam.epsfac*0.5f*c_rf; + E_el *= -0.5f*c_rf; #else - E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ + E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif -#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ +#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ } -#endif /* EXCLUSION_FORCES */ +#endif /* EXCLUSION_FORCES */ -#endif /* CALC_ENERGIES */ +#endif /* CALC_ENERGIES */ /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z) @@ -365,7 +369,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) /* load j atom data */ xqbuf = xq[aj]; xj = make_float3(xqbuf.x, xqbuf.y, xqbuf.z); - qj_f = nbparam.epsfac * xqbuf.w; + qj_f = xqbuf.w; typej = atom_types[aj]; fcj_buf = make_float3(0.0f); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh index 75a88244be..20f77b0dc9 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh @@ -209,7 +209,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU ci = sci * NCL_PER_SUPERCL + tidxj; ai = ci * CL_SIZE + tidxi; - xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf.w *= nbparam->epsfac; + xqib[tidxj * CL_SIZE + tidxi] = xqbuf; #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time. /* Pre-load the i-atom types into shared memory */ @@ -259,17 +261,18 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* LJ_EWALD */ #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - E_el /= CL_SIZE; + /* Correct for epsfac^2 due to adding qi^2 */ + E_el /= nbparam->epsfac*CL_SIZE; #if defined EL_RF || defined EL_CUTOFF - E_el *= -nbparam->epsfac*0.5f*c_rf; + E_el *= -0.5f*c_rf; #else - E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ + E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif -#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ +#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ } -#endif /* EXCLUSION_FORCES */ +#endif /* EXCLUSION_FORCES */ -#endif /* CALC_ENERGIES */ +#endif /* CALC_ENERGIES */ /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) @@ -309,7 +312,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* load j atom data */ xqbuf = xq[aj]; xj = (float3)(xqbuf.xyz); - qj_f = nbparam->epsfac * xqbuf.w; + qj_f = xqbuf.w; typej = atom_types[aj]; fcj_buf = (float3)(0.0f); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh index 75a88244be..3fdf56c55d 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh @@ -209,7 +209,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU ci = sci * NCL_PER_SUPERCL + tidxj; ai = ci * CL_SIZE + tidxi; - xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf.w *= nbparam->epsfac; + xqib[tidxj * CL_SIZE + tidxi] = xqbuf; #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time. /* Pre-load the i-atom types into shared memory */ @@ -259,17 +261,18 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* LJ_EWALD */ #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - E_el /= CL_SIZE; + /* Correct for epsfac^2 due to adding qi^2 */ + E_el /= nbparam->epsfac*CL_SIZE; #if defined EL_RF || defined EL_CUTOFF - E_el *= -nbparam->epsfac*0.5f*c_rf; + E_el *= -0.5f*c_rf; #else - E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ + E_el *= -M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif -#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ +#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ } -#endif /* EXCLUSION_FORCES */ +#endif /* EXCLUSION_FORCES */ -#endif /* CALC_ENERGIES */ +#endif /* CALC_ENERGIES */ /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) @@ -309,7 +312,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* load j atom data */ xqbuf = xq[aj]; xj = (float3)(xqbuf.xyz); - qj_f = nbparam->epsfac * xqbuf.w; + qj_f = xqbuf.w; typej = atom_types[aj]; fcj_buf = (float3)(0.0f); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh index af10b2fd02..40d867bade 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh @@ -206,7 +206,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU ci = sci * NCL_PER_SUPERCL + tidxj; ai = ci * CL_SIZE + tidxi; - xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf.w *= nbparam->epsfac; + xqib[tidxj * CL_SIZE + tidxi] = xqbuf; #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Used with CUDA > 3.0 Re-evaluate the effect of preloading at a suitable time. /* Pre-load the i-atom types into shared memory */ @@ -256,17 +258,18 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* LJ_EWALD */ #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - E_el /= CL_SIZE; + /* Correct for epsfac^2 due to adding qi^2 */ + E_el /= nbparam->epsfac*CL_SIZE; #if defined EL_RF || defined EL_CUTOFF - E_el *= -nbparam->epsfac*0.5f*c_rf; + E_el *= -0.5f*c_rf; #else - E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ + E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif -#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ +#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ } -#endif /* EXCLUSION_FORCES */ +#endif /* EXCLUSION_FORCES */ -#endif /* CALC_ENERGIES */ +#endif /* CALC_ENERGIES */ /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) @@ -302,7 +305,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* load j atom data */ xqbuf = xq[aj]; xj = (float3)(xqbuf.xyz); - qj_f = nbparam->epsfac * xqbuf.w; + qj_f = xqbuf.w; typej = atom_types[aj]; fcj_buf = (float3)(0.0f); -- 2.11.4.GIT