From d1a07e21a9e92e374ebd40da2c39ff223a9138e1 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Thu, 25 Feb 2016 20:27:36 +0100
Subject: [PATCH] Remove epsfac from GPU kernel inner-loops

Multiplying the i-charge instead of the j-charge with epsfac in
the GPU kernels removes a flop form the inner-loop. On Maxwell with
CUDA the gain is 2-3% in the force kernels and 3-4% in the energy
kernels (which is more than a flop, probably a register less is used).
There is a single additional division in the energy kernel, but the
gain more than compensates for this.

Change-Id: I6924b32f3f61d3b7bbe532f95361b7fefb577609
---
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh   | 20 ++++++++++++--------
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh | 19 +++++++++++--------
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh      | 19 +++++++++++--------
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh      | 19 +++++++++++--------
 4 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index c223e14622..b482e529ef 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -265,7 +265,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
         /* Pre-load i-atom x and q into shared memory */
         ci = sci * c_numClPerSupercl + tidxj;
         ai = ci * c_clSize + tidxi;
-        xqib[tidxj * c_clSize + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+
+        xqbuf    = xq[ai] + shift_vec[nb_sci.shift];
+        xqbuf.w *= nbparam.epsfac;
+        xqib[tidxj * c_clSize + tidxi] = xqbuf;
 #ifdef IATYPE_SHMEM
         /* Pre-load the i-atom types into shared memory */
         atib[tidxj * c_clSize + tidxi] = atom_types[ai];
@@ -317,17 +320,18 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif  /* LJ_EWALD */
 
 #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= c_clSize*NTHREAD_Z;
+        /* Correct for epsfac^2 due to adding qi^2 */
+        E_el /= nbparam.epsfac*c_clSize*NTHREAD_Z;
 #if defined EL_RF || defined EL_CUTOFF
-        E_el *= -nbparam.epsfac*0.5f*c_rf;
+        E_el *= -0.5f*c_rf;
 #else
-        E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+        E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 #endif
-#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+#endif                                  /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
     }
-#endif                                                 /* EXCLUSION_FORCES */
+#endif                                  /* EXCLUSION_FORCES */
 
-#endif                                                 /* CALC_ENERGIES */
+#endif                                  /* CALC_ENERGIES */
 
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
@@ -365,7 +369,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                     /* load j atom data */
                     xqbuf   = xq[aj];
                     xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
-                    qj_f    = nbparam.epsfac * xqbuf.w;
+                    qj_f    = xqbuf.w;
                     typej   = atom_types[aj];
 
                     fcj_buf = make_float3(0.0f);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
index 75a88244be..20f77b0dc9 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
@@ -209,7 +209,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     ci = sci * NCL_PER_SUPERCL + tidxj;
     ai = ci * CL_SIZE + tidxi;
 
-    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf.w *= nbparam->epsfac;
+    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
 
 #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
     /* Pre-load the i-atom types into shared memory */
@@ -259,17 +261,18 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif  /* LJ_EWALD */
 
 #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= CL_SIZE;
+        /* Correct for epsfac^2 due to adding qi^2 */
+        E_el /= nbparam->epsfac*CL_SIZE;
 #if defined EL_RF || defined EL_CUTOFF
-        E_el *= -nbparam->epsfac*0.5f*c_rf;
+        E_el *= -0.5f*c_rf;
 #else
-        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+        E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 #endif
-#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+#endif                                  /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
     }
-#endif                                                 /* EXCLUSION_FORCES */
+#endif                                  /* EXCLUSION_FORCES */
 
-#endif                                                 /* CALC_ENERGIES */
+#endif                                  /* CALC_ENERGIES */
 
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
@@ -309,7 +312,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                     /* load j atom data */
                     xqbuf   = xq[aj];
                     xj      = (float3)(xqbuf.xyz);
-                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    qj_f    = xqbuf.w;
                     typej   = atom_types[aj];
 
                     fcj_buf = (float3)(0.0f);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
index 75a88244be..3fdf56c55d 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
@@ -209,7 +209,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     ci = sci * NCL_PER_SUPERCL + tidxj;
     ai = ci * CL_SIZE + tidxi;
 
-    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf.w *= nbparam->epsfac;
+    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
 
 #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
     /* Pre-load the i-atom types into shared memory */
@@ -259,17 +261,18 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif  /* LJ_EWALD */
 
 #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= CL_SIZE;
+        /* Correct for epsfac^2 due to adding qi^2 */
+        E_el /= nbparam->epsfac*CL_SIZE;
 #if defined EL_RF || defined EL_CUTOFF
-        E_el *= -nbparam->epsfac*0.5f*c_rf;
+        E_el *= -0.5f*c_rf;
 #else
-        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+        E_el *= -M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 #endif
-#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+#endif                             /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
     }
-#endif                                                 /* EXCLUSION_FORCES */
+#endif                             /* EXCLUSION_FORCES */
 
-#endif                                                 /* CALC_ENERGIES */
+#endif                             /* CALC_ENERGIES */
 
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
@@ -309,7 +312,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                     /* load j atom data */
                     xqbuf   = xq[aj];
                     xj      = (float3)(xqbuf.xyz);
-                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    qj_f    = xqbuf.w;
                     typej   = atom_types[aj];
 
                     fcj_buf = (float3)(0.0f);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
index af10b2fd02..40d867bade 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
@@ -206,7 +206,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     ci = sci * NCL_PER_SUPERCL + tidxj;
     ai = ci * CL_SIZE + tidxi;
 
-    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf.w *= nbparam->epsfac;
+    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
 
 #ifdef IATYPE_SHMEM //NOTE: Should not be defined. Used with CUDA > 3.0 Re-evaluate the effect of preloading at a suitable time.
     /* Pre-load the i-atom types into shared memory */
@@ -256,17 +258,18 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif  /* LJ_EWALD */
 
 #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= CL_SIZE;
+        /* Correct for epsfac^2 due to adding qi^2 */
+        E_el /= nbparam->epsfac*CL_SIZE;
 #if defined EL_RF || defined EL_CUTOFF
-        E_el *= -nbparam->epsfac*0.5f*c_rf;
+        E_el *= -0.5f*c_rf;
 #else
-        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+        E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 #endif
-#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+#endif                                  /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
     }
-#endif                                                 /* EXCLUSION_FORCES */
+#endif                                  /* EXCLUSION_FORCES */
 
-#endif                                                 /* CALC_ENERGIES */
+#endif                                  /* CALC_ENERGIES */
 
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
@@ -302,7 +305,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                     /* load j atom data */
                     xqbuf   = xq[aj];
                     xj      = (float3)(xqbuf.xyz);
-                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    qj_f    = xqbuf.w;
                     typej   = atom_types[aj];
 
                     fcj_buf = (float3)(0.0f);
-- 
2.11.4.GIT