From 2aa05015eeb6d3352d0871958a7903ecb2fabd8d Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Mon, 20 Mar 2017 00:47:32 +0100
Subject: [PATCH] Conditional tweak in the nonbonded GPU kernels

GPU compilers miss an easy optimization of a loop invariant in the
inner-lop conditional. Precomputing part of the conditional together
with using bitwise instead of logical and/or improves performance with
most compilers by up to 5%.

Change-Id: I3ba0b9025b11af3d8465e0d26ca69a78e32a0ece
---
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh       | 11 +++++++----
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh | 11 +++++++----
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh     | 11 +++++++----
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh  | 11 +++++++----
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh  | 11 +++++++----
 5 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 6209493d54..e26952aab6 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -342,6 +342,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
     {
@@ -354,7 +358,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
@@ -415,10 +419,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
index 6e08a8fef1..6bd54f4be5 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
@@ -284,6 +284,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
     {
@@ -296,7 +300,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
@@ -359,10 +363,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
index cb7f80bc9e..67467f4882 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
@@ -316,6 +316,10 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
     {
@@ -328,7 +332,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE))
             {
                 cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
             }
@@ -401,10 +405,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
index 59c1c18d22..8d265987a7 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
@@ -319,6 +319,10 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
     {
@@ -331,7 +335,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE))
             {
                 cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
             }
@@ -404,10 +408,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
index b683f22bea..baace28651 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
@@ -313,6 +313,10 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
     {
@@ -325,7 +329,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE))
             {
                 cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
             }
@@ -394,10 +398,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
-- 
2.11.4.GIT