From 2aa05015eeb6d3352d0871958a7903ecb2fabd8d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Mon, 20 Mar 2017 00:47:32 +0100 Subject: [PATCH] Conditional tweak in the nonbonded GPU kernels GPU compilers miss an easy optimization of a loop invariant in the inner-lop conditional. Precomputing part of the conditional together with using bitwise instead of logical and/or improves performance with most compilers by up to 5%. Change-Id: I3ba0b9025b11af3d8465e0d26ca69a78e32a0ece --- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 11 +++++++---- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh | 11 +++++++---- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh | 11 +++++++---- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh | 11 +++++++---- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh | 11 +++++++---- 5 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 6209493d54..e26952aab6 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -342,6 +342,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif /* CALC_ENERGIES */ +#ifdef EXCLUSION_FORCES + const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); +#endif + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z) { @@ -354,7 +358,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif { /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize) + if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize)) { cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi]; } @@ -415,10 +419,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES - if (r2 < rcoulomb_sq * - (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) + if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if (r2 < rcoulomb_sq * int_bit) + if ((r2 < rcoulomb_sq) * int_bit) #endif { /* load the rest of the i-atom parameters */ diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh index 6e08a8fef1..6bd54f4be5 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh @@ -284,6 +284,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif /* CALC_ENERGIES */ +#ifdef EXCLUSION_FORCES + const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); +#endif + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) { @@ -296,7 +300,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif { /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize) + if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize)) { cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi]; } @@ -359,10 +363,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES - if (r2 < rcoulomb_sq * - (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) + if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if (r2 < rcoulomb_sq * int_bit) + if ((r2 < rcoulomb_sq) * int_bit) #endif { /* load the rest of the i-atom parameters */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh index cb7f80bc9e..67467f4882 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh @@ -316,6 +316,10 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* CALC_ENERGIES */ +#ifdef EXCLUSION_FORCES + const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); +#endif + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) { @@ -328,7 +332,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif { /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE) + if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE)) { cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi]; } @@ -401,10 +405,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES - if (r2 < rcoulomb_sq * - (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) + if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if (r2 < rcoulomb_sq * int_bit) + if ((r2 < rcoulomb_sq) * int_bit) #endif { /* load the rest of the i-atom parameters */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh index 59c1c18d22..8d265987a7 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh @@ -319,6 +319,10 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* CALC_ENERGIES */ +#ifdef EXCLUSION_FORCES + const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); +#endif + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) { @@ -331,7 +335,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif { /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE) + if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE)) { cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi]; } @@ -404,10 +408,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES - if (r2 < rcoulomb_sq * - (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) + if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if (r2 < rcoulomb_sq * int_bit) + if ((r2 < rcoulomb_sq) * int_bit) #endif { /* load the rest of the i-atom parameters */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh index b683f22bea..baace28651 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh @@ -313,6 +313,10 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif /* CALC_ENERGIES */ +#ifdef EXCLUSION_FORCES + const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); +#endif + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ for (j4 = cij4_start; j4 < cij4_end; j4++) { @@ -325,7 +329,7 @@ __global float *restrict fshift, /* stores float3 values */ /* OU #endif { /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE) + if ((tidxj == 0 | tidxj == 4) & (tidxi < NBNXN_GPU_JGROUP_SIZE)) { cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi]; } @@ -394,10 +398,9 @@ __global float *restrict fshift, /* stores float3 values */ /* OU /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES - if (r2 < rcoulomb_sq * - (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) + if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if (r2 < rcoulomb_sq * int_bit) + if ((r2 < rcoulomb_sq) * int_bit) #endif { /* load the rest of the i-atom parameters */ -- 2.11.4.GIT