Decouple task assignment from task execution
[gromacs.git] / src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define OpenCL implementation of nbnxn_gpu.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_mdlib
 */
#include "gmxpre.h"

#include <assert.h>
#include <stdlib.h>

#if defined(_MSVC)
#include <limits>
#endif

#include "thread_mpi/atomic.h"

#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"
#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
#define USE_TEXOBJ
#endif

/*! \brief Convenience constants */
//@{
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize          = c_nbnxnGpuClusterSize;
//@}

/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
//@{
static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
//@}

/* Uncomment this define to enable kernel debugging */
//#define DEBUG_OCL

/*! \brief Specifies which kernel run to debug */
#define DEBUG_RUN_STEP 2
/*! \brief Validates the input global work size parameter.
 */
static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
{
    cl_uint device_size_t_size_bits;
    cl_uint host_size_t_size_bits;

    assert(dinfo);

    /* Each component of a global_work_size must not exceed the range given by the
       sizeof(device size_t) for the device on which the kernel execution will
       be enqueued. See:
       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
     */
    device_size_t_size_bits = dinfo->adress_bits;
    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);

    /* If sizeof(host size_t) <= sizeof(device size_t)
            => global_work_size components will always be valid
       else
            => get device limit for global work size and
               compare it against each component of global_work_size.
     */
    if (host_size_t_size_bits > device_size_t_size_bits)
    {
        size_t device_limit;

        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;

        for (int i = 0; i < work_dim; i++)
        {
            if (global_work_size[i] > device_limit)
            {
                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
                          "device capabilities. Global work size limit exceeded (%d > %d)!",
                          global_work_size[i], device_limit);
            }
        }
    }
}
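
/* Editor's note (illustrative example, not part of the original source): a device
 * reporting 32-bit address bits yields device_limit = 2^32 - 1, so on a 64-bit host
 * the check above rejects any NDRange dimension larger than ~4.29e9 work-items
 * instead of letting the launch silently overflow on the device. */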
/* Constant arrays listing non-bonded kernel function names. The arrays are
 * organized in 2-dim arrays by: electrostatics and VDW type.
 *
 * Note that the row- and column-order of kernel names has to match the
 * order of the corresponding enumerated electrostatics and vdw types, resp.,
 * defined in nbnxn_ocl_types.h.
 */
/*! \brief Force-only kernel function names. */
static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};
/*! \brief Force + energy kernel function names. */
static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};
/*! \brief Force + pruning kernel function names. */
static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};
/*! \brief Force + energy + pruning kernel function names. */
static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};
/*! \brief Return a pointer to the kernel version to be executed at the current step.
 *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 *  found in the cache, it will be created and the cache will be updated.
 */
static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
                                            int              eeltype,
                                            int              evdwtype,
                                            bool             bDoEne,
                                            bool             bDoPrune)
{
    const char *kernel_name_to_run;
    cl_kernel  *kernel_ptr;
    cl_int      cl_error;

    assert(eeltype  < eelOclNR);
    assert(evdwtype < evdwOclNR);

    if (bDoEne)
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
        }
    }
    else
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
        }
    }

    if (NULL == kernel_ptr[0])
    {
        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
        assert(cl_error == CL_SUCCESS);
        // TODO: handle errors
    }

    return *kernel_ptr;
}
/*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 */
static inline int calc_shmem_required(int  vdwType,
                                      bool bPrefetchLjParam)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
    /* cj in shared memory, for both warps separately */
    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
    if (bPrefetchLjParam)
    {
        if (useLjCombRule(vdwType))
        {
            /* i-atom LJ combination parameters in shared memory */
            shmem += c_numClPerSupercl * c_clSize * 2 * sizeof(float); /* atib abused for ljcp, float2 */
        }
        else
        {
            /* i-atom types in shared memory */
            shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
        }
    }
    /* force reduction buffers in shared memory */
    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
    /* Warp vote. In fact it must be * number of warps in block.. */
    shmem += sizeof(cl_uint) * 2;                     /* warp_any */

    return shmem;
}
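
/* Editor's note (worked example, not part of the original source; assumes the usual
 * GPU values c_numClPerSupercl == 8, c_clSize == 8, c_nbnxnGpuJgroupSize == 4):
 *   xqib     : 8*8*4*sizeof(float) = 1024 bytes
 *   cjs      : 2*4*sizeof(int)     =   32 bytes
 *   atib     : 8*8*sizeof(int)     =  256 bytes (512 bytes when LJ comb. params are prefetched)
 *   f_buf    : 8*8*3*sizeof(float) =  768 bytes
 *   warp_any : 2*sizeof(cl_uint)   =    8 bytes
 * i.e. roughly 2 KiB of local memory per work-group with type prefetching enabled. */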
/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 *
 * The device can't use the same data structures as the host for two main reasons:
 * - OpenCL restrictions (pointers are not accepted inside data structures)
 * - some host side fields are not needed for the OpenCL kernels.
 */
static void fillin_ocl_structures(cl_nbparam_t        *nbp,
                                  cl_nbparam_params_t *nbparams_params)
{
    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
    nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
    nbparams_params->c_rf              = nbp->c_rf;
    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
    nbparams_params->eeltype           = nbp->eeltype;
    nbparams_params->epsfac            = nbp->epsfac;
    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
    nbparams_params->ewald_beta        = nbp->ewald_beta;
    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
    nbparams_params->rlist_sq          = nbp->rlist_sq;
    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
    nbparams_params->sh_ewald          = nbp->sh_ewald;
    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
    nbparams_params->two_k_rf          = nbp->two_k_rf;
    nbparams_params->vdwtype           = nbp->vdwtype;
    nbparams_params->vdw_switch        = nbp->vdw_switch;
}
/*! \brief Waits for the commands associated with the input event to finish.
 *
 * Then it releases the event and sets it to 0.
 * Don't use this function when more than one wait will be issued for the event.
 */
void wait_ocl_event(cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Blocking wait for the event */
    cl_error = clWaitForEvents(1, ocl_event);
    assert(CL_SUCCESS == cl_error);

    /* Release event and reset it to 0 */
    cl_error = clReleaseEvent(*ocl_event);
    assert(CL_SUCCESS == cl_error);

    *ocl_event = 0;
}
/*! \brief Enqueues a wait for event completion.
 *
 * Then it releases the event and sets it to 0.
 * Don't use this function when more than one wait will be issued for the event.
 * Equivalent to Cuda Stream Sync. */
void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Enqueue wait */
#ifdef CL_VERSION_1_2
    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
#else
    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
#endif

    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
    cl_error = clReleaseEvent(*ocl_event);
    assert(CL_SUCCESS == cl_error);
    *ocl_event = 0;
}
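
/* Editor's note (usage sketch, not part of the original source): the intended pairing is
 * that the producing queue records an event (clEnqueueMarker / clEnqueueMarkerWithWaitList)
 * and the consuming queue then calls sync_ocl_event(consumer_queue, &event). This is the
 * pattern used below to order the non-local stream after the local misc_ops_and_local_H2D_done
 * point and the local stream after nonlocal_done. */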
/*! \brief Returns the duration in milliseconds for the command associated with the event.
 *
 * It then releases the event and sets it to 0.
 * Before calling this function, make sure the command has finished either by
 * calling clFinish or clWaitForEvents.
 * The function returns 0.0 if the input event, *ocl_event, is 0.
 * Don't use this function when more than one wait will be issued for the event.
 */
double ocl_event_elapsed_ms(cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;
    cl_ulong          start_ns, end_ns;
    double            elapsed_ms;

    elapsed_ms = 0.0;
    assert(NULL != ocl_event);

    if (*ocl_event)
    {
        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
                                           sizeof(cl_ulong), &start_ns, NULL);
        assert(CL_SUCCESS == cl_error);

        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
                                           sizeof(cl_ulong), &end_ns, NULL);
        assert(CL_SUCCESS == cl_error);

        clReleaseEvent(*ocl_event);
        *ocl_event = 0;

        elapsed_ms = (end_ns - start_ns) / 1000000.0;
    }

    return elapsed_ms;
}
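
/* Editor's note (assumption, not from the original source): clGetEventProfilingInfo()
 * only returns valid CL_PROFILING_COMMAND_START/END timestamps for command queues
 * created with CL_QUEUE_PROFILING_ENABLE; presumably the bDoTime/nb->bDoTime logic
 * elsewhere in this module ensures events are only collected on such queues. */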
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, so it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                             const struct nbnxn_atomdata_t *nbatom,
                             int                            flags,
                             int                            iloc)
{
    cl_int               cl_error;
    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int                  shmem;
    size_t               local_work_size[3], global_work_size[3];
    cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t       *adat   = nb->atdat;
    cl_nbparam_t        *nbp    = nb->nbparam;
    cl_plist_t          *plist  = nb->plist[iloc];
    cl_timers_t         *t      = nb->timers;
    cl_command_queue     stream = nb->stream[iloc];

    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool                 bDoTime     = nb->bDoTime;
    cl_uint              arg_no;

    cl_nbparam_params_t  nbparams_params;
#ifdef DEBUG_OCL
    float               *debug_buffer_h;
    size_t               debug_buffer_size;
#endif

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (iloc == eintNonlocal && plist->nsci == 0)
    {
        return;
    }
    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (is not allowed with OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions.
         */
        return;
    }
    /* beginning of timed nonbonded calculation section */

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb,
                                    nbp->eeltype,
                                    nbp->vdwtype,
                                    bCalcEner,
                                    plist->bDoPrune || always_prune);

    /* kernel launch config */
    local_work_size[0] = c_clSize;
    local_work_size[1] = c_clSize;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem = calc_shmem_required(nbp->vdwtype, nb->bPrefetchLjParam);
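
    /* Editor's note (illustrative example, not part of the original source): with
     * c_clSize == 8 and, say, plist->nsci == 1000 super-cluster i-entries, this becomes
     * a (1000*8) x 8 x 1 NDRange of 8 x 8 x 1 work-groups, i.e. one work-group per
     * super-cluster, with shmem bytes of local memory requested per work-group via the
     * clSetKernelArg(nb_kernel, arg_no++, shmem, NULL) call below. */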
#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  debug_buffer_size, debug_buffer_h, &cl_error);
                assert(CL_SUCCESS == cl_error);
            }
        }

        run_step++;
    }
#endif
    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
                c_numClPerSupercl, plist->na_c);
    }
    fillin_ocl_structures(nbp, &nbparams_params);

    arg_no   = 0;
    cl_error = CL_SUCCESS;
    if (!useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
    }
    else
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));

    assert(cl_error == CL_SUCCESS);

    if (cl_error)
    {
        printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
    }

    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
    assert(cl_error == CL_SUCCESS);
#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char  file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
                               debug_buffer_size, stream, NULL);

            // Make sure all data has been transferred back from device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }

                //fprintf(pf, "\n");
            }

            fclose(pf);

            printf(" done.\n");

            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                              const struct nbnxn_atomdata_t *nbatom,
                              int                            flags,
                              int                            aloc)
{
    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    int               iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);

        gmx_incons(stmp);
    }
    cl_atomdata_t   *adat    = nb->atdat;
    cl_timers_t     *t       = nb->timers;
    bool             bDoTime = nb->bDoTime;
    cl_command_queue stream  = nb->stream[iloc];

    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;


    /* don't launch non-local copy-back if there was no non-local work to do */
    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamActive has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamActive = false;
        return;
    }
    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed D2H section */

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bNonLocalStreamActive)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);

    /* kick off work */
    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
        nb->bNonLocalStreamActive = true;
    }
    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);

            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
        }
    }
}
/*! \brief
 * Wait for the asynchronously launched nonbonded calculations and data
 * transfers to finish.
 */
void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
                            int flags, int aloc,
                            real *e_lj, real *e_el, rvec *fshift)
{
    /* NOTE:  only implemented for single-precision at this time */
    cl_int gmx_unused cl_error;
    int               i, iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }
    cl_plist_t                 *plist    = nb->plist[iloc];
    cl_timers_t                *timers   = nb->timers;
    struct gmx_wallclock_gpu_t *timings  = nb->timings;
    cl_nb_staging               nbst     = nb->nbst;

    bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Launch wait/update timers & counters, unless doing the non-local phase
       when there is not actually work to do. This is consistent with
       nbnxn_gpu_launch_kernel.

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
    cl_error = clFinish(nb->stream[iloc]);
    assert(CL_SUCCESS == cl_error);
    /* timing data accumulation */
    if (nb->bDoTime)
    {
        /* only increase counter once (at local F wait) */
        if (LOCAL_I(iloc))
        {
            timings->nb_c++;
            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
        }

        /* kernel timings */

        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
            ocl_event_elapsed_ms(timers->nb_k + iloc);

        /* X/q H2D and F D2H timings */
        timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);

        /* only count atdat and pair-list H2D at pair-search step */
        if (plist->bDoPrune)
        {
            /* atdat transfer timing (add only once, at local F wait) */
            if (LOCAL_A(aloc))
            {
                timings->pl_h2d_c++;
                timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
            }

            timings->pl_h2d_t +=
                ocl_event_elapsed_ms(timers->pl_h2d_sci  + iloc) +
                ocl_event_elapsed_ms(timers->pl_h2d_cj4  + iloc) +
                ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc);
        }
    }
    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iloc))
    {
        if (bCalcEner)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (bCalcFshift)
        {
            for (i = 0; i < SHIFTS; i++)
            {
                fshift[i][0] += (nbst.fshift)[i][0];
                fshift[i][1] += (nbst.fshift)[i][1];
                fshift[i][2] += (nbst.fshift)[i][2];
            }
        }
    }

    /* turn off pruning (doesn't matter if this is pair-search step or not) */
    plist->bDoPrune = false;
}
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
{
    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
    int  kernel_type;

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);

    if (bForceAnalyticalEwald && bForceTabulatedEwald)
    {
        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
                   "requested through environment variables.");
    }

    /* OpenCL: By default, use analytical Ewald
     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
     *
     * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
     */
    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    {
        bUseAnalyticalEwald = true;

        if (debug)
        {
            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
        }
    }
    else
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
    }
    else
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
    }

    return kernel_type;
}