src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *  \brief Define OpenCL implementation of nbnxn_gpu.h
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \ingroup module_mdlib
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdlib.h>
  49
  50 #if defined(_MSVC)
  51 #include <limits>
  52 #endif
  53
  54 #include "gromacs/gpu_utils/oclutils.h"
  55 #include "gromacs/hardware/hw_info.h"
  56 #include "gromacs/mdlib/force_flags.h"
  57 #include "gromacs/mdlib/nb_verlet.h"
  58 #include "gromacs/mdlib/nbnxn_consts.h"
  59 #include "gromacs/mdlib/nbnxn_pairlist.h"
  60 #include "gromacs/timing/gpu_timing.h"
  61
  62 #ifdef TMPI_ATOMICS
  63 #include "thread_mpi/atomic.h"
  64 #endif
  65
  66 #include "gromacs/mdlib/nbnxn_gpu.h"
  67 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  68 #include "gromacs/pbcutil/ishift.h"
  69 #include "gromacs/utility/cstringutil.h"
  70 #include "gromacs/utility/fatalerror.h"
  71
  72 #include "nbnxn_ocl_types.h"
  73
  74 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  75 #define USE_TEXOBJ
  76 #endif
  77
  78 /*! \brief Convenience defines */
  79 //@{
  80 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  81 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  82 //@}
  83
  84 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
  85 //@{
  86 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
  87 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
  88 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  89 //@}
  90
  91 /* Uncomment this define to enable kernel debugging */
  92 //#define DEBUG_OCL
  93
  94 /*! \brief Specifies which kernel run to debug */
  95 #define DEBUG_RUN_STEP 2
  96
  97 /*! \brief Validates the input global work size parameter.
  98  */
  99 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
 100 {
 101     cl_uint device_size_t_size_bits;
 102     cl_uint host_size_t_size_bits;
 103
 104     assert(dinfo);
 105
 106     /* Each component of a global_work_size must not exceed the range given by the
 107        sizeof(device size_t) for the device on which the kernel execution will
 108        be enqueued. See:
 109        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 110      */
 111     device_size_t_size_bits = dinfo->adress_bits;
 112     host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
 113
 114     /* If sizeof(host size_t) <= sizeof(device size_t)
 115             => global_work_size components will always be valid
 116        else
 117             => get device limit for global work size and
 118             compare it against each component of global_work_size.
 119      */
 120     if (host_size_t_size_bits > device_size_t_size_bits)
 121     {
 122         size_t device_limit;
 123
 124         device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
 125
 126         for (int i = 0; i < work_dim; i++)
 127         {
 128             if (global_work_size[i] > device_limit)
 129             {
 130                 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 131                           "The number of nonbonded work units (=number of super-clusters) exceeds the"
 132                           "device capabilities. Global work size limit exceeded (%d > %d)!",
 133                           global_work_size[i], device_limit);
 134             }
 135         }
 136     }
 137 }
 138
 139 /* Constant arrays listing non-bonded kernel function names. The arrays are
 140  * organized in 2-dim arrays by: electrostatics and VDW type.
 141  *
 142  *  Note that the row- and column-order of function pointers has to match the
 143  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 144  *  defined in nbnxn_cuda_types.h.
 145  */
 146
 147 /*! \brief Force-only kernel function names. */
 148 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
 149 {
 150     { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
 151     { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
 152     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
 153     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
 154     { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
 155     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
 156 };
 157
 158 /*! \brief Force + energy kernel function pointers. */
 159 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
 160 {
 161     { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
 162     { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
 163     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
 164     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
 165     { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
 166     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
 167 };
 168
 169 /*! \brief Force + pruning kernel function pointers. */
 170 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
 171 {
 172     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
 173     { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
 174     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
 175     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
 176     { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
 177     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
 178 };
 179
 180 /*! \brief Force + energy + pruning kernel function pointers. */
 181 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
 182 {
 183     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
 184     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
 185     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
 186     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
 187     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
 188     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
 189 };
 190
 191 /*! \brief Return a pointer to the kernel version to be executed at the current step.
 192  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 193  *  found in the cache, it will be created and the cache will be updated.
 194  */
 195 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
 196                                             int                eeltype,
 197                                             int                evdwtype,
 198                                             bool               bDoEne,
 199                                             bool               bDoPrune)
 200 {
 201     const char* kernel_name_to_run;
 202     cl_kernel  *kernel_ptr;
 203     cl_int      cl_error;
 204
 205     assert(eeltype < eelOclNR);
 206     assert(evdwtype < eelOclNR);
 207
 208     if (bDoEne)
 209     {
 210         if (bDoPrune)
 211         {
 212             kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 213             kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
 214         }
 215         else
 216         {
 217             kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 218             kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
 219         }
 220     }
 221     else
 222     {
 223         if (bDoPrune)
 224         {
 225             kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 226             kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
 227         }
 228         else
 229         {
 230             kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 231             kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
 232         }
 233     }
 234
 235     if (NULL == kernel_ptr[0])
 236     {
 237         *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
 238         assert(cl_error == CL_SUCCESS);
 239     }
 240     // TODO: handle errors
 241
 242     return *kernel_ptr;
 243 }
 244
 245 /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
 246  */
 247 static inline int calc_shmem_required()
 248 {
 249     int shmem;
 250
 251     /* size of shmem (force-buffers/xq/atom type preloading) */
 252     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 253     /* i-atom x+q in shared memory */
 254     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
 255     /* cj in shared memory, for both warps separately */
 256     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
 257 #ifdef IATYPE_SHMEM
 258     /* FIXME: this should not be compile-time decided but rather at runtime.
 259      * This issue propagated from the CUDA code where due to the source to source
 260      * compilation there was confusion the way to set up arch-dependent launch parameters.
 261      * Here too this should be converted to a hardware/arch/generation dependent
 262      * conditional when re-evaluating the need for i atom type preloading.
 263      */
 264     /* i-atom types in shared memory */
 265     #pragma error "Should not be defined"
 266     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
 267 #endif
 268     /* force reduction buffers in shared memory */
 269     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
 270     /* Warp vote. In fact it must be * number of warps in block.. */
 271     shmem += sizeof(cl_uint) * 2;                   /* warp_any */
 272     return shmem;
 273 }
 274
 275 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 276  *
 277  *  The device can't use the same data structures as the host for two main reasons:
 278  *  - OpenCL restrictions (pointers are not accepted inside data structures)
 279  *  - some host side fields are not needed for the OpenCL kernels.
 280  */
 281 static void fillin_ocl_structures(cl_nbparam_t        *nbp,
 282                                   cl_nbparam_params_t *nbparams_params)
 283 {
 284     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
 285     nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
 286     nbparams_params->c_rf              = nbp->c_rf;
 287     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
 288     nbparams_params->eeltype           = nbp->eeltype;
 289     nbparams_params->epsfac            = nbp->epsfac;
 290     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
 291     nbparams_params->ewald_beta        = nbp->ewald_beta;
 292     nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
 293     nbparams_params->repulsion_shift   = nbp->repulsion_shift;
 294     nbparams_params->rlist_sq          = nbp->rlist_sq;
 295     nbparams_params->rvdw_sq           = nbp->rvdw_sq;
 296     nbparams_params->rvdw_switch       = nbp->rvdw_switch;
 297     nbparams_params->sh_ewald          = nbp->sh_ewald;
 298     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
 299     nbparams_params->two_k_rf          = nbp->two_k_rf;
 300     nbparams_params->vdwtype           = nbp->vdwtype;
 301     nbparams_params->vdw_switch        = nbp->vdw_switch;
 302 }
 303
 304 /*! \brief Waits for the commands associated with the input event to finish.
 305  * Then it releases the event and sets it to 0.
 306  * Don't use this function when more than one wait will be issued for the event.
 307  */
 308 void wait_ocl_event(cl_event *ocl_event)
 309 {
 310     cl_int gmx_unused cl_error;
 311
 312     /* Blocking wait for the event */
 313     cl_error = clWaitForEvents(1, ocl_event);
 314     assert(CL_SUCCESS == cl_error);
 315
 316     /* Release event and reset it to 0 */
 317     cl_error = clReleaseEvent(*ocl_event);
 318     assert(CL_SUCCESS == cl_error);
 319     *ocl_event = 0;
 320 }
 321
 322 /*! \brief Enqueues a wait for event completion.
 323  *
 324  * Then it releases the event and sets it to 0.
 325  * Don't use this function when more than one wait will be issued for the event.
 326  * Equivalent to Cuda Stream Sync. */
 327 void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 328 {
 329     cl_int gmx_unused cl_error;
 330
 331     /* Enqueue wait */
 332 #ifdef CL_VERSION_1_2
 333     cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
 334 #else
 335     cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
 336 #endif
 337
 338     assert(CL_SUCCESS == cl_error);
 339
 340     /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
 341     cl_error = clReleaseEvent(*ocl_event);
 342     assert(CL_SUCCESS == cl_error);
 343     *ocl_event = 0;
 344 }
 345
 346 /*! \brief Returns the duration in milliseconds for the command associated with the event.
 347  *
 348  * It then releases the event and sets it to 0.
 349  * Before calling this function, make sure the command has finished either by
 350  * calling clFinish or clWaitForEvents.
 351  * The function returns 0.0 if the input event, *ocl_event, is 0.
 352  * Don't use this function when more than one wait will be issued for the event.
 353  */
 354 double ocl_event_elapsed_ms(cl_event *ocl_event)
 355 {
 356     cl_int gmx_unused cl_error;
 357     cl_ulong          start_ns, end_ns;
 358     double            elapsed_ms;
 359
 360     elapsed_ms = 0.0;
 361     assert(NULL != ocl_event);
 362
 363     if (*ocl_event)
 364     {
 365         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
 366                                            sizeof(cl_ulong), &start_ns, NULL);
 367         assert(CL_SUCCESS == cl_error);
 368
 369         cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
 370                                            sizeof(cl_ulong), &end_ns, NULL);
 371         assert(CL_SUCCESS == cl_error);
 372
 373         clReleaseEvent(*ocl_event);
 374         *ocl_event = 0;
 375
 376         elapsed_ms = (end_ns - start_ns) / 1000000.0;
 377     }
 378
 379     return elapsed_ms;
 380 }
 381
 382 /*! \brief Launch GPU kernel
 383
 384    As we execute nonbonded workload in separate queues, before launching
 385    the kernel we need to make sure that he following operations have completed:
 386    - atomdata allocation and related H2D transfers (every nstlist step);
 387    - pair list H2D transfer (every nstlist step);
 388    - shift vector H2D transfer (every nstlist step);
 389    - force (+shift force and energy) output clearing (every step).
 390
 391    These operations are issued in the local queue at the beginning of the step
 392    and therefore always complete before the local kernel launch. The non-local
 393    kernel is launched after the local on the same device/context, so this is
 394    inherently scheduled after the operations in the local stream (including the
 395    above "misc_ops").
 396    However, for the sake of having a future-proof implementation, we use the
 397    misc_ops_done event to record the point in time when the above  operations
 398    are finished and synchronize with this event in the non-local stream.
 399  */
 400 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 401                              const struct nbnxn_atomdata_t *nbatom,
 402                              int                            flags,
 403                              int                            iloc)
 404 {
 405     cl_int               cl_error;
 406     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 407     /* OpenCL kernel launch-related stuff */
 408     int                  shmem;
 409     size_t               local_work_size[3], global_work_size[3];
 410     cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 411
 412     cl_atomdata_t       *adat    = nb->atdat;
 413     cl_nbparam_t        *nbp     = nb->nbparam;
 414     cl_plist_t          *plist   = nb->plist[iloc];
 415     cl_timers_t         *t       = nb->timers;
 416     cl_command_queue     stream  = nb->stream[iloc];
 417
 418     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 419     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
 420     bool                 bDoTime     = nb->bDoTime;
 421     cl_uint              arg_no;
 422
 423     cl_nbparam_params_t  nbparams_params;
 424 #ifdef DEBUG_OCL
 425     float              * debug_buffer_h;
 426     size_t               debug_buffer_size;
 427 #endif
 428
 429     /* turn energy calculation always on/off (for debugging/testing only) */
 430     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 431
 432     /* Don't launch the non-local kernel if there is no work to do.
 433        Doing the same for the local kernel is more complicated, since the
 434        local part of the force array also depends on the non-local kernel.
 435        So to avoid complicating the code and to reduce the risk of bugs,
 436        we always call the local kernel, the local x+q copy and later (not in
 437        this function) the stream wait, local f copyback and the f buffer
 438        clearing. All these operations, except for the local interaction kernel,
 439        are needed for the non-local interactions. The skip of the local kernel
 440        call is taken care of later in this function. */
 441     if (iloc == eintNonlocal && plist->nsci == 0)
 442     {
 443         return;
 444     }
 445
 446     /* calculate the atom data index range based on locality */
 447     if (LOCAL_I(iloc))
 448     {
 449         adat_begin  = 0;
 450         adat_len    = adat->natoms_local;
 451     }
 452     else
 453     {
 454         adat_begin  = adat->natoms_local;
 455         adat_len    = adat->natoms - adat->natoms_local;
 456     }
 457
 458     /* beginning of timed HtoD section */
 459
 460     /* HtoD x, q */
 461     ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
 462                        adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
 463
 464     /* When we get here all misc operations issues in the local stream as well as
 465        the local xq H2D are done,
 466        so we record that in the local stream and wait for it in the nonlocal one. */
 467     if (nb->bUseTwoStreams)
 468     {
 469         if (iloc == eintLocal)
 470         {
 471 #ifdef CL_VERSION_1_2
 472             cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
 473 #else
 474             cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
 475 #endif
 476             assert(CL_SUCCESS == cl_error);
 477
 478             /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
 479              * in the local stream in order to be able to sync with the above event
 480              * from the non-local stream.
 481              */
 482             cl_error = clFlush(stream);
 483             assert(CL_SUCCESS == cl_error);
 484         }
 485         else
 486         {
 487             sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
 488         }
 489     }
 490
 491     if (plist->nsci == 0)
 492     {
 493         /* Don't launch an empty local kernel (is not allowed with OpenCL).
 494          * TODO: Separate H2D and kernel launch into separate functions.
 495          */
 496         return;
 497     }
 498
 499     /* beginning of timed nonbonded calculation section */
 500
 501     /* get the pointer to the kernel flavor we need to use */
 502     nb_kernel = select_nbnxn_kernel(nb,
 503                                     nbp->eeltype,
 504                                     nbp->vdwtype,
 505                                     bCalcEner,
 506                                     plist->bDoPrune || always_prune);
 507
 508     /* kernel launch config */
 509     local_work_size[0] = CL_SIZE;
 510     local_work_size[1] = CL_SIZE;
 511     local_work_size[2] = 1;
 512
 513     global_work_size[0] = plist->nsci * local_work_size[0];
 514     global_work_size[1] = 1 * local_work_size[1];
 515     global_work_size[2] = 1 * local_work_size[2];
 516
 517     validate_global_work_size(global_work_size, 3, nb->dev_info);
 518
 519     shmem     = calc_shmem_required();
 520
 521 #ifdef DEBUG_OCL
 522     {
 523         static int run_step = 1;
 524
 525         if (DEBUG_RUN_STEP == run_step)
 526         {
 527             debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
 528             debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
 529             assert(NULL != debug_buffer_h);
 530
 531             if (NULL == nb->debug_buffer)
 532             {
 533                 nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 534                                                   debug_buffer_size, debug_buffer_h, &cl_error);
 535
 536                 assert(CL_SUCCESS == cl_error);
 537             }
 538         }
 539
 540         run_step++;
 541     }
 542 #endif
 543     if (debug)
 544     {
 545         fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
 546                 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 547                 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
 548                 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
 549                 NCL_PER_SUPERCL, plist->na_c);
 550     }
 551
 552     fillin_ocl_structures(nbp, &nbparams_params);
 553
 554     arg_no    = 0;
 555     cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
 556     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
 557     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
 558     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
 559     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
 560     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
 561     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
 562     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
 563     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
 564     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
 565     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
 566     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
 567     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
 568     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
 569     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
 570     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
 571     cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
 572     cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
 573
 574     assert(cl_error == CL_SUCCESS);
 575
 576     if (cl_error)
 577     {
 578         printf("ClERROR! %d\n", cl_error);
 579     }
 580     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
 581     assert(cl_error == CL_SUCCESS);
 582
 583 #ifdef DEBUG_OCL
 584     {
 585         static int run_step = 1;
 586
 587         if (DEBUG_RUN_STEP == run_step)
 588         {
 589             FILE *pf;
 590             char  file_name[256] = {0};
 591
 592             ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
 593                                debug_buffer_size, stream, NULL);
 594
 595             // Make sure all data has been transfered back from device
 596             clFinish(stream);
 597
 598             printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
 599
 600             sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
 601             pf = fopen(file_name, "wt");
 602             assert(pf != NULL);
 603
 604             fprintf(pf, "%20s", "");
 605             for (int j = 0; j < global_work_size[0]; j++)
 606             {
 607                 char label[20];
 608                 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
 609                 fprintf(pf, "%20s", label);
 610             }
 611
 612             for (int i = 0; i < global_work_size[1]; i++)
 613             {
 614                 char label[20];
 615                 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
 616                 fprintf(pf, "\n%20s", label);
 617
 618                 for (int j = 0; j < global_work_size[0]; j++)
 619                 {
 620                     fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
 621                 }
 622
 623                 //fprintf(pf, "\n");
 624             }
 625
 626             fclose(pf);
 627
 628             printf(" done.\n");
 629
 630
 631             free(debug_buffer_h);
 632             debug_buffer_h = NULL;
 633         }
 634
 635         run_step++;
 636     }
 637 #endif
 638 }
 639
 640 /*! \brief Debugging helper function */
 641 void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
 642 {
 643     FILE *pf;
 644
 645     pf = fopen(out_file, "wt");
 646     assert(pf != NULL);
 647
 648     fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
 649             "cj[0]", "cj[1]", "cj[2]", "cj[3]",
 650             "imei[0].excl_ind", "imei[0].imask",
 651             "imei[1].excl_ind", "imei[1].imask");
 652
 653     for (int index = 0; index < cnt; index++)
 654     {
 655         fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
 656                 results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
 657                 results[index].imei[0].excl_ind, results[index].imei[0].imask,
 658                 results[index].imei[1].excl_ind, results[index].imei[1].imask);
 659     }
 660
 661     fclose(pf);
 662
 663     printf("\nWrote results to %s", out_file);
 664
 665     pf = fopen(ref_file, "rt");
 666     if (pf)
 667     {
 668         char c;
 669         int  diff = 0;
 670         printf("\n%s file found. Comparing results...", ref_file);
 671
 672         /* Skip the first line */
 673         c = 0;
 674         while (c != '\n')
 675         {
 676             if (1 != fscanf(pf, "%c", &c))
 677             {
 678                 break;
 679             }
 680         }
 681
 682         for (int index = 0; index < cnt; index++)
 683         {
 684             int          ref_val;
 685             unsigned int u_ref_val;
 686
 687             for (int j = 0; j < 4; j++)
 688             {
 689                 if (1 != fscanf(pf, "%20d", &ref_val))
 690                 {
 691                     break;
 692                 }
 693
 694                 if (ref_val != results[index].cj[j])
 695                 {
 696                     printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
 697                            j, index, results[index].cj[j], ref_val);
 698
 699                     diff++;
 700                 }
 701             }
 702
 703             for (int j = 0; j < 2; j++)
 704             {
 705                 if (1 != fscanf(pf, "%20d", &ref_val))
 706                 {
 707                     break;
 708                 }
 709
 710                 if (ref_val != results[index].imei[j].excl_ind)
 711                 {
 712                     printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
 713                            j, index, results[index].imei[j].excl_ind, ref_val);
 714
 715                     diff++;
 716                 }
 717
 718                 if (1 != fscanf(pf, "%20u", &u_ref_val))
 719                 {
 720                     break;
 721                 }
 722
 723                 if (u_ref_val != results[index].imei[j].imask)
 724                 {
 725                     printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
 726                            j, index, results[index].imei[j].imask, u_ref_val);
 727
 728                     diff++;
 729                 }
 730
 731             }
 732         }
 733
 734         printf("\nFinished comparing results. Total number of differences: %d", diff);
 735         fclose(pf);
 736     }
 737     else
 738     {
 739         printf("\n%s file not found. No comparison performed.", ref_file);
 740     }
 741 }
 742
 743 /*! \brief Debugging helper function */
 744 void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
 745 {
 746     FILE *pf;
 747     float cmp_eps = 0.001f;
 748
 749     pf = fopen(out_file, "wt");
 750     assert(pf != NULL);
 751
 752     for (int index = 0; index < cnt; index++)
 753     {
 754         fprintf(pf, "%15.5f\n", results[index]);
 755     }
 756
 757     fclose(pf);
 758
 759     printf("\nWrote results to %s", out_file);
 760
 761     pf = fopen(ref_file, "rt");
 762     if (pf)
 763     {
 764         int diff = 0;
 765         printf("\n%s file found. Comparing results...", ref_file);
 766         for (int index = 0; index < cnt; index++)
 767         {
 768             float ref_val;
 769             if (1 != fscanf(pf, "%20f", &ref_val))
 770             {
 771                 break;
 772             }
 773
 774             if (((ref_val - results[index]) > cmp_eps) ||
 775                 ((ref_val - results[index]) < -cmp_eps))
 776             {
 777                 printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
 778                        index, results[index], ref_val);
 779
 780                 diff++;
 781             }
 782         }
 783
 784         printf("\nFinished comparing results. Total number of differences: %d", diff);
 785         fclose(pf);
 786     }
 787     else
 788     {
 789         printf("\n%s file not found. No comparison performed.", ref_file);
 790     }
 791 }
 792
 793 /*! \brief
 794  * Debug function for dumping cj4, f and fshift buffers.
 795  * By default this function does nothing. To enable debugging for any of these
 796  * buffers, uncomment the corresponding definition inside the function:
 797  * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
 798  */
 799 static
 800 void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
 801                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
 802                              cl_command_queue              gmx_unused  stream,
 803                              int                           gmx_unused  adat_begin,
 804                              int                           gmx_unused  adat_len)
 805 {
 806 /* Uncomment this define to enable cj4 debugging for the first kernel run */
 807 //#define DEBUG_DUMP_CJ4_OCL
 808 #ifdef DEBUG_DUMP_CJ4_OCL
 809     {
 810         static int run_step = 1;
 811
 812         if (DEBUG_RUN_STEP == run_step)
 813         {
 814             nbnxn_cj4_t *temp_cj4;
 815             int          cnt;
 816             size_t       size;
 817             char         ocl_file_name[256]  = {0};
 818             char         cuda_file_name[256] = {0};
 819
 820             cnt      = nb->plist[0]->ncj4;
 821             size     = cnt * sizeof(nbnxn_cj4_t);
 822             temp_cj4 = (nbnxn_cj4_t*)malloc(size);
 823
 824             ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
 825                                size, stream, NULL);
 826
 827             // Make sure all data has been transfered back from device
 828             clFinish(stream);
 829
 830             sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
 831             sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
 832             dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
 833
 834             free(temp_cj4);
 835         }
 836
 837         run_step++;
 838     }
 839 #endif
 840
 841 /* Uncomment this define to enable f debugging for the first kernel run */
 842 //#define DEBUG_DUMP_F_OCL
 843 #ifdef DEBUG_DUMP_F_OCL
 844     {
 845         static int run_step = 1;
 846
 847         if (DEBUG_RUN_STEP == run_step)
 848         {
 849             char ocl_file_name[256]  = {0};
 850             char cuda_file_name[256] = {0};
 851
 852             // Make sure all data has been transfered back from device
 853             clFinish(stream);
 854
 855             sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
 856             sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
 857
 858             dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
 859                                    ocl_file_name, cuda_file_name);
 860         }
 861
 862         run_step++;
 863     }
 864 #endif
 865
 866 /* Uncomment this define to enable fshift debugging for the first kernel run */
 867 //#define DEBUG_DUMP_FSHIFT_OCL
 868 #ifdef DEBUG_DUMP_FSHIFT_OCL
 869     {
 870         static int run_step = 1;
 871
 872         if (DEBUG_RUN_STEP == run_step)
 873         {
 874             char ocl_file_name[256]  = {0};
 875             char cuda_file_name[256] = {0};
 876
 877             // Make sure all data has been transfered back from device
 878             clFinish(stream);
 879
 880             sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
 881             sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
 882
 883             dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
 884                                    ocl_file_name, cuda_file_name);
 885         }
 886
 887         run_step++;
 888     }
 889 #endif
 890 }
 891
 892 /*! \brief
 893  * Launch asynchronously the download of nonbonded forces from the GPU
 894  * (and energies/shift forces if required).
 895  */
 896 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 897                               const struct nbnxn_atomdata_t *nbatom,
 898                               int                            flags,
 899                               int                            aloc)
 900 {
 901     cl_int gmx_unused cl_error;
 902     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 903     int               iloc = -1;
 904
 905     /* determine interaction locality from atom locality */
 906     if (LOCAL_A(aloc))
 907     {
 908         iloc = eintLocal;
 909     }
 910     else if (NONLOCAL_A(aloc))
 911     {
 912         iloc = eintNonlocal;
 913     }
 914     else
 915     {
 916         char stmp[STRLEN];
 917         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 918                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 919
 920         gmx_incons(stmp);
 921     }
 922
 923     cl_atomdata_t   *adat    = nb->atdat;
 924     cl_timers_t     *t       = nb->timers;
 925     bool             bDoTime = nb->bDoTime;
 926     cl_command_queue stream  = nb->stream[iloc];
 927
 928     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 929     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 930
 931
 932     /* don't launch non-local copy-back if there was no non-local work to do */
 933     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 934     {
 935         return;
 936     }
 937
 938     /* calculate the atom data index range based on locality */
 939     if (LOCAL_A(aloc))
 940     {
 941         adat_begin  = 0;
 942         adat_len    = adat->natoms_local;
 943     }
 944     else
 945     {
 946         adat_begin  = adat->natoms_local;
 947         adat_len    = adat->natoms - adat->natoms_local;
 948     }
 949
 950     /* beginning of timed D2H section */
 951
 952     /* With DD the local D2H transfer can only start after the non-local
 953        has been launched. */
 954     if (iloc == eintLocal && nb->bUseTwoStreams)
 955     {
 956         sync_ocl_event(stream, &(nb->nonlocal_done));
 957     }
 958
 959     /* DtoH f */
 960     ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
 961                        (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
 962
 963     /* kick off work */
 964     cl_error = clFlush(stream);
 965     assert(CL_SUCCESS == cl_error);
 966
 967     /* After the non-local D2H is launched the nonlocal_done event can be
 968        recorded which signals that the local D2H can proceed. This event is not
 969        placed after the non-local kernel because we first need the non-local
 970        data back first. */
 971     if (iloc == eintNonlocal)
 972     {
 973 #ifdef CL_VERSION_1_2
 974         cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
 975 #else
 976         cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
 977 #endif
 978         assert(CL_SUCCESS == cl_error);
 979     }
 980
 981     /* only transfer energies in the local stream */
 982     if (LOCAL_I(iloc))
 983     {
 984         /* DtoH fshift */
 985         if (bCalcFshift)
 986         {
 987             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
 988                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
 989         }
 990
 991         /* DtoH energies */
 992         if (bCalcEner)
 993         {
 994             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
 995                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
 996
 997             ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
 998                                sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
 999         }
1000     }
1001
1002     debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
1003 }
1004
1005 /*! \brief
1006  * Wait for the asynchronously launched nonbonded calculations and data
1007  * transfers to finish.
1008  */
1009 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
1010                             int flags, int aloc,
1011                             real *e_lj, real *e_el, rvec *fshift)
1012 {
1013     /* NOTE:  only implemented for single-precision at this time */
1014     cl_int gmx_unused      cl_error;
1015     int                    i, iloc = -1;
1016
1017     /* determine interaction locality from atom locality */
1018     if (LOCAL_A(aloc))
1019     {
1020         iloc = eintLocal;
1021     }
1022     else if (NONLOCAL_A(aloc))
1023     {
1024         iloc = eintNonlocal;
1025     }
1026     else
1027     {
1028         char stmp[STRLEN];
1029         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
1030                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
1031         gmx_incons(stmp);
1032     }
1033
1034     cl_plist_t                 *plist    = nb->plist[iloc];
1035     cl_timers_t                *timers   = nb->timers;
1036     struct gmx_wallclock_gpu_t *timings  = nb->timings;
1037     cl_nb_staging               nbst     = nb->nbst;
1038
1039     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
1040     int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
1041
1042     /* turn energy calculation always on/off (for debugging/testing only) */
1043     bCalcEner = (bCalcEner || always_ener) && !never_ener;
1044
1045     /* Launch wait/update timers & counters, unless doing the non-local phase
1046        when there is not actually work to do. This is consistent with
1047        nbnxn_gpu_launch_kernel.
1048
1049        NOTE: if timing with multiple GPUs (streams) becomes possible, the
1050        counters could end up being inconsistent due to not being incremented
1051        on some of the nodes! */
1052     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
1053     {
1054         return;
1055     }
1056
1057     /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
1058     cl_error = clFinish(nb->stream[iloc]);
1059     assert(CL_SUCCESS == cl_error);
1060
1061     /* timing data accumulation */
1062     if (nb->bDoTime)
1063     {
1064         /* only increase counter once (at local F wait) */
1065         if (LOCAL_I(iloc))
1066         {
1067             timings->nb_c++;
1068             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1069         }
1070
1071         /* kernel timings */
1072
1073         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
1074             ocl_event_elapsed_ms(timers->nb_k + iloc);
1075
1076         /* X/q H2D and F D2H timings */
1077         timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
1078         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
1079         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
1080         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
1081         timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
1082
1083         /* only count atdat and pair-list H2D at pair-search step */
1084         if (plist->bDoPrune)
1085         {
1086             /* atdat transfer timing (add only once, at local F wait) */
1087             if (LOCAL_A(aloc))
1088             {
1089                 timings->pl_h2d_c++;
1090                 timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
1091             }
1092
1093             timings->pl_h2d_t +=
1094                 ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
1095                 ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
1096                 ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
1097
1098         }
1099     }
1100
1101     /* add up energies and shift forces (only once at local F wait) */
1102     if (LOCAL_I(iloc))
1103     {
1104         if (bCalcEner)
1105         {
1106             *e_lj += *nbst.e_lj;
1107             *e_el += *nbst.e_el;
1108         }
1109
1110         if (bCalcFshift)
1111         {
1112             for (i = 0; i < SHIFTS; i++)
1113             {
1114                 fshift[i][0] += (nbst.fshift)[i][0];
1115                 fshift[i][1] += (nbst.fshift)[i][1];
1116                 fshift[i][2] += (nbst.fshift)[i][2];
1117             }
1118         }
1119     }
1120
1121     /* turn off pruning (doesn't matter if this is pair-search step or not) */
1122     plist->bDoPrune = false;
1123
1124 }
1125
1126 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1127 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1128 {
1129     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1130     int  kernel_type;
1131
1132     /* Benchmarking/development environment variables to force the use of
1133        analytical or tabulated Ewald kernel. */
1134     bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1135     bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1136
1137     if (bForceAnalyticalEwald && bForceTabulatedEwald)
1138     {
1139         gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1140                    "requested through environment variables.");
1141     }
1142
1143     /* OpenCL: By default, use analytical Ewald
1144      * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
1145      *
1146      * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1147      *
1148      */
1149     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1150     if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1151     {
1152         bUseAnalyticalEwald = true;
1153
1154         if (debug)
1155         {
1156             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1157         }
1158     }
1159     else
1160     {
1161         bUseAnalyticalEwald = false;
1162
1163         if (debug)
1164         {
1165             fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1166         }
1167     }
1168
1169     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1170        forces it (use it for debugging/benchmarking only). */
1171     if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1172     {
1173         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1174     }
1175     else
1176     {
1177         kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1178     }
1179
1180     return kernel_type;
1181 }