From 4d79d129c77236c942ece15ed3ba5390baaa1a91 Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov
Date: Thu, 17 May 2018 18:40:13 +0200
Subject: [PATCH] Move the PmeGpuSpecific structure into pme-gpu-types-host-impl.h

This host structure contains trait-dependent types, but should also be
visible to OpenCL.

Change-Id: I068e537687b78488d66c927a355a115e4581885b
---
 src/gromacs/ewald/pme-gpu-types-host-impl.h | 146 ++++++++++++++++++++++++++++
 src/gromacs/ewald/pme-gpu-types-host.h      |  14 +--
 src/gromacs/ewald/pme-timings.cu            |   4 +-
 src/gromacs/ewald/pme.cuh                   |  89 +----------
 4 files changed, 157 insertions(+), 96 deletions(-)
 create mode 100644 src/gromacs/ewald/pme-gpu-types-host-impl.h

diff --git a/src/gromacs/ewald/pme-gpu-types-host-impl.h b/src/gromacs/ewald/pme-gpu-types-host-impl.h
new file mode 100644
index 0000000000..e005552400
--- /dev/null
+++ b/src/gromacs/ewald/pme-gpu-types-host-impl.h
@@ -0,0 +1,146 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Defines the host-side PME GPU data structure, which is dependent on the GPU types.
+ * It's included by pointer in the general PmeGpu host structure in pme-gpu-types-host.h.
+ *
+ * \author Aleksei Iupinov
+ * \ingroup module_ewald
+ */
+
+#ifndef PMEGPUTYPESHOSTIMPL_H
+#define PMEGPUTYPESHOSTIMPL_H
+
+#include "config.h"
+
+#include <array>
+#include <set>
+#include <vector>
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/gpuregiontimer.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
+#include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
+#endif
+
+#include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT
+
+class GpuParallel3dFft;
+
+/*! \internal \brief
+ * The main PME CUDA/OpenCL-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
+ */
+struct PmeGpuSpecific
+{
+    /*! \brief The GPU stream where everything related to the PME happens. */
+    CommandStream pmeStream;
+
+    /*! \brief
+     * A handle to the GPU context.
+     * TODO: this is currently extracted from the implementation of pmeGpu->programHandle_,
+     * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
+     * managed by high-level code.
+     */
+    Context context;
+
+    /* Synchronization events */
+    /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
+    GpuEventSynchronizer syncSpreadGridD2H;
+
+    /* Settings which are set at the start of the run */
+    /*! \brief A boolean which tells whether the complex and real grids for cu/clFFT are different or the same. Currently true. */
+    bool performOutOfPlaceFFT;
+    /*! \brief A boolean which tells if the GPU timing events are enabled.
+     * False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING.
+     * Note: will not be reliable when multiple GPU tasks are running concurrently on the same device context,
+     * as CUDA events on multiple streams are untrustworthy.
+     */
+    bool useTiming;
+
+    //! Vector of FFT setups
+    std::vector<std::unique_ptr<GpuParallel3dFft> > fftSetup;
+
+    //! All the timers one might use
+    std::array<GpuRegionTimer, gtPME_EVENT_COUNT> timingEvents;
+
+    //! Indices of timingEvents actually used
+    std::set<size_t> activeTimers;
+
+    /* GPU arrays element counts (not the arrays sizes in bytes!).
+     * They might be larger than the actual meaningful data sizes.
+     * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
+     * These integer pairs are mostly meaningful for the reallocateDeviceBuffer calls.
+     * As such, if DeviceBuffer is refactored into a class, they can be freely changed, too.
+     * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
+     * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
+     */
+    /*! \brief The kernelParams.atoms.coordinates float element count (actual) */
+    int coordinatesSize;
+    /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
+    int coordinatesSizeAlloc;
+    /*! \brief The kernelParams.atoms.forces float element count (actual) */
+    int forcesSize;
+    /*! \brief The kernelParams.atoms.forces float element count (reserved) */
+    int forcesSizeAlloc;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
+    int gridlineIndicesSize;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
+    int gridlineIndicesSizeAlloc;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
+    int splineDataSize;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
+    int splineDataSizeAlloc;
+    /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
+    int coefficientsSize;
+    /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
+    int coefficientsSizeAlloc;
+    /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
+    int splineValuesSize;
+    /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
+    int splineValuesSizeAlloc;
+    /*! \brief The kernelParams.grid.realGrid float element count (actual) */
+    int realGridSize;
+    /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
+    int realGridSizeAlloc;
+    /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
+    int complexGridSize;
+    /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
+    int complexGridSizeAlloc;
+};
+
+#endif
diff --git a/src/gromacs/ewald/pme-gpu-types-host.h b/src/gromacs/ewald/pme-gpu-types-host.h
index ccc83940b1..8335fe8ec8 100644
--- a/src/gromacs/ewald/pme-gpu-types-host.h
+++ b/src/gromacs/ewald/pme-gpu-types-host.h
@@ -59,20 +59,20 @@
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/vectypes.h"
 
-#if GMX_GPU == GMX_GPU_CUDA
-
-struct PmeGpuCuda;
-/*! \brief A typedef for including the GPU host data by pointer */
-typedef PmeGpuCuda PmeGpuSpecific;
+#if GMX_GPU != GMX_GPU_NONE
+struct PmeGpuSpecific;
+#else
+/*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */
+typedef int PmeGpuSpecific;
+#endif
 
+#if GMX_GPU == GMX_GPU_CUDA
 struct PmeGpuCudaKernelParams;
 /*! \brief A typedef for including the GPU kernel arguments data by pointer */
 typedef PmeGpuCudaKernelParams PmeGpuKernelParams;
 
 #else
-/*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */
-typedef int PmeGpuSpecific;
 /*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */
 typedef int PmeGpuKernelParams;
 
diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu
index 306ad2392b..dd6bc3ea38 100644
--- a/src/gromacs/ewald/pme-timings.cu
+++ b/src/gromacs/ewald/pme-timings.cu
@@ -45,7 +45,9 @@
 
 #include "gromacs/utility/gmxassert.h"
 
-#include "pme.cuh"
+#include "pme-gpu-internal.h"
+#include "pme-gpu-types-host.h"
+#include "pme-gpu-types-host-impl.h"
 
 /*! \brief \internal
  * Tells if CUDA-based performance tracking is enabled for PME.
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh
index 42ebb14635..bac34b69ef 100644
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -47,19 +47,13 @@
 
 #include <cassert>
 
-#include <set>
-#include <vector>
-
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-
 #include "pme-gpu-constants.h"
 #include "pme-gpu-internal.h"
 #include "pme-gpu-types.h"
 #include "pme-gpu-types-host.h"
+#include "pme-gpu-types-host-impl.h"
 #include "pme-timings.cuh"
 
-class GpuParallel3dFft;
-
 /*! \internal \brief
  * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
  * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
@@ -151,87 +145,6 @@ dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount)
 }
 
 /*! \brief \internal
- * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
- */
-struct PmeGpuCuda
-{
-    /*! \brief The CUDA stream where everything related to the PME happens. */
-    cudaStream_t pmeStream;
-
-    /*! \brief
-     * A handle to the GPU context.
-     * TODO: this is currently extracted from the implementation of pmeGpu->programHandle_,
-     * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
-     * managed by high-level code.
-     */
-    Context context;
-
-    /* Synchronization events */
-    /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
-    GpuEventSynchronizer syncSpreadGridD2H;
-
-    // TODO: consider moving some things below into the non-CUDA struct.
-
-    /* Settings which are set at the start of the run */
-    /*! \brief A boolean which tells whether the complex and real grids for cuFFT are different or same. Currenty true. */
-    bool performOutOfPlaceFFT;
-    /*! \brief A boolean which tells if the CUDA timing events are enabled.
-     * False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING.
-     * Note: will not be reliable when multiple GPU tasks are running concurrently on the same device context,
-     * as CUDA events on multiple streams are untrustworthy.
-     */
-    bool useTiming;
-
-    std::vector<std::unique_ptr<GpuParallel3dFft> > fftSetup;
-
-    std::array<GpuRegionTimer, gtPME_EVENT_COUNT> timingEvents;
-
-    std::set<size_t> activeTimers; // indices into timingEvents
-
-    /* GPU arrays element counts (not the arrays sizes in bytes!).
-     * They might be larger than the actual meaningful data sizes.
-     * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
-     * These integer pairs are mostly meaningful for the reallocateDeviceBuffer calls.
-     * As such, if DeviceBuffer is refactored into a class, they can be freely changed, too.
-     * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
-     * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
-     */
-    /*! \brief The kernelParams.atoms.coordinates float element count (actual)*/
-    int coordinatesSize;
-    /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
-    int coordinatesSizeAlloc;
-    /*! \brief The kernelParams.atoms.forces float element count (actual) */
-    int forcesSize;
-    /*! \brief The kernelParams.atoms.forces float element count (reserved) */
-    int forcesSizeAlloc;
-    /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
-    int gridlineIndicesSize;
-    /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
-    int gridlineIndicesSizeAlloc;
-    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
-    int splineDataSize;
-    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
-    int splineDataSizeAlloc;
-    /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
-    int coefficientsSize;
-    /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
-    int coefficientsSizeAlloc;
-    /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
-    int splineValuesSize;
-    /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
-    int splineValuesSizeAlloc;
-    /*! \brief The kernelParams.grid.realGrid float element count (actual) */
-    int realGridSize;
-    /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
-    int realGridSizeAlloc;
-    /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
-    int complexGridSize;
-    /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
-    int complexGridSizeAlloc;
-};
-
-
-/*! \brief \internal
  * A single structure encompassing all the PME data used in CUDA kernels.
  * This inherits from PmeGpuKernelParamsBase and adds a couple cudaTextureObject_t handles,
  * which we would like to avoid in plain C++.
-- 
2.11.4.GIT
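
Note on the idiom this patch relies on: pme-gpu-types-host.h now only forward-declares PmeGpuSpecific, and the general PmeGpu host structure holds it behind the archSpecific pointer, so the full definition in pme-gpu-types-host-impl.h has to be included only by translation units that actually have the backend types (CUDA or OpenCL). Below is a minimal single-file sketch of that pattern; HostData, BackendSpecific, streamHandle and the surrounding code are hypothetical stand-ins, not GROMACS names.

#include <iostream>

// "Shared host header" part: a forward declaration suffices, because the
// struct is only ever held by pointer here (compare PmeGpu::archSpecific).
struct BackendSpecific;

struct HostData
{
    BackendSpecific* archSpecific = nullptr; // opaque pointer member
};

// "Implementation header" part: the full, backend-dependent definition.
// In the real code only GPU-build translation units include this part;
// in this single-file sketch it simply appears further down.
struct BackendSpecific
{
    int  streamHandle; // stand-in for CommandStream / cudaStream_t
    bool useTiming;    // stand-in for a run-start setting
};

int main()
{
    BackendSpecific impl{42, false};
    HostData        host;
    host.archSpecific = &impl; // code that saw only the forward declaration
                               // can still store and pass this pointer around
    std::cout << host.archSpecific->streamHandle << '\n';
    return 0;
}

On non-GPU builds the pointer member stays compilable because the same header typedefs PmeGpuSpecific to int, as the pme-gpu-types-host.h hunk above shows.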