From cf3f6da4c89a726fd46d0066c168e118de3453e7 Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Sun, 20 Sep 2020 09:39:41 +0000 Subject: [PATCH] Add NVIDIA GPU compatibility check, following issue #3125 The incompatibility is documented, but better to verify. --- src/gromacs/hardware/device_information.h | 11 +++++---- src/gromacs/hardware/device_management_ocl.cpp | 33 +++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/gromacs/hardware/device_information.h b/src/gromacs/hardware/device_information.h index b38cccb703..e3005e1d98 100644 --- a/src/gromacs/hardware/device_information.h +++ b/src/gromacs/hardware/device_information.h @@ -71,16 +71,18 @@ enum class DeviceStatus : int Incompatible = 2, //! OpenCL device has incompatible cluster size for non-bonded kernels. IncompatibleClusterSize = 3, + //! There are known issues with NVIDIA Volta and newer. + IncompatibleNvidiaVolta = 4, /*! \brief An error occurred during the functionality checks. * That indicates malfunctioning of the device, driver, or incompatible driver/runtime. */ - NonFunctional = 4, + NonFunctional = 5, /*! \brief CUDA devices are busy or unavailable. * typically due to use of \p cudaComputeModeExclusive, \p cudaComputeModeProhibited modes. */ - Unavailable = 5, + Unavailable = 6, //! Enumeration size - Count = 6 + Count = 7 }; /*! \brief Names of the GPU detection/check results @@ -98,7 +100,8 @@ static const gmx::EnumerationArray c_deviceStateStrin // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) "incompatible (please recompile with correct GMX" "_OPENCL_NB_CLUSTER_SIZE of 4)", // clang-format on - "non-functional", "unavailable" + "incompatible (please use CUDA build for NVIDIA Volta GPUs or newer)", "non-functional", + "unavailable" }; //! 
Device vendors diff --git a/src/gromacs/hardware/device_management_ocl.cpp b/src/gromacs/hardware/device_management_ocl.cpp index c46a04ee01..5380383fcf 100644 --- a/src/gromacs/hardware/device_management_ocl.cpp +++ b/src/gromacs/hardware/device_management_ocl.cpp @@ -112,6 +112,35 @@ static bool runningOnCompatibleOSForAmd() #endif } +/*! \brief Return true if executing on compatible GPU for NVIDIA OpenCL. + * + * There are known issues with OpenCL when running on NVIDIA Volta or newer (CC 7+). + * As a workaround, we recommend using CUDA on such hardware. + * + * This function relies on cl_nv_device_attribute_query. In case it's not functioning properly, + * we trust the user and mark the device as compatible. + * \param[in] deviceInfo Device whose \c oclDeviceId is queried for its major compute capability. + * \return true if running on Pascal (CC 6.x) or older, or if we cannot determine the device generation. + */ +static bool runningOnCompatibleHWForNvidia(const DeviceInformation& deviceInfo) +{ + // The macro is defined in Intel's and AMD's headers, but it's not strictly required to be there. +#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + return true; +#else + static const unsigned int ccMajorBad = 7; // Lowest incompatible major CC (Volta and Turing); anything at or above it is rejected + unsigned int ccMajor; // NOTE(review): the NVIDIA extension presumably reports this as cl_uint - confirm the types match + cl_device_id devId = deviceInfo.oclDeviceId; + const cl_int err = clGetDeviceInfo(devId, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(ccMajor), &ccMajor, nullptr); + if (err != CL_SUCCESS) + { + return true; // Err on the side of trusting the user to know what they are doing. + } + return ccMajor < ccMajorBad; +#endif +} + /*! * \brief Checks that device \c deviceInfo is compatible with GROMACS. * @@ -151,7 +180,9 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo) /* Only AMD, Intel, and NVIDIA GPUs are supported for now */ switch (deviceInfo.deviceVendor) { - case DeviceVendor::Nvidia: return DeviceStatus::Compatible; + case DeviceVendor::Nvidia: + return runningOnCompatibleHWForNvidia(deviceInfo) ? 
DeviceStatus::Compatible + : DeviceStatus::IncompatibleNvidiaVolta; case DeviceVendor::Amd: return runningOnCompatibleOSForAmd() ? DeviceStatus::Compatible : DeviceStatus::Incompatible; case DeviceVendor::Intel: -- 2.11.4.GIT