Use device information object instead of id when performing device checks
[gromacs.git] / src / gromacs / hardware / device_management_ocl.cpp
blobc46a04ee01816ebf76e5fdc807a0d2310e778282
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
36 /*! \internal \file
37 * \brief Defines the OpenCL implementations of the device management.
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Mark Abraham <mark.j.abraham@gmail.com>
43 * \author Szilárd Páll <pall.szilard@gmail.com>
44 * \author Artem Zhmurov <zhmurov@gmail.com>
46 * \ingroup module_hardware
48 #include "gmxpre.h"
50 #include "config.h"
52 #include "gromacs/gpu_utils/oclraii.h"
53 #include "gromacs/gpu_utils/oclutils.h"
54 #include "gromacs/hardware/device_management.h"
55 #include "gromacs/utility/fatalerror.h"
56 #include "gromacs/utility/smalloc.h"
57 #include "gromacs/utility/stringutil.h"
59 #include "device_information.h"
61 namespace gmx
64 /*! \brief Returns an DeviceVendor value corresponding to the input OpenCL vendor name.
66 * \returns DeviceVendor value for the input vendor name
68 static DeviceVendor getDeviceVendor(const char* vendorName)
70 if (vendorName)
72 if (strstr(vendorName, "NVIDIA"))
74 return DeviceVendor::Nvidia;
76 else if (strstr(vendorName, "AMD") || strstr(vendorName, "Advanced Micro Devices"))
78 return DeviceVendor::Amd;
80 else if (strstr(vendorName, "Intel"))
82 return DeviceVendor::Intel;
85 return DeviceVendor::Unknown;
88 /*! \brief Return true if executing on compatible OS for AMD OpenCL.
90 * This is assumed to be true for OS X version of at least 10.10.4 and
91 * all other OS flavors.
93 * \return true if version is 14.4 or later (= OS X version 10.10.4),
94 * or OS is not Darwin.
96 static bool runningOnCompatibleOSForAmd()
98 #ifdef __APPLE__
99 int mib[2];
100 char kernelVersion[256];
101 size_t len = sizeof(kernelVersion);
103 mib[0] = CTL_KERN;
105 int major = strtod(kernelVersion, NULL);
106 int minor = strtod(strchr(kernelVersion, '.') + 1, NULL);
108 // Kernel 14.4 corresponds to OS X 10.10.4
109 return (major > 14 || (major == 14 && minor >= 4));
110 #else
111 return true;
112 #endif
116 * \brief Checks that device \c deviceInfo is compatible with GROMACS.
118 * Vendor and OpenCL version support checks are executed an the result
119 * of these returned.
121 * \param[in] deviceInfo The device info pointer.
122 * \returns The status enumeration value for the checked device:
124 static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
126 if (getenv("GMX_OCL_DISABLE_COMPATIBILITY_CHECK") != nullptr)
128 // Assume the device is compatible because checking has been disabled.
129 return DeviceStatus::Compatible;
132 // OpenCL device version check, ensure >= REQUIRED_OPENCL_MIN_VERSION
133 constexpr unsigned int minVersionMajor = REQUIRED_OPENCL_MIN_VERSION_MAJOR;
134 constexpr unsigned int minVersionMinor = REQUIRED_OPENCL_MIN_VERSION_MINOR;
136 // Based on the OpenCL spec we're checking the version supported by
137 // the device which has the following format:
138 // OpenCL<space><major_version.minor_version><space><vendor-specific information>
139 unsigned int deviceVersionMinor, deviceVersionMajor;
140 const int valuesScanned = std::sscanf(deviceInfo.device_version, "OpenCL %u.%u",
141 &deviceVersionMajor, &deviceVersionMinor);
142 const bool versionLargeEnough =
143 ((valuesScanned == 2)
144 && ((deviceVersionMajor > minVersionMajor)
145 || (deviceVersionMajor == minVersionMajor && deviceVersionMinor >= minVersionMinor)));
146 if (!versionLargeEnough)
148 return DeviceStatus::Incompatible;
151 /* Only AMD, Intel, and NVIDIA GPUs are supported for now */
152 switch (deviceInfo.deviceVendor)
154 case DeviceVendor::Nvidia: return DeviceStatus::Compatible;
155 case DeviceVendor::Amd:
156 return runningOnCompatibleOSForAmd() ? DeviceStatus::Compatible : DeviceStatus::Incompatible;
157 case DeviceVendor::Intel:
158 return GMX_OPENCL_NB_CLUSTER_SIZE == 4 ? DeviceStatus::Compatible
159 : DeviceStatus::IncompatibleClusterSize;
160 default: return DeviceStatus::Incompatible;
164 /*! \brief Make an error string following an OpenCL API call.
166 * It is meant to be called with \p status != CL_SUCCESS, but it will
167 * work correctly even if it is called with no OpenCL failure.
169 * \todo Make use of this function more.
171 * \param[in] message Supplies context, e.g. the name of the API call that returned the error.
172 * \param[in] status OpenCL API status code
173 * \returns A string describing the OpenCL error.
175 inline std::string makeOpenClInternalErrorString(const char* message, cl_int status)
177 if (message != nullptr)
179 return gmx::formatString("%s did %ssucceed %d: %s", message,
180 ((status != CL_SUCCESS) ? "not " : ""), status,
181 ocl_get_error_string(status).c_str());
183 else
185 return gmx::formatString("%sOpenCL error encountered %d: %s",
186 ((status != CL_SUCCESS) ? "" : "No "), status,
187 ocl_get_error_string(status).c_str());
192 * \brief Checks that device \c deviceInfo is sane (ie can run a kernel).
194 * Compiles and runs a dummy kernel to determine whether the given
195 * OpenCL device functions properly.
198 * \param[in] deviceInfo The device info pointer.
199 * \param[out] errorMessage An error message related to a failing OpenCL API call.
200 * \throws std::bad_alloc When out of memory.
201 * \returns Whether the device passed sanity checks
203 static bool isDeviceFunctional(const DeviceInformation& deviceInfo, std::string* errorMessage)
205 cl_context_properties properties[] = {
206 CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(deviceInfo.oclPlatformId), 0
208 // uncrustify spacing
210 cl_int status;
211 auto deviceId = deviceInfo.oclDeviceId;
212 ClContext context(clCreateContext(properties, 1, &deviceId, nullptr, nullptr, &status));
213 if (status != CL_SUCCESS)
215 errorMessage->assign(makeOpenClInternalErrorString("clCreateContext", status));
216 return false;
218 ClCommandQueue commandQueue(clCreateCommandQueue(context, deviceId, 0, &status));
219 if (status != CL_SUCCESS)
221 errorMessage->assign(makeOpenClInternalErrorString("clCreateCommandQueue", status));
222 return false;
225 // Some compilers such as Apple's require kernel functions to have at least one argument
226 const char* lines[] = { "__kernel void dummyKernel(__global void* input){}" };
227 ClProgram program(clCreateProgramWithSource(context, 1, lines, nullptr, &status));
228 if (status != CL_SUCCESS)
230 errorMessage->assign(makeOpenClInternalErrorString("clCreateProgramWithSource", status));
231 return false;
234 if ((status = clBuildProgram(program, 0, nullptr, nullptr, nullptr, nullptr)) != CL_SUCCESS)
236 errorMessage->assign(makeOpenClInternalErrorString("clBuildProgram", status));
237 return false;
240 ClKernel kernel(clCreateKernel(program, "dummyKernel", &status));
241 if (status != CL_SUCCESS)
243 errorMessage->assign(makeOpenClInternalErrorString("clCreateKernel", status));
244 return false;
247 clSetKernelArg(kernel, 0, sizeof(void*), nullptr);
249 const size_t localWorkSize = 1, globalWorkSize = 1;
250 if ((status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, nullptr, &globalWorkSize,
251 &localWorkSize, 0, nullptr, nullptr))
252 != CL_SUCCESS)
254 errorMessage->assign(makeOpenClInternalErrorString("clEnqueueNDRangeKernel", status));
255 return false;
257 return true;
260 /*! \brief Check whether the \c ocl_gpu_device is suitable for use by mdrun
262 * Runs sanity checks: checking that the runtime can compile a dummy kernel
263 * and this can be executed;
264 * Runs compatibility checks verifying the device OpenCL version requirement
265 * and vendor/OS support.
267 * \param[in] deviceId The runtime-reported numeric ID of the device.
268 * \param[in] deviceInfo The device info pointer.
269 * \returns A DeviceStatus to indicate if the GPU device is supported and if it was able to run
270 * basic functionality checks.
272 static DeviceStatus checkGpu(size_t deviceId, const DeviceInformation& deviceInfo)
275 DeviceStatus supportStatus = isDeviceFunctional(deviceInfo);
276 if (supportStatus != DeviceStatus::Compatible)
278 return supportStatus;
281 std::string errorMessage;
282 if (!isDeviceFunctional(deviceInfo, &errorMessage))
284 gmx_warning("While sanity checking device #%zu, %s", deviceId, errorMessage.c_str());
285 return DeviceStatus::NonFunctional;
288 return DeviceStatus::Compatible;
291 } // namespace gmx
293 bool isDeviceDetectionFunctional(std::string* errorMessage)
295 cl_uint numPlatforms;
296 cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms);
297 GMX_ASSERT(status != CL_INVALID_VALUE, "Incorrect call of clGetPlatformIDs detected");
298 #ifdef cl_khr_icd
299 if (status == CL_PLATFORM_NOT_FOUND_KHR)
301 // No valid ICDs found
302 if (errorMessage != nullptr)
304 errorMessage->assign("No valid OpenCL driver found");
306 return false;
308 #endif
309 GMX_RELEASE_ASSERT(
310 status == CL_SUCCESS,
311 gmx::formatString("An unexpected value was returned from clGetPlatformIDs %d: %s",
312 status, ocl_get_error_string(status).c_str())
313 .c_str());
314 bool foundPlatform = (numPlatforms > 0);
315 if (!foundPlatform && errorMessage != nullptr)
317 errorMessage->assign("No OpenCL platforms found even though the driver was valid");
319 return foundPlatform;
322 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
324 cl_uint ocl_platform_count;
325 cl_platform_id* ocl_platform_ids;
326 cl_device_type req_dev_type = CL_DEVICE_TYPE_GPU;
328 ocl_platform_ids = nullptr;
330 if (getenv("GMX_OCL_FORCE_CPU") != nullptr)
332 req_dev_type = CL_DEVICE_TYPE_CPU;
335 int numDevices = 0;
336 std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(0);
338 while (true)
340 cl_int status = clGetPlatformIDs(0, nullptr, &ocl_platform_count);
341 if (CL_SUCCESS != status)
343 GMX_THROW(gmx::InternalError(
344 gmx::formatString("An unexpected value %d was returned from clGetPlatformIDs: ", status)
345 + ocl_get_error_string(status)));
348 if (1 > ocl_platform_count)
350 // TODO this should have a descriptive error message that we only support one OpenCL platform
351 break;
354 snew(ocl_platform_ids, ocl_platform_count);
356 status = clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, nullptr);
357 if (CL_SUCCESS != status)
359 GMX_THROW(gmx::InternalError(
360 gmx::formatString("An unexpected value %d was returned from clGetPlatformIDs: ", status)
361 + ocl_get_error_string(status)));
364 for (unsigned int i = 0; i < ocl_platform_count; i++)
366 cl_uint ocl_device_count;
368 /* If requesting req_dev_type devices fails, just go to the next platform */
369 if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, 0, nullptr, &ocl_device_count))
371 continue;
374 if (1 <= ocl_device_count)
376 numDevices += ocl_device_count;
380 if (1 > numDevices)
382 break;
385 deviceInfoList.resize(numDevices);
388 int device_index;
389 cl_device_id* ocl_device_ids;
391 snew(ocl_device_ids, numDevices);
392 device_index = 0;
394 for (unsigned int i = 0; i < ocl_platform_count; i++)
396 cl_uint ocl_device_count;
398 /* If requesting req_dev_type devices fails, just go to the next platform */
399 if (CL_SUCCESS
400 != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, numDevices, ocl_device_ids,
401 &ocl_device_count))
403 continue;
406 if (1 > ocl_device_count)
408 break;
411 for (unsigned int j = 0; j < ocl_device_count; j++)
413 deviceInfoList[device_index] = std::make_unique<DeviceInformation>();
415 deviceInfoList[device_index]->id = device_index;
417 deviceInfoList[device_index]->oclPlatformId = ocl_platform_ids[i];
418 deviceInfoList[device_index]->oclDeviceId = ocl_device_ids[j];
420 deviceInfoList[device_index]->device_name[0] = 0;
421 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_NAME,
422 sizeof(deviceInfoList[device_index]->device_name),
423 deviceInfoList[device_index]->device_name, nullptr);
425 deviceInfoList[device_index]->device_version[0] = 0;
426 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VERSION,
427 sizeof(deviceInfoList[device_index]->device_version),
428 deviceInfoList[device_index]->device_version, nullptr);
430 deviceInfoList[device_index]->vendorName[0] = 0;
431 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VENDOR,
432 sizeof(deviceInfoList[device_index]->vendorName),
433 deviceInfoList[device_index]->vendorName, nullptr);
435 deviceInfoList[device_index]->compute_units = 0;
436 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS,
437 sizeof(deviceInfoList[device_index]->compute_units),
438 &(deviceInfoList[device_index]->compute_units), nullptr);
440 deviceInfoList[device_index]->adress_bits = 0;
441 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_ADDRESS_BITS,
442 sizeof(deviceInfoList[device_index]->adress_bits),
443 &(deviceInfoList[device_index]->adress_bits), nullptr);
445 deviceInfoList[device_index]->deviceVendor =
446 gmx::getDeviceVendor(deviceInfoList[device_index]->vendorName);
448 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, 3 * sizeof(size_t),
449 &deviceInfoList[device_index]->maxWorkItemSizes, nullptr);
451 clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
452 &deviceInfoList[device_index]->maxWorkGroupSize, nullptr);
454 deviceInfoList[device_index]->status =
455 gmx::checkGpu(device_index, *deviceInfoList[device_index]);
457 device_index++;
461 numDevices = device_index;
463 /* Dummy sort of devices - AMD first, then NVIDIA, then Intel */
464 // TODO: Sort devices based on performance.
465 if (0 < numDevices)
467 int last = -1;
468 for (int i = 0; i < numDevices; i++)
470 if (deviceInfoList[i]->deviceVendor == DeviceVendor::Amd)
472 last++;
474 if (last < i)
476 std::swap(deviceInfoList[i], deviceInfoList[last]);
481 /* if more than 1 device left to be sorted */
482 if ((numDevices - 1 - last) > 1)
484 for (int i = 0; i < numDevices; i++)
486 if (deviceInfoList[i]->deviceVendor == DeviceVendor::Nvidia)
488 last++;
490 if (last < i)
492 std::swap(deviceInfoList[i], deviceInfoList[last]);
499 sfree(ocl_device_ids);
502 break;
505 sfree(ocl_platform_ids);
506 return deviceInfoList;
509 void setActiveDevice(const DeviceInformation& deviceInfo)
511 // If the device is NVIDIA, for safety reasons we disable the JIT
512 // caching as this is known to be broken at least until driver 364.19;
513 // the cache does not always get regenerated when the source code changes,
514 // e.g. if the path to the kernel sources remains the same
516 if (deviceInfo.deviceVendor == DeviceVendor::Nvidia)
518 // Ignore return values, failing to set the variable does not mean
519 // that something will go wrong later.
520 #ifdef _MSC_VER
521 _putenv("CUDA_CACHE_DISABLE=1");
522 #else
523 // Don't override, maybe a dev is testing.
524 setenv("CUDA_CACHE_DISABLE", "1", 0);
525 #endif
529 void releaseDevice(DeviceInformation* /* deviceInfo */) {}
531 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
533 bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
534 && deviceInfo.status != DeviceStatus::NonFunctional);
536 if (!gpuExists)
538 return gmx::formatString("#%d: %s, status: %s", deviceInfo.id, "N/A",
539 c_deviceStateString[deviceInfo.status]);
541 else
543 return gmx::formatString("#%d: name: %s, vendor: %s, device version: %s, status: %s",
544 deviceInfo.id, deviceInfo.device_name, deviceInfo.vendorName,
545 deviceInfo.device_version, c_deviceStateString[deviceInfo.status]);