Fix Polly
[polly-mirror.git] / tools / GPURuntime / GPUJIT.c
blob4de9626b135e7c41737158d20fc6e4e558725911
1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
2 /* */
3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM */
4 /* Exceptions. */
5 /* See https://llvm.org/LICENSE.txt for license information. */
6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */
7 /* */
8 /******************************************************************************/
9 /* */
10 /* This file implements GPUJIT, a ptx string execution engine for GPU. */
11 /* */
12 /******************************************************************************/
14 #include "GPUJIT.h"
16 #ifdef HAS_LIBCUDART
17 #include <cuda.h>
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
21 #ifdef HAS_LIBOPENCL
22 #ifdef __APPLE__
23 #include <OpenCL/opencl.h>
24 #else
25 #include <CL/cl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
29 #include <assert.h>
30 #include <dlfcn.h>
31 #include <stdarg.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
37 static int DebugMode;
38 static int CacheMode;
39 #define max(x, y) ((x) > (y) ? (x) : (y))
41 static PollyGPURuntime Runtime = RUNTIME_NONE;
43 static void debug_print(const char *format, ...) {
44 if (!DebugMode)
45 return;
47 va_list args;
48 va_start(args, format);
49 vfprintf(stderr, format, args);
50 va_end(args);
52 #define dump_function() debug_print("-> %s\n", __func__)
54 #define KERNEL_CACHE_SIZE 10
56 static void err_runtime() __attribute__((noreturn));
57 static void err_runtime() {
58 fprintf(stderr, "Runtime not correctly initialized.\n");
59 exit(-1);
62 struct PollyGPUContextT {
63 void *Context;
66 struct PollyGPUFunctionT {
67 void *Kernel;
70 struct PollyGPUDevicePtrT {
71 void *DevicePtr;
74 /******************************************************************************/
75 /* OpenCL */
76 /******************************************************************************/
77 #ifdef HAS_LIBOPENCL
79 struct OpenCLContextT {
80 cl_context Context;
81 cl_command_queue CommandQueue;
84 struct OpenCLKernelT {
85 cl_kernel Kernel;
86 cl_program Program;
87 const char *BinaryString;
90 struct OpenCLDevicePtrT {
91 cl_mem MemObj;
94 /* Dynamic library handles for the OpenCL runtime library. */
95 static void *HandleOpenCL;
96 static void *HandleOpenCLBeignet;
98 /* Type-defines of function pointer to OpenCL Runtime API. */
99 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
100 cl_platform_id *Platforms,
101 cl_uint *NumPlatforms);
102 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
104 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
105 cl_device_type DeviceType,
106 cl_uint NumEntries, cl_device_id *Devices,
107 cl_uint *NumDevices);
108 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
110 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
111 cl_device_info ParamName,
112 size_t ParamValueSize, void *ParamValue,
113 size_t *ParamValueSizeRet);
114 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
116 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
117 size_t ParamValueSize, void *ParamValue,
118 size_t *ParamValueSizeRet);
119 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
121 typedef cl_context clCreateContextFcnTy(
122 const cl_context_properties *Properties, cl_uint NumDevices,
123 const cl_device_id *Devices,
124 void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
125 size_t CB, void *UserData),
126 void *UserData, cl_int *ErrcodeRet);
127 static clCreateContextFcnTy *clCreateContextFcnPtr;
129 typedef cl_command_queue
130 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
131 cl_command_queue_properties Properties,
132 cl_int *ErrcodeRet);
133 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
135 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
136 size_t Size, void *HostPtr,
137 cl_int *ErrcodeRet);
138 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
140 typedef cl_int
141 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
142 cl_bool BlockingWrite, size_t Offset, size_t Size,
143 const void *Ptr, cl_uint NumEventsInWaitList,
144 const cl_event *EventWaitList, cl_event *Event);
145 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
147 typedef cl_program
148 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
149 const cl_device_id *DeviceList,
150 const char *Filename, cl_int *ErrcodeRet);
151 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
153 typedef cl_program clCreateProgramWithBinaryFcnTy(
154 cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
155 const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
156 cl_int *ErrcodeRet);
157 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
159 typedef cl_int clBuildProgramFcnTy(
160 cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
161 const char *Options,
162 void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
163 void *UserData);
164 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
166 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
167 const char *KernelName,
168 cl_int *ErrcodeRet);
169 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
171 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
172 size_t ArgSize, const void *ArgValue);
173 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
175 typedef cl_int clEnqueueNDRangeKernelFcnTy(
176 cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
177 const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
178 const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
179 const cl_event *EventWaitList, cl_event *Event);
180 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
182 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
183 cl_mem Buffer, cl_bool BlockingRead,
184 size_t Offset, size_t Size, void *Ptr,
185 cl_uint NumEventsInWaitList,
186 const cl_event *EventWaitList,
187 cl_event *Event);
188 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
190 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
191 static clFlushFcnTy *clFlushFcnPtr;
193 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
194 static clFinishFcnTy *clFinishFcnPtr;
196 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
197 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
199 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
200 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
202 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
203 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
205 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
206 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
208 typedef cl_int clReleaseContextFcnTy(cl_context Context);
209 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
211 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
212 char *Err;
213 void *FuncPtr;
214 dlerror();
215 FuncPtr = dlsym(Handle, FuncName);
216 if ((Err = dlerror()) != 0) {
217 fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
218 return 0;
220 return FuncPtr;
223 static int initialDeviceAPILibrariesCL() {
224 HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
225 HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
226 if (!HandleOpenCL) {
227 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
228 return 0;
230 return 1;
233 /* Get function pointer to OpenCL Runtime API.
235 * Note that compilers conforming to the ISO C standard are required to
236 * generate a warning if a conversion from a void * pointer to a function
237 * pointer is attempted as in the following statements. The warning
238 * of this kind of cast may not be emitted by clang and new versions of gcc
239 * as it is valid on POSIX 2008. For compilers required to generate a warning,
240 * we temporarily disable -Wpedantic, to avoid bloating the output with
241 * unnecessary warnings.
243 * Reference:
244 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
246 #pragma GCC diagnostic push
247 #pragma GCC diagnostic ignored "-Wpedantic"
248 static int initialDeviceAPIsCL() {
249 if (initialDeviceAPILibrariesCL() == 0)
250 return 0;
252 // FIXME: We are now always selecting the Intel Beignet driver if it is
253 // available on the system, instead of a possible NVIDIA or AMD OpenCL
254 // API. This selection should occurr based on the target architecture
255 // chosen when compiling.
256 void *Handle =
257 (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
259 clGetPlatformIDsFcnPtr =
260 (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
262 clGetDeviceIDsFcnPtr =
263 (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
265 clGetDeviceInfoFcnPtr =
266 (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
268 clGetKernelInfoFcnPtr =
269 (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
271 clCreateContextFcnPtr =
272 (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
274 clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
275 Handle, "clCreateCommandQueue");
277 clCreateBufferFcnPtr =
278 (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
280 clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
281 Handle, "clEnqueueWriteBuffer");
283 if (HandleOpenCLBeignet)
284 clCreateProgramWithLLVMIntelFcnPtr =
285 (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
286 Handle, "clCreateProgramWithLLVMIntel");
288 clCreateProgramWithBinaryFcnPtr =
289 (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
290 Handle, "clCreateProgramWithBinary");
292 clBuildProgramFcnPtr =
293 (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
295 clCreateKernelFcnPtr =
296 (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
298 clSetKernelArgFcnPtr =
299 (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
301 clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
302 Handle, "clEnqueueNDRangeKernel");
304 clEnqueueReadBufferFcnPtr =
305 (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
307 clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
309 clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
311 clReleaseKernelFcnPtr =
312 (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
314 clReleaseProgramFcnPtr =
315 (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
317 clReleaseMemObjectFcnPtr =
318 (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
320 clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
321 Handle, "clReleaseCommandQueue");
323 clReleaseContextFcnPtr =
324 (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
326 return 1;
328 #pragma GCC diagnostic pop
330 /* Context and Device. */
331 static PollyGPUContext *GlobalContext = NULL;
332 static cl_device_id GlobalDeviceID = NULL;
334 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
335 static void printOpenCLError(int Error);
337 static void checkOpenCLError(int Ret, const char *format, ...) {
338 if (Ret == CL_SUCCESS)
339 return;
341 printOpenCLError(Ret);
342 va_list args;
343 va_start(args, format);
344 vfprintf(stderr, format, args);
345 va_end(args);
346 exit(-1);
349 static PollyGPUContext *initContextCL() {
350 dump_function();
352 PollyGPUContext *Context;
354 cl_platform_id PlatformID = NULL;
355 cl_device_id DeviceID = NULL;
356 cl_uint NumDevicesRet;
357 cl_int Ret;
359 char DeviceRevision[256];
360 char DeviceName[256];
361 size_t DeviceRevisionRetSize, DeviceNameRetSize;
363 static __thread PollyGPUContext *CurrentContext = NULL;
365 if (CurrentContext)
366 return CurrentContext;
368 /* Get API handles. */
369 if (initialDeviceAPIsCL() == 0) {
370 fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
371 exit(-1);
374 /* Get number of devices that support OpenCL. */
375 static const int NumberOfPlatforms = 1;
376 Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
377 checkOpenCLError(Ret, "Failed to get platform IDs.\n");
378 // TODO: Extend to CL_DEVICE_TYPE_ALL?
379 static const int NumberOfDevices = 1;
380 Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
381 &DeviceID, &NumDevicesRet);
382 checkOpenCLError(Ret, "Failed to get device IDs.\n");
384 GlobalDeviceID = DeviceID;
385 if (NumDevicesRet == 0) {
386 fprintf(stderr, "There is no device supporting OpenCL.\n");
387 exit(-1);
390 /* Get device revision. */
391 Ret =
392 clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
393 DeviceRevision, &DeviceRevisionRetSize);
394 checkOpenCLError(Ret, "Failed to fetch device revision.\n");
396 /* Get device name. */
397 Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
398 DeviceName, &DeviceNameRetSize);
399 checkOpenCLError(Ret, "Failed to fetch device name.\n");
401 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
403 /* Create context on the device. */
404 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
405 if (Context == 0) {
406 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
407 exit(-1);
409 Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
410 if (Context->Context == 0) {
411 fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
412 exit(-1);
414 ((OpenCLContext *)Context->Context)->Context =
415 clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
416 checkOpenCLError(Ret, "Failed to create context.\n");
418 static const int ExtraProperties = 0;
419 ((OpenCLContext *)Context->Context)->CommandQueue =
420 clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
421 DeviceID, ExtraProperties, &Ret);
422 checkOpenCLError(Ret, "Failed to create command queue.\n");
424 if (CacheMode)
425 CurrentContext = Context;
427 GlobalContext = Context;
428 return Context;
431 static void freeKernelCL(PollyGPUFunction *Kernel) {
432 dump_function();
434 if (CacheMode)
435 return;
437 if (!GlobalContext) {
438 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
439 exit(-1);
442 cl_int Ret;
443 Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
444 checkOpenCLError(Ret, "Failed to flush command queue.\n");
445 Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
446 checkOpenCLError(Ret, "Failed to finish command queue.\n");
448 if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
449 cl_int Ret =
450 clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
451 checkOpenCLError(Ret, "Failed to release kernel.\n");
454 if (((OpenCLKernel *)Kernel->Kernel)->Program) {
455 cl_int Ret =
456 clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
457 checkOpenCLError(Ret, "Failed to release program.\n");
460 if (Kernel->Kernel)
461 free((OpenCLKernel *)Kernel->Kernel);
463 if (Kernel)
464 free(Kernel);
467 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
468 const char *KernelName) {
469 dump_function();
471 if (!GlobalContext) {
472 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
473 exit(-1);
476 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
477 static __thread int NextCacheItem = 0;
479 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
480 // We exploit here the property that all Polly-ACC kernels are allocated
481 // as global constants, hence a pointer comparision is sufficient to
482 // determin equality.
483 if (KernelCache[i] &&
484 ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
485 BinaryBuffer) {
486 debug_print(" -> using cached kernel\n");
487 return KernelCache[i];
491 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
492 if (Function == 0) {
493 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
494 exit(-1);
496 Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
497 if (Function->Kernel == 0) {
498 fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
499 exit(-1);
502 if (!GlobalDeviceID) {
503 fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
504 exit(-1);
507 cl_int Ret;
509 if (HandleOpenCLBeignet) {
510 // This is a workaround, since clCreateProgramWithLLVMIntel only
511 // accepts a filename to a valid llvm-ir file as an argument, instead
512 // of accepting the BinaryBuffer directly.
513 char FileName[] = "/tmp/polly_kernelXXXXXX";
514 int File = mkstemp(FileName);
515 write(File, BinaryBuffer, strlen(BinaryBuffer));
517 ((OpenCLKernel *)Function->Kernel)->Program =
518 clCreateProgramWithLLVMIntelFcnPtr(
519 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
520 &GlobalDeviceID, FileName, &Ret);
521 checkOpenCLError(Ret, "Failed to create program from llvm.\n");
522 close(File);
523 unlink(FileName);
524 } else {
525 size_t BinarySize = strlen(BinaryBuffer);
526 ((OpenCLKernel *)Function->Kernel)->Program =
527 clCreateProgramWithBinaryFcnPtr(
528 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
529 &GlobalDeviceID, (const size_t *)&BinarySize,
530 (const unsigned char **)&BinaryBuffer, NULL, &Ret);
531 checkOpenCLError(Ret, "Failed to create program from binary.\n");
534 Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
535 &GlobalDeviceID, NULL, NULL, NULL);
536 checkOpenCLError(Ret, "Failed to build program.\n");
538 ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
539 ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
540 checkOpenCLError(Ret, "Failed to create kernel.\n");
542 ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
544 if (CacheMode) {
545 if (KernelCache[NextCacheItem])
546 freeKernelCL(KernelCache[NextCacheItem]);
548 KernelCache[NextCacheItem] = Function;
550 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
553 return Function;
556 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
557 long MemSize) {
558 dump_function();
560 if (!GlobalContext) {
561 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
562 exit(-1);
565 cl_int Ret;
566 Ret = clEnqueueWriteBufferFcnPtr(
567 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
568 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
569 HostData, 0, NULL, NULL);
570 checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
573 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
574 long MemSize) {
575 dump_function();
577 if (!GlobalContext) {
578 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
579 exit(-1);
582 cl_int Ret;
583 Ret = clEnqueueReadBufferFcnPtr(
584 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
585 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
586 HostData, 0, NULL, NULL);
587 checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
590 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
591 unsigned int GridDimY, unsigned int BlockDimX,
592 unsigned int BlockDimY, unsigned int BlockDimZ,
593 void **Parameters) {
594 dump_function();
596 cl_int Ret;
597 cl_uint NumArgs;
599 if (!GlobalContext) {
600 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
601 exit(-1);
604 OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
605 Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
606 sizeof(cl_uint), &NumArgs, NULL);
607 checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
609 /* Argument sizes are stored at the end of the Parameters array. */
610 for (cl_uint i = 0; i < NumArgs; i++) {
611 Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
612 *((int *)Parameters[NumArgs + i]),
613 (void *)Parameters[i]);
614 checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
617 unsigned int GridDimZ = 1;
618 size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
619 BlockDimZ * GridDimZ};
620 size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
622 static const int WorkDim = 3;
623 OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
624 Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
625 WorkDim, NULL, GlobalWorkSize,
626 LocalWorkSize, 0, NULL, NULL);
627 checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
630 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
631 dump_function();
633 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
634 cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
635 checkOpenCLError(Ret, "Failed to free device memory.\n");
637 free(DevPtr);
638 free(Allocation);
641 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
642 dump_function();
644 if (!GlobalContext) {
645 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
646 exit(-1);
649 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
650 if (DevData == 0) {
651 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
652 exit(-1);
654 DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
655 if (DevData->DevicePtr == 0) {
656 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
657 exit(-1);
660 cl_int Ret;
661 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
662 clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
663 CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
664 checkOpenCLError(Ret,
665 "Allocate memory for GPU device memory pointer failed.\n");
667 return DevData;
670 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
671 dump_function();
673 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
674 return (void *)DevPtr->MemObj;
677 static void synchronizeDeviceCL() {
678 dump_function();
680 if (!GlobalContext) {
681 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
682 exit(-1);
685 if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
686 CL_SUCCESS) {
687 fprintf(stderr, "Synchronizing device and host memory failed.\n");
688 exit(-1);
692 static void freeContextCL(PollyGPUContext *Context) {
693 dump_function();
695 cl_int Ret;
697 GlobalContext = NULL;
699 OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
700 if (Ctx->CommandQueue) {
701 Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
702 checkOpenCLError(Ret, "Could not release command queue.\n");
705 if (Ctx->Context) {
706 Ret = clReleaseContextFcnPtr(Ctx->Context);
707 checkOpenCLError(Ret, "Could not release context.\n");
710 free(Ctx);
711 free(Context);
714 static void printOpenCLError(int Error) {
716 switch (Error) {
717 case CL_SUCCESS:
718 // Success, don't print an error.
719 break;
721 // JIT/Runtime errors.
722 case CL_DEVICE_NOT_FOUND:
723 fprintf(stderr, "Device not found.\n");
724 break;
725 case CL_DEVICE_NOT_AVAILABLE:
726 fprintf(stderr, "Device not available.\n");
727 break;
728 case CL_COMPILER_NOT_AVAILABLE:
729 fprintf(stderr, "Compiler not available.\n");
730 break;
731 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
732 fprintf(stderr, "Mem object allocation failure.\n");
733 break;
734 case CL_OUT_OF_RESOURCES:
735 fprintf(stderr, "Out of resources.\n");
736 break;
737 case CL_OUT_OF_HOST_MEMORY:
738 fprintf(stderr, "Out of host memory.\n");
739 break;
740 case CL_PROFILING_INFO_NOT_AVAILABLE:
741 fprintf(stderr, "Profiling info not available.\n");
742 break;
743 case CL_MEM_COPY_OVERLAP:
744 fprintf(stderr, "Mem copy overlap.\n");
745 break;
746 case CL_IMAGE_FORMAT_MISMATCH:
747 fprintf(stderr, "Image format mismatch.\n");
748 break;
749 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
750 fprintf(stderr, "Image format not supported.\n");
751 break;
752 case CL_BUILD_PROGRAM_FAILURE:
753 fprintf(stderr, "Build program failure.\n");
754 break;
755 case CL_MAP_FAILURE:
756 fprintf(stderr, "Map failure.\n");
757 break;
758 case CL_MISALIGNED_SUB_BUFFER_OFFSET:
759 fprintf(stderr, "Misaligned sub buffer offset.\n");
760 break;
761 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
762 fprintf(stderr, "Exec status error for events in wait list.\n");
763 break;
764 case CL_COMPILE_PROGRAM_FAILURE:
765 fprintf(stderr, "Compile program failure.\n");
766 break;
767 case CL_LINKER_NOT_AVAILABLE:
768 fprintf(stderr, "Linker not available.\n");
769 break;
770 case CL_LINK_PROGRAM_FAILURE:
771 fprintf(stderr, "Link program failure.\n");
772 break;
773 case CL_DEVICE_PARTITION_FAILED:
774 fprintf(stderr, "Device partition failed.\n");
775 break;
776 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
777 fprintf(stderr, "Kernel arg info not available.\n");
778 break;
780 // Compiler errors.
781 case CL_INVALID_VALUE:
782 fprintf(stderr, "Invalid value.\n");
783 break;
784 case CL_INVALID_DEVICE_TYPE:
785 fprintf(stderr, "Invalid device type.\n");
786 break;
787 case CL_INVALID_PLATFORM:
788 fprintf(stderr, "Invalid platform.\n");
789 break;
790 case CL_INVALID_DEVICE:
791 fprintf(stderr, "Invalid device.\n");
792 break;
793 case CL_INVALID_CONTEXT:
794 fprintf(stderr, "Invalid context.\n");
795 break;
796 case CL_INVALID_QUEUE_PROPERTIES:
797 fprintf(stderr, "Invalid queue properties.\n");
798 break;
799 case CL_INVALID_COMMAND_QUEUE:
800 fprintf(stderr, "Invalid command queue.\n");
801 break;
802 case CL_INVALID_HOST_PTR:
803 fprintf(stderr, "Invalid host pointer.\n");
804 break;
805 case CL_INVALID_MEM_OBJECT:
806 fprintf(stderr, "Invalid memory object.\n");
807 break;
808 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
809 fprintf(stderr, "Invalid image format descriptor.\n");
810 break;
811 case CL_INVALID_IMAGE_SIZE:
812 fprintf(stderr, "Invalid image size.\n");
813 break;
814 case CL_INVALID_SAMPLER:
815 fprintf(stderr, "Invalid sampler.\n");
816 break;
817 case CL_INVALID_BINARY:
818 fprintf(stderr, "Invalid binary.\n");
819 break;
820 case CL_INVALID_BUILD_OPTIONS:
821 fprintf(stderr, "Invalid build options.\n");
822 break;
823 case CL_INVALID_PROGRAM:
824 fprintf(stderr, "Invalid program.\n");
825 break;
826 case CL_INVALID_PROGRAM_EXECUTABLE:
827 fprintf(stderr, "Invalid program executable.\n");
828 break;
829 case CL_INVALID_KERNEL_NAME:
830 fprintf(stderr, "Invalid kernel name.\n");
831 break;
832 case CL_INVALID_KERNEL_DEFINITION:
833 fprintf(stderr, "Invalid kernel definition.\n");
834 break;
835 case CL_INVALID_KERNEL:
836 fprintf(stderr, "Invalid kernel.\n");
837 break;
838 case CL_INVALID_ARG_INDEX:
839 fprintf(stderr, "Invalid arg index.\n");
840 break;
841 case CL_INVALID_ARG_VALUE:
842 fprintf(stderr, "Invalid arg value.\n");
843 break;
844 case CL_INVALID_ARG_SIZE:
845 fprintf(stderr, "Invalid arg size.\n");
846 break;
847 case CL_INVALID_KERNEL_ARGS:
848 fprintf(stderr, "Invalid kernel args.\n");
849 break;
850 case CL_INVALID_WORK_DIMENSION:
851 fprintf(stderr, "Invalid work dimension.\n");
852 break;
853 case CL_INVALID_WORK_GROUP_SIZE:
854 fprintf(stderr, "Invalid work group size.\n");
855 break;
856 case CL_INVALID_WORK_ITEM_SIZE:
857 fprintf(stderr, "Invalid work item size.\n");
858 break;
859 case CL_INVALID_GLOBAL_OFFSET:
860 fprintf(stderr, "Invalid global offset.\n");
861 break;
862 case CL_INVALID_EVENT_WAIT_LIST:
863 fprintf(stderr, "Invalid event wait list.\n");
864 break;
865 case CL_INVALID_EVENT:
866 fprintf(stderr, "Invalid event.\n");
867 break;
868 case CL_INVALID_OPERATION:
869 fprintf(stderr, "Invalid operation.\n");
870 break;
871 case CL_INVALID_GL_OBJECT:
872 fprintf(stderr, "Invalid GL object.\n");
873 break;
874 case CL_INVALID_BUFFER_SIZE:
875 fprintf(stderr, "Invalid buffer size.\n");
876 break;
877 case CL_INVALID_MIP_LEVEL:
878 fprintf(stderr, "Invalid mip level.\n");
879 break;
880 case CL_INVALID_GLOBAL_WORK_SIZE:
881 fprintf(stderr, "Invalid global work size.\n");
882 break;
883 case CL_INVALID_PROPERTY:
884 fprintf(stderr, "Invalid property.\n");
885 break;
886 case CL_INVALID_IMAGE_DESCRIPTOR:
887 fprintf(stderr, "Invalid image descriptor.\n");
888 break;
889 case CL_INVALID_COMPILER_OPTIONS:
890 fprintf(stderr, "Invalid compiler options.\n");
891 break;
892 case CL_INVALID_LINKER_OPTIONS:
893 fprintf(stderr, "Invalid linker options.\n");
894 break;
895 case CL_INVALID_DEVICE_PARTITION_COUNT:
896 fprintf(stderr, "Invalid device partition count.\n");
897 break;
898 case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
899 fprintf(stderr, "Invalid pipe size.\n");
900 break;
901 case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
902 fprintf(stderr, "Invalid device queue.\n");
903 break;
905 // NVIDIA specific error.
906 case -9999:
907 fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
908 break;
910 default:
911 fprintf(stderr, "Unknown error code!\n");
912 break;
916 #endif /* HAS_LIBOPENCL */
917 /******************************************************************************/
918 /* CUDA */
919 /******************************************************************************/
920 #ifdef HAS_LIBCUDART
922 struct CUDAContextT {
923 CUcontext Cuda;
926 struct CUDAKernelT {
927 CUfunction Cuda;
928 CUmodule CudaModule;
929 const char *BinaryString;
932 struct CUDADevicePtrT {
933 CUdeviceptr Cuda;
936 /* Dynamic library handles for the CUDA and CUDA runtime library. */
937 static void *HandleCuda;
938 static void *HandleCudaRT;
940 /* Type-defines of function pointer to CUDA driver APIs. */
941 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
942 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
944 typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
945 unsigned int);
946 static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
948 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
949 CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
950 unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
951 unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
952 void **KernelParams, void **Extra);
953 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
955 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
956 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
958 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
959 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
961 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
962 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
964 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
965 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
967 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
968 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
970 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
971 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
973 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
974 static CuInitFcnTy *CuInitFcnPtr;
976 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
977 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
979 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
980 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
982 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
983 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
985 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
986 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
988 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
989 unsigned int, CUjit_option *,
990 void **);
991 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
993 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
994 const void *Image);
995 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
997 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
998 const char *);
999 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
1001 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
1002 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
1004 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1005 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1007 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1008 CUjitInputType Type, void *Data,
1009 size_t Size, const char *Name,
1010 unsigned int NumOptions,
1011 CUjit_option *Options,
1012 void **OptionValues);
1013 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1015 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1016 CUjit_option *Options,
1017 void **OptionValues,
1018 CUlinkState *StateOut);
1019 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1021 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1022 size_t *SizeOut);
1023 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1025 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1026 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1028 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1029 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1031 /* Type-defines of function pointer ot CUDA runtime APIs. */
1032 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1033 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1035 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1036 char *Err;
1037 void *FuncPtr;
1038 dlerror();
1039 FuncPtr = dlsym(Handle, FuncName);
1040 if ((Err = dlerror()) != 0) {
1041 fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1042 return 0;
1044 return FuncPtr;
1047 static int initialDeviceAPILibrariesCUDA() {
1048 HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1049 if (!HandleCuda) {
1050 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1051 return 0;
1054 HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1055 if (!HandleCudaRT) {
1056 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1057 return 0;
1060 return 1;
1063 /* Get function pointer to CUDA Driver APIs.
1065 * Note that compilers conforming to the ISO C standard are required to
1066 * generate a warning if a conversion from a void * pointer to a function
1067 * pointer is attempted as in the following statements. The warning
1068 * of this kind of cast may not be emitted by clang and new versions of gcc
1069 * as it is valid on POSIX 2008. For compilers required to generate a warning,
1070 * we temporarily disable -Wpedantic, to avoid bloating the output with
1071 * unnecessary warnings.
1073 * Reference:
1074 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1076 #pragma GCC diagnostic push
1077 #pragma GCC diagnostic ignored "-Wpedantic"
1078 static int initialDeviceAPIsCUDA() {
1079 if (initialDeviceAPILibrariesCUDA() == 0)
1080 return 0;
1082 CuLaunchKernelFcnPtr =
1083 (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1085 CuMemAllocFcnPtr =
1086 (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1088 CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
1089 HandleCuda, "cuMemAllocManaged");
1091 CuMemFreeFcnPtr =
1092 (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1094 CuMemcpyDtoHFcnPtr =
1095 (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1097 CuMemcpyHtoDFcnPtr =
1098 (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1100 CuModuleUnloadFcnPtr =
1101 (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1103 CuProfilerStopFcnPtr =
1104 (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1106 CuCtxDestroyFcnPtr =
1107 (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1109 CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1111 CuDeviceGetCountFcnPtr =
1112 (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1114 CuDeviceGetFcnPtr =
1115 (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1117 CuCtxCreateFcnPtr =
1118 (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1120 CuCtxGetCurrentFcnPtr =
1121 (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1123 CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1124 HandleCuda, "cuModuleLoadDataEx");
1126 CuModuleLoadDataFcnPtr =
1127 (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1129 CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1130 HandleCuda, "cuModuleGetFunction");
1132 CuDeviceComputeCapabilityFcnPtr =
1133 (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1134 HandleCuda, "cuDeviceComputeCapability");
1136 CuDeviceGetNameFcnPtr =
1137 (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1139 CuLinkAddDataFcnPtr =
1140 (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1142 CuLinkCreateFcnPtr =
1143 (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1145 CuLinkCompleteFcnPtr =
1146 (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1148 CuLinkDestroyFcnPtr =
1149 (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1151 CuCtxSynchronizeFcnPtr =
1152 (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1154 /* Get function pointer to CUDA Runtime APIs. */
1155 CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1156 HandleCudaRT, "cudaThreadSynchronize");
1158 return 1;
1160 #pragma GCC diagnostic pop
1162 static PollyGPUContext *initContextCUDA() {
1163 dump_function();
1164 PollyGPUContext *Context;
1165 CUdevice Device;
1167 int Major = 0, Minor = 0, DeviceID = 0;
1168 char DeviceName[256];
1169 int DeviceCount = 0;
1171 static __thread PollyGPUContext *CurrentContext = NULL;
1173 if (CurrentContext)
1174 return CurrentContext;
1176 /* Get API handles. */
1177 if (initialDeviceAPIsCUDA() == 0) {
1178 fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1179 exit(-1);
1182 if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1183 fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1184 exit(-1);
1187 /* Get number of devices that supports CUDA. */
1188 CuDeviceGetCountFcnPtr(&DeviceCount);
1189 if (DeviceCount == 0) {
1190 fprintf(stderr, "There is no device supporting CUDA.\n");
1191 exit(-1);
1194 CuDeviceGetFcnPtr(&Device, 0);
1196 /* Get compute capabilities and the device name. */
1197 CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1198 CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1199 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1201 /* Create context on the device. */
1202 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1203 if (Context == 0) {
1204 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1205 exit(-1);
1207 Context->Context = malloc(sizeof(CUDAContext));
1208 if (Context->Context == 0) {
1209 fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1210 exit(-1);
1213 // In cases where managed memory is used, it is quite likely that
1214 // `cudaMallocManaged` / `polly_mallocManaged` was called before
1215 // `polly_initContext` was called.
1217 // If `polly_initContext` calls `CuCtxCreate` when there already was a
1218 // pre-existing context created by the runtime API, this causes code running
1219 // on P100 to hang. So, we query for a pre-existing context to try and use.
1220 // If there is no pre-existing context, we create a new context
1222 // The possible pre-existing context from previous runtime API calls.
1223 CUcontext MaybeRuntimeAPIContext;
1224 if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1225 fprintf(stderr, "cuCtxGetCurrent failed.\n");
1226 exit(-1);
1229 // There was no previous context, initialise it.
1230 if (MaybeRuntimeAPIContext == NULL) {
1231 if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1232 Device) != CUDA_SUCCESS) {
1233 fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1234 exit(-1);
1236 } else {
1237 ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1240 if (CacheMode)
1241 CurrentContext = Context;
1243 return Context;
1246 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1247 dump_function();
1249 if (CacheMode)
1250 return;
1252 if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1253 CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1255 if (Kernel->Kernel)
1256 free((CUDAKernel *)Kernel->Kernel);
1258 if (Kernel)
1259 free(Kernel);
1262 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1263 const char *KernelName) {
1264 dump_function();
1266 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1267 static __thread int NextCacheItem = 0;
1269 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1270 // We exploit here the property that all Polly-ACC kernels are allocated
1271 // as global constants, hence a pointer comparision is sufficient to
1272 // determin equality.
1273 if (KernelCache[i] &&
1274 ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1275 debug_print(" -> using cached kernel\n");
1276 return KernelCache[i];
1280 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1281 if (Function == 0) {
1282 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1283 exit(-1);
1285 Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1286 if (Function->Kernel == 0) {
1287 fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1288 exit(-1);
1291 CUresult Res;
1292 CUlinkState LState;
1293 CUjit_option Options[6];
1294 void *OptionVals[6];
1295 float Walltime = 0;
1296 unsigned long LogSize = 8192;
1297 char ErrorLog[8192], InfoLog[8192];
1298 void *CuOut;
1299 size_t OutSize;
1301 // Setup linker options
1302 // Return walltime from JIT compilation
1303 Options[0] = CU_JIT_WALL_TIME;
1304 OptionVals[0] = (void *)&Walltime;
1305 // Pass a buffer for info messages
1306 Options[1] = CU_JIT_INFO_LOG_BUFFER;
1307 OptionVals[1] = (void *)InfoLog;
1308 // Pass the size of the info buffer
1309 Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1310 OptionVals[2] = (void *)LogSize;
1311 // Pass a buffer for error message
1312 Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1313 OptionVals[3] = (void *)ErrorLog;
1314 // Pass the size of the error buffer
1315 Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1316 OptionVals[4] = (void *)LogSize;
1317 // Make the linker verbose
1318 Options[5] = CU_JIT_LOG_VERBOSE;
1319 OptionVals[5] = (void *)1;
1321 memset(ErrorLog, 0, sizeof(ErrorLog));
1323 CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1324 Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1325 strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1326 if (Res != CUDA_SUCCESS) {
1327 fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1328 exit(-1);
1331 Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1332 if (Res != CUDA_SUCCESS) {
1333 fprintf(stderr, "Complete ptx linker step failed.\n");
1334 fprintf(stderr, "\n%s\n", ErrorLog);
1335 exit(-1);
1338 debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1339 InfoLog);
1341 Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1342 CuOut);
1343 if (Res != CUDA_SUCCESS) {
1344 fprintf(stderr, "Loading ptx assembly text failed.\n");
1345 exit(-1);
1348 Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1349 ((CUDAKernel *)Function->Kernel)->CudaModule,
1350 KernelName);
1351 if (Res != CUDA_SUCCESS) {
1352 fprintf(stderr, "Loading kernel function failed.\n");
1353 exit(-1);
1356 CuLinkDestroyFcnPtr(LState);
1358 ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1360 if (CacheMode) {
1361 if (KernelCache[NextCacheItem])
1362 freeKernelCUDA(KernelCache[NextCacheItem]);
1364 KernelCache[NextCacheItem] = Function;
1366 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1369 return Function;
1372 static void synchronizeDeviceCUDA() {
1373 dump_function();
1374 if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1375 fprintf(stderr, "Synchronizing device and host memory failed.\n");
1376 exit(-1);
1380 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1381 long MemSize) {
1382 dump_function();
1384 CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1385 CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1388 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1389 long MemSize) {
1390 dump_function();
1392 if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1393 MemSize) != CUDA_SUCCESS) {
1394 fprintf(stderr, "Copying results from device to host memory failed.\n");
1395 exit(-1);
1399 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1400 unsigned int GridDimY, unsigned int BlockDimX,
1401 unsigned int BlockDimY, unsigned int BlockDimZ,
1402 void **Parameters) {
1403 dump_function();
1405 unsigned GridDimZ = 1;
1406 unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1407 CUstream Stream = 0;
1408 void **Extra = 0;
1410 CUresult Res;
1411 Res =
1412 CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1413 GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1414 SharedMemBytes, Stream, Parameters, Extra);
1415 if (Res != CUDA_SUCCESS) {
1416 fprintf(stderr, "Launching CUDA kernel failed.\n");
1417 exit(-1);
1421 // Maximum number of managed memory pointers.
1422 #define DEFAULT_MAX_POINTERS 4000
1423 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1424 void **g_managedptrs;
1425 unsigned long long g_nmanagedptrs = 0;
1426 unsigned long long g_maxmanagedptrs = 0;
1428 __attribute__((constructor)) static void initManagedPtrsBuffer() {
1429 g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
1430 const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
1431 if (maxManagedPointersString)
1432 g_maxmanagedptrs = atoll(maxManagedPointersString);
1434 g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
1437 // Add a pointer as being allocated by cuMallocManaged
1438 void addManagedPtr(void *mem) {
1439 assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
1440 assert(g_nmanagedptrs < g_maxmanagedptrs &&
1441 "We have hit the maximum number of "
1442 "managed pointers allowed. Set the "
1443 "POLLY_MAX_MANAGED_POINTERS environment variable. ");
1444 g_managedptrs[g_nmanagedptrs++] = mem;
1447 int isManagedPtr(void *mem) {
1448 for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
1449 if (g_managedptrs[i] == mem)
1450 return 1;
1452 return 0;
1455 void freeManagedCUDA(void *mem) {
1456 dump_function();
1458 // In a real-world program this was used (COSMO), there were more `free`
1459 // calls in the original source than `malloc` calls. Hence, replacing all
1460 // `free`s with `cudaFree` does not work, since we would try to free
1461 // 'illegal' memory.
1462 // As a quick fix, we keep a free list and check if `mem` is a managed memory
1463 // pointer. If it is, we call `cudaFree`.
1464 // If not, we pass it along to the underlying allocator.
1465 // This is a hack, and can be removed if the underlying issue is fixed.
1466 if (isManagedPtr(mem)) {
1467 if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
1468 fprintf(stderr, "cudaFree failed.\n");
1469 exit(-1);
1471 return;
1472 } else {
1473 free(mem);
1477 void *mallocManagedCUDA(size_t size) {
1478 // Note: [Size 0 allocations]
1479 // Sometimes, some runtime computation of size could create a size of 0
1480 // for an allocation. In these cases, we do not wish to fail.
1481 // The CUDA API fails on size 0 allocations.
1482 // So, we allocate size a minimum of size 1.
1483 if (!size && DebugMode)
1484 fprintf(stderr, "cudaMallocManaged called with size 0. "
1485 "Promoting to size 1");
1486 size = max(size, 1);
1487 PollyGPUContext *_ = polly_initContextCUDA();
1488 assert(_ && "polly_initContextCUDA failed");
1490 void *newMemPtr;
1491 const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
1492 CU_MEM_ATTACH_GLOBAL);
1493 if (Res != CUDA_SUCCESS) {
1494 fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1495 exit(-1);
1497 addManagedPtr(newMemPtr);
1498 return newMemPtr;
1501 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1502 dump_function();
1503 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1504 CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1505 free(DevPtr);
1506 free(Allocation);
1509 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1510 if (!MemSize && DebugMode)
1511 fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
1512 "Promoting to size 1");
1513 // see: [Size 0 allocations]
1514 MemSize = max(MemSize, 1);
1515 dump_function();
1517 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1518 if (DevData == 0) {
1519 fprintf(stderr,
1520 "Allocate memory for GPU device memory pointer failed."
1521 " Line: %d | Size: %ld\n",
1522 __LINE__, MemSize);
1523 exit(-1);
1525 DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1526 if (DevData->DevicePtr == 0) {
1527 fprintf(stderr,
1528 "Allocate memory for GPU device memory pointer failed."
1529 " Line: %d | Size: %ld\n",
1530 __LINE__, MemSize);
1531 exit(-1);
1534 CUresult Res =
1535 CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1537 if (Res != CUDA_SUCCESS) {
1538 fprintf(stderr,
1539 "Allocate memory for GPU device memory pointer failed."
1540 " Line: %d | Size: %ld\n",
1541 __LINE__, MemSize);
1542 exit(-1);
1545 return DevData;
1548 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1549 dump_function();
1551 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1552 return (void *)DevPtr->Cuda;
1555 static void freeContextCUDA(PollyGPUContext *Context) {
1556 dump_function();
1558 CUDAContext *Ctx = (CUDAContext *)Context->Context;
1559 if (Ctx->Cuda) {
1560 CuProfilerStopFcnPtr();
1561 CuCtxDestroyFcnPtr(Ctx->Cuda);
1562 free(Ctx);
1563 free(Context);
1566 dlclose(HandleCuda);
1567 dlclose(HandleCudaRT);
1570 #endif /* HAS_LIBCUDART */
1571 /******************************************************************************/
1572 /* API */
1573 /******************************************************************************/
1575 PollyGPUContext *polly_initContext() {
1576 DebugMode = getenv("POLLY_DEBUG") != 0;
1577 CacheMode = getenv("POLLY_NOCACHE") == 0;
1579 dump_function();
1581 PollyGPUContext *Context;
1583 switch (Runtime) {
1584 #ifdef HAS_LIBCUDART
1585 case RUNTIME_CUDA:
1586 Context = initContextCUDA();
1587 break;
1588 #endif /* HAS_LIBCUDART */
1589 #ifdef HAS_LIBOPENCL
1590 case RUNTIME_CL:
1591 Context = initContextCL();
1592 break;
1593 #endif /* HAS_LIBOPENCL */
1594 default:
1595 err_runtime();
1598 return Context;
1601 void polly_freeKernel(PollyGPUFunction *Kernel) {
1602 dump_function();
1604 switch (Runtime) {
1605 #ifdef HAS_LIBCUDART
1606 case RUNTIME_CUDA:
1607 freeKernelCUDA(Kernel);
1608 break;
1609 #endif /* HAS_LIBCUDART */
1610 #ifdef HAS_LIBOPENCL
1611 case RUNTIME_CL:
1612 freeKernelCL(Kernel);
1613 break;
1614 #endif /* HAS_LIBOPENCL */
1615 default:
1616 err_runtime();
1620 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1621 const char *KernelName) {
1622 dump_function();
1624 PollyGPUFunction *Function;
1626 switch (Runtime) {
1627 #ifdef HAS_LIBCUDART
1628 case RUNTIME_CUDA:
1629 Function = getKernelCUDA(BinaryBuffer, KernelName);
1630 break;
1631 #endif /* HAS_LIBCUDART */
1632 #ifdef HAS_LIBOPENCL
1633 case RUNTIME_CL:
1634 Function = getKernelCL(BinaryBuffer, KernelName);
1635 break;
1636 #endif /* HAS_LIBOPENCL */
1637 default:
1638 err_runtime();
1641 return Function;
1644 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1645 long MemSize) {
1646 dump_function();
1648 switch (Runtime) {
1649 #ifdef HAS_LIBCUDART
1650 case RUNTIME_CUDA:
1651 copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1652 break;
1653 #endif /* HAS_LIBCUDART */
1654 #ifdef HAS_LIBOPENCL
1655 case RUNTIME_CL:
1656 copyFromHostToDeviceCL(HostData, DevData, MemSize);
1657 break;
1658 #endif /* HAS_LIBOPENCL */
1659 default:
1660 err_runtime();
1664 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1665 long MemSize) {
1666 dump_function();
1668 switch (Runtime) {
1669 #ifdef HAS_LIBCUDART
1670 case RUNTIME_CUDA:
1671 copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1672 break;
1673 #endif /* HAS_LIBCUDART */
1674 #ifdef HAS_LIBOPENCL
1675 case RUNTIME_CL:
1676 copyFromDeviceToHostCL(DevData, HostData, MemSize);
1677 break;
1678 #endif /* HAS_LIBOPENCL */
1679 default:
1680 err_runtime();
1684 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1685 unsigned int GridDimY, unsigned int BlockDimX,
1686 unsigned int BlockDimY, unsigned int BlockDimZ,
1687 void **Parameters) {
1688 dump_function();
1690 switch (Runtime) {
1691 #ifdef HAS_LIBCUDART
1692 case RUNTIME_CUDA:
1693 launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1694 BlockDimZ, Parameters);
1695 break;
1696 #endif /* HAS_LIBCUDART */
1697 #ifdef HAS_LIBOPENCL
1698 case RUNTIME_CL:
1699 launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1700 Parameters);
1701 break;
1702 #endif /* HAS_LIBOPENCL */
1703 default:
1704 err_runtime();
1708 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1709 dump_function();
1711 switch (Runtime) {
1712 #ifdef HAS_LIBCUDART
1713 case RUNTIME_CUDA:
1714 freeDeviceMemoryCUDA(Allocation);
1715 break;
1716 #endif /* HAS_LIBCUDART */
1717 #ifdef HAS_LIBOPENCL
1718 case RUNTIME_CL:
1719 freeDeviceMemoryCL(Allocation);
1720 break;
1721 #endif /* HAS_LIBOPENCL */
1722 default:
1723 err_runtime();
1727 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1728 dump_function();
1730 PollyGPUDevicePtr *DevData;
1732 switch (Runtime) {
1733 #ifdef HAS_LIBCUDART
1734 case RUNTIME_CUDA:
1735 DevData = allocateMemoryForDeviceCUDA(MemSize);
1736 break;
1737 #endif /* HAS_LIBCUDART */
1738 #ifdef HAS_LIBOPENCL
1739 case RUNTIME_CL:
1740 DevData = allocateMemoryForDeviceCL(MemSize);
1741 break;
1742 #endif /* HAS_LIBOPENCL */
1743 default:
1744 err_runtime();
1747 return DevData;
1750 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1751 dump_function();
1753 void *DevPtr;
1755 switch (Runtime) {
1756 #ifdef HAS_LIBCUDART
1757 case RUNTIME_CUDA:
1758 DevPtr = getDevicePtrCUDA(Allocation);
1759 break;
1760 #endif /* HAS_LIBCUDART */
1761 #ifdef HAS_LIBOPENCL
1762 case RUNTIME_CL:
1763 DevPtr = getDevicePtrCL(Allocation);
1764 break;
1765 #endif /* HAS_LIBOPENCL */
1766 default:
1767 err_runtime();
1770 return DevPtr;
1773 void polly_synchronizeDevice() {
1774 dump_function();
1776 switch (Runtime) {
1777 #ifdef HAS_LIBCUDART
1778 case RUNTIME_CUDA:
1779 synchronizeDeviceCUDA();
1780 break;
1781 #endif /* HAS_LIBCUDART */
1782 #ifdef HAS_LIBOPENCL
1783 case RUNTIME_CL:
1784 synchronizeDeviceCL();
1785 break;
1786 #endif /* HAS_LIBOPENCL */
1787 default:
1788 err_runtime();
1792 void polly_freeContext(PollyGPUContext *Context) {
1793 dump_function();
1795 if (CacheMode)
1796 return;
1798 switch (Runtime) {
1799 #ifdef HAS_LIBCUDART
1800 case RUNTIME_CUDA:
1801 freeContextCUDA(Context);
1802 break;
1803 #endif /* HAS_LIBCUDART */
1804 #ifdef HAS_LIBOPENCL
1805 case RUNTIME_CL:
1806 freeContextCL(Context);
1807 break;
1808 #endif /* HAS_LIBOPENCL */
1809 default:
1810 err_runtime();
1814 void polly_freeManaged(void *mem) {
1815 dump_function();
1817 #ifdef HAS_LIBCUDART
1818 freeManagedCUDA(mem);
1819 #else
1820 fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1821 exit(-1);
1822 #endif
1825 void *polly_mallocManaged(size_t size) {
1826 dump_function();
1828 #ifdef HAS_LIBCUDART
1829 return mallocManagedCUDA(size);
1830 #else
1831 fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1832 exit(-1);
1833 #endif
1836 /* Initialize GPUJIT with CUDA as runtime library. */
1837 PollyGPUContext *polly_initContextCUDA() {
1838 #ifdef HAS_LIBCUDART
1839 Runtime = RUNTIME_CUDA;
1840 return polly_initContext();
1841 #else
1842 fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1843 exit(-1);
1844 #endif /* HAS_LIBCUDART */
1847 /* Initialize GPUJIT with OpenCL as runtime library. */
1848 PollyGPUContext *polly_initContextCL() {
1849 #ifdef HAS_LIBOPENCL
1850 Runtime = RUNTIME_CL;
1851 return polly_initContext();
1852 #else
1853 fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1854 exit(-1);
1855 #endif /* HAS_LIBOPENCL */