1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM */
5 /* See https://llvm.org/LICENSE.txt for license information. */
6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */
8 /******************************************************************************/
10 /* This file implements GPUJIT, a ptx string execution engine for GPU. */
12 /******************************************************************************/
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
23 #include <OpenCL/opencl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
39 #define max(x, y) ((x) > (y) ? (x) : (y))
41 static PollyGPURuntime Runtime
= RUNTIME_NONE
;
43 static void debug_print(const char *format
, ...) {
48 va_start(args
, format
);
49 vfprintf(stderr
, format
, args
);
52 #define dump_function() debug_print("-> %s\n", __func__)
54 #define KERNEL_CACHE_SIZE 10
56 static void err_runtime() __attribute__((noreturn
));
57 static void err_runtime() {
58 fprintf(stderr
, "Runtime not correctly initialized.\n");
62 struct PollyGPUContextT
{
66 struct PollyGPUFunctionT
{
70 struct PollyGPUDevicePtrT
{
74 /******************************************************************************/
76 /******************************************************************************/
79 struct OpenCLContextT
{
81 cl_command_queue CommandQueue
;
84 struct OpenCLKernelT
{
87 const char *BinaryString
;
90 struct OpenCLDevicePtrT
{
94 /* Dynamic library handles for the OpenCL runtime library. */
95 static void *HandleOpenCL
;
96 static void *HandleOpenCLBeignet
;
98 /* Type-defines of function pointer to OpenCL Runtime API. */
99 typedef cl_int
clGetPlatformIDsFcnTy(cl_uint NumEntries
,
100 cl_platform_id
*Platforms
,
101 cl_uint
*NumPlatforms
);
102 static clGetPlatformIDsFcnTy
*clGetPlatformIDsFcnPtr
;
104 typedef cl_int
clGetDeviceIDsFcnTy(cl_platform_id Platform
,
105 cl_device_type DeviceType
,
106 cl_uint NumEntries
, cl_device_id
*Devices
,
107 cl_uint
*NumDevices
);
108 static clGetDeviceIDsFcnTy
*clGetDeviceIDsFcnPtr
;
110 typedef cl_int
clGetDeviceInfoFcnTy(cl_device_id Device
,
111 cl_device_info ParamName
,
112 size_t ParamValueSize
, void *ParamValue
,
113 size_t *ParamValueSizeRet
);
114 static clGetDeviceInfoFcnTy
*clGetDeviceInfoFcnPtr
;
116 typedef cl_int
clGetKernelInfoFcnTy(cl_kernel Kernel
, cl_kernel_info ParamName
,
117 size_t ParamValueSize
, void *ParamValue
,
118 size_t *ParamValueSizeRet
);
119 static clGetKernelInfoFcnTy
*clGetKernelInfoFcnPtr
;
121 typedef cl_context
clCreateContextFcnTy(
122 const cl_context_properties
*Properties
, cl_uint NumDevices
,
123 const cl_device_id
*Devices
,
124 void CL_CALLBACK
*pfn_notify(const char *Errinfo
, const void *PrivateInfo
,
125 size_t CB
, void *UserData
),
126 void *UserData
, cl_int
*ErrcodeRet
);
127 static clCreateContextFcnTy
*clCreateContextFcnPtr
;
129 typedef cl_command_queue
130 clCreateCommandQueueFcnTy(cl_context Context
, cl_device_id Device
,
131 cl_command_queue_properties Properties
,
133 static clCreateCommandQueueFcnTy
*clCreateCommandQueueFcnPtr
;
135 typedef cl_mem
clCreateBufferFcnTy(cl_context Context
, cl_mem_flags Flags
,
136 size_t Size
, void *HostPtr
,
138 static clCreateBufferFcnTy
*clCreateBufferFcnPtr
;
141 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue
, cl_mem Buffer
,
142 cl_bool BlockingWrite
, size_t Offset
, size_t Size
,
143 const void *Ptr
, cl_uint NumEventsInWaitList
,
144 const cl_event
*EventWaitList
, cl_event
*Event
);
145 static clEnqueueWriteBufferFcnTy
*clEnqueueWriteBufferFcnPtr
;
148 clCreateProgramWithLLVMIntelFcnTy(cl_context Context
, cl_uint NumDevices
,
149 const cl_device_id
*DeviceList
,
150 const char *Filename
, cl_int
*ErrcodeRet
);
151 static clCreateProgramWithLLVMIntelFcnTy
*clCreateProgramWithLLVMIntelFcnPtr
;
153 typedef cl_program
clCreateProgramWithBinaryFcnTy(
154 cl_context Context
, cl_uint NumDevices
, const cl_device_id
*DeviceList
,
155 const size_t *Lengths
, const unsigned char **Binaries
, cl_int
*BinaryStatus
,
157 static clCreateProgramWithBinaryFcnTy
*clCreateProgramWithBinaryFcnPtr
;
159 typedef cl_int
clBuildProgramFcnTy(
160 cl_program Program
, cl_uint NumDevices
, const cl_device_id
*DeviceList
,
162 void(CL_CALLBACK
*pfn_notify
)(cl_program Program
, void *UserData
),
164 static clBuildProgramFcnTy
*clBuildProgramFcnPtr
;
166 typedef cl_kernel
clCreateKernelFcnTy(cl_program Program
,
167 const char *KernelName
,
169 static clCreateKernelFcnTy
*clCreateKernelFcnPtr
;
171 typedef cl_int
clSetKernelArgFcnTy(cl_kernel Kernel
, cl_uint ArgIndex
,
172 size_t ArgSize
, const void *ArgValue
);
173 static clSetKernelArgFcnTy
*clSetKernelArgFcnPtr
;
175 typedef cl_int
clEnqueueNDRangeKernelFcnTy(
176 cl_command_queue CommandQueue
, cl_kernel Kernel
, cl_uint WorkDim
,
177 const size_t *GlobalWorkOffset
, const size_t *GlobalWorkSize
,
178 const size_t *LocalWorkSize
, cl_uint NumEventsInWaitList
,
179 const cl_event
*EventWaitList
, cl_event
*Event
);
180 static clEnqueueNDRangeKernelFcnTy
*clEnqueueNDRangeKernelFcnPtr
;
182 typedef cl_int
clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue
,
183 cl_mem Buffer
, cl_bool BlockingRead
,
184 size_t Offset
, size_t Size
, void *Ptr
,
185 cl_uint NumEventsInWaitList
,
186 const cl_event
*EventWaitList
,
188 static clEnqueueReadBufferFcnTy
*clEnqueueReadBufferFcnPtr
;
190 typedef cl_int
clFlushFcnTy(cl_command_queue CommandQueue
);
191 static clFlushFcnTy
*clFlushFcnPtr
;
193 typedef cl_int
clFinishFcnTy(cl_command_queue CommandQueue
);
194 static clFinishFcnTy
*clFinishFcnPtr
;
196 typedef cl_int
clReleaseKernelFcnTy(cl_kernel Kernel
);
197 static clReleaseKernelFcnTy
*clReleaseKernelFcnPtr
;
199 typedef cl_int
clReleaseProgramFcnTy(cl_program Program
);
200 static clReleaseProgramFcnTy
*clReleaseProgramFcnPtr
;
202 typedef cl_int
clReleaseMemObjectFcnTy(cl_mem Memobject
);
203 static clReleaseMemObjectFcnTy
*clReleaseMemObjectFcnPtr
;
205 typedef cl_int
clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue
);
206 static clReleaseCommandQueueFcnTy
*clReleaseCommandQueueFcnPtr
;
208 typedef cl_int
clReleaseContextFcnTy(cl_context Context
);
209 static clReleaseContextFcnTy
*clReleaseContextFcnPtr
;
211 static void *getAPIHandleCL(void *Handle
, const char *FuncName
) {
215 FuncPtr
= dlsym(Handle
, FuncName
);
216 if ((Err
= dlerror()) != 0) {
217 fprintf(stderr
, "Load OpenCL Runtime API failed: %s. \n", Err
);
223 static int initialDeviceAPILibrariesCL() {
224 HandleOpenCLBeignet
= dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY
);
225 HandleOpenCL
= dlopen("libOpenCL.so", RTLD_LAZY
);
227 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
233 /* Get function pointer to OpenCL Runtime API.
235 * Note that compilers conforming to the ISO C standard are required to
236 * generate a warning if a conversion from a void * pointer to a function
237 * pointer is attempted as in the following statements. The warning
238 * of this kind of cast may not be emitted by clang and new versions of gcc
239 * as it is valid on POSIX 2008. For compilers required to generate a warning,
240 * we temporarily disable -Wpedantic, to avoid bloating the output with
241 * unnecessary warnings.
244 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
246 #pragma GCC diagnostic push
247 #pragma GCC diagnostic ignored "-Wpedantic"
248 static int initialDeviceAPIsCL() {
249 if (initialDeviceAPILibrariesCL() == 0)
252 // FIXME: We are now always selecting the Intel Beignet driver if it is
253 // available on the system, instead of a possible NVIDIA or AMD OpenCL
254 // API. This selection should occurr based on the target architecture
255 // chosen when compiling.
257 (HandleOpenCLBeignet
!= NULL
? HandleOpenCLBeignet
: HandleOpenCL
);
259 clGetPlatformIDsFcnPtr
=
260 (clGetPlatformIDsFcnTy
*)getAPIHandleCL(Handle
, "clGetPlatformIDs");
262 clGetDeviceIDsFcnPtr
=
263 (clGetDeviceIDsFcnTy
*)getAPIHandleCL(Handle
, "clGetDeviceIDs");
265 clGetDeviceInfoFcnPtr
=
266 (clGetDeviceInfoFcnTy
*)getAPIHandleCL(Handle
, "clGetDeviceInfo");
268 clGetKernelInfoFcnPtr
=
269 (clGetKernelInfoFcnTy
*)getAPIHandleCL(Handle
, "clGetKernelInfo");
271 clCreateContextFcnPtr
=
272 (clCreateContextFcnTy
*)getAPIHandleCL(Handle
, "clCreateContext");
274 clCreateCommandQueueFcnPtr
= (clCreateCommandQueueFcnTy
*)getAPIHandleCL(
275 Handle
, "clCreateCommandQueue");
277 clCreateBufferFcnPtr
=
278 (clCreateBufferFcnTy
*)getAPIHandleCL(Handle
, "clCreateBuffer");
280 clEnqueueWriteBufferFcnPtr
= (clEnqueueWriteBufferFcnTy
*)getAPIHandleCL(
281 Handle
, "clEnqueueWriteBuffer");
283 if (HandleOpenCLBeignet
)
284 clCreateProgramWithLLVMIntelFcnPtr
=
285 (clCreateProgramWithLLVMIntelFcnTy
*)getAPIHandleCL(
286 Handle
, "clCreateProgramWithLLVMIntel");
288 clCreateProgramWithBinaryFcnPtr
=
289 (clCreateProgramWithBinaryFcnTy
*)getAPIHandleCL(
290 Handle
, "clCreateProgramWithBinary");
292 clBuildProgramFcnPtr
=
293 (clBuildProgramFcnTy
*)getAPIHandleCL(Handle
, "clBuildProgram");
295 clCreateKernelFcnPtr
=
296 (clCreateKernelFcnTy
*)getAPIHandleCL(Handle
, "clCreateKernel");
298 clSetKernelArgFcnPtr
=
299 (clSetKernelArgFcnTy
*)getAPIHandleCL(Handle
, "clSetKernelArg");
301 clEnqueueNDRangeKernelFcnPtr
= (clEnqueueNDRangeKernelFcnTy
*)getAPIHandleCL(
302 Handle
, "clEnqueueNDRangeKernel");
304 clEnqueueReadBufferFcnPtr
=
305 (clEnqueueReadBufferFcnTy
*)getAPIHandleCL(Handle
, "clEnqueueReadBuffer");
307 clFlushFcnPtr
= (clFlushFcnTy
*)getAPIHandleCL(Handle
, "clFlush");
309 clFinishFcnPtr
= (clFinishFcnTy
*)getAPIHandleCL(Handle
, "clFinish");
311 clReleaseKernelFcnPtr
=
312 (clReleaseKernelFcnTy
*)getAPIHandleCL(Handle
, "clReleaseKernel");
314 clReleaseProgramFcnPtr
=
315 (clReleaseProgramFcnTy
*)getAPIHandleCL(Handle
, "clReleaseProgram");
317 clReleaseMemObjectFcnPtr
=
318 (clReleaseMemObjectFcnTy
*)getAPIHandleCL(Handle
, "clReleaseMemObject");
320 clReleaseCommandQueueFcnPtr
= (clReleaseCommandQueueFcnTy
*)getAPIHandleCL(
321 Handle
, "clReleaseCommandQueue");
323 clReleaseContextFcnPtr
=
324 (clReleaseContextFcnTy
*)getAPIHandleCL(Handle
, "clReleaseContext");
328 #pragma GCC diagnostic pop
330 /* Context and Device. */
331 static PollyGPUContext
*GlobalContext
= NULL
;
332 static cl_device_id GlobalDeviceID
= NULL
;
334 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
335 static void printOpenCLError(int Error
);
337 static void checkOpenCLError(int Ret
, const char *format
, ...) {
338 if (Ret
== CL_SUCCESS
)
341 printOpenCLError(Ret
);
343 va_start(args
, format
);
344 vfprintf(stderr
, format
, args
);
349 static PollyGPUContext
*initContextCL() {
352 PollyGPUContext
*Context
;
354 cl_platform_id PlatformID
= NULL
;
355 cl_device_id DeviceID
= NULL
;
356 cl_uint NumDevicesRet
;
359 char DeviceRevision
[256];
360 char DeviceName
[256];
361 size_t DeviceRevisionRetSize
, DeviceNameRetSize
;
363 static __thread PollyGPUContext
*CurrentContext
= NULL
;
366 return CurrentContext
;
368 /* Get API handles. */
369 if (initialDeviceAPIsCL() == 0) {
370 fprintf(stderr
, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
374 /* Get number of devices that support OpenCL. */
375 static const int NumberOfPlatforms
= 1;
376 Ret
= clGetPlatformIDsFcnPtr(NumberOfPlatforms
, &PlatformID
, NULL
);
377 checkOpenCLError(Ret
, "Failed to get platform IDs.\n");
378 // TODO: Extend to CL_DEVICE_TYPE_ALL?
379 static const int NumberOfDevices
= 1;
380 Ret
= clGetDeviceIDsFcnPtr(PlatformID
, CL_DEVICE_TYPE_GPU
, NumberOfDevices
,
381 &DeviceID
, &NumDevicesRet
);
382 checkOpenCLError(Ret
, "Failed to get device IDs.\n");
384 GlobalDeviceID
= DeviceID
;
385 if (NumDevicesRet
== 0) {
386 fprintf(stderr
, "There is no device supporting OpenCL.\n");
390 /* Get device revision. */
392 clGetDeviceInfoFcnPtr(DeviceID
, CL_DEVICE_VERSION
, sizeof(DeviceRevision
),
393 DeviceRevision
, &DeviceRevisionRetSize
);
394 checkOpenCLError(Ret
, "Failed to fetch device revision.\n");
396 /* Get device name. */
397 Ret
= clGetDeviceInfoFcnPtr(DeviceID
, CL_DEVICE_NAME
, sizeof(DeviceName
),
398 DeviceName
, &DeviceNameRetSize
);
399 checkOpenCLError(Ret
, "Failed to fetch device name.\n");
401 debug_print("> Running on GPU device %d : %s.\n", DeviceID
, DeviceName
);
403 /* Create context on the device. */
404 Context
= (PollyGPUContext
*)malloc(sizeof(PollyGPUContext
));
406 fprintf(stderr
, "Allocate memory for Polly GPU context failed.\n");
409 Context
->Context
= (OpenCLContext
*)malloc(sizeof(OpenCLContext
));
410 if (Context
->Context
== 0) {
411 fprintf(stderr
, "Allocate memory for Polly OpenCL context failed.\n");
414 ((OpenCLContext
*)Context
->Context
)->Context
=
415 clCreateContextFcnPtr(NULL
, NumDevicesRet
, &DeviceID
, NULL
, NULL
, &Ret
);
416 checkOpenCLError(Ret
, "Failed to create context.\n");
418 static const int ExtraProperties
= 0;
419 ((OpenCLContext
*)Context
->Context
)->CommandQueue
=
420 clCreateCommandQueueFcnPtr(((OpenCLContext
*)Context
->Context
)->Context
,
421 DeviceID
, ExtraProperties
, &Ret
);
422 checkOpenCLError(Ret
, "Failed to create command queue.\n");
425 CurrentContext
= Context
;
427 GlobalContext
= Context
;
431 static void freeKernelCL(PollyGPUFunction
*Kernel
) {
437 if (!GlobalContext
) {
438 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
443 Ret
= clFlushFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
);
444 checkOpenCLError(Ret
, "Failed to flush command queue.\n");
445 Ret
= clFinishFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
);
446 checkOpenCLError(Ret
, "Failed to finish command queue.\n");
448 if (((OpenCLKernel
*)Kernel
->Kernel
)->Kernel
) {
450 clReleaseKernelFcnPtr(((OpenCLKernel
*)Kernel
->Kernel
)->Kernel
);
451 checkOpenCLError(Ret
, "Failed to release kernel.\n");
454 if (((OpenCLKernel
*)Kernel
->Kernel
)->Program
) {
456 clReleaseProgramFcnPtr(((OpenCLKernel
*)Kernel
->Kernel
)->Program
);
457 checkOpenCLError(Ret
, "Failed to release program.\n");
461 free((OpenCLKernel
*)Kernel
->Kernel
);
467 static PollyGPUFunction
*getKernelCL(const char *BinaryBuffer
,
468 const char *KernelName
) {
471 if (!GlobalContext
) {
472 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
476 static __thread PollyGPUFunction
*KernelCache
[KERNEL_CACHE_SIZE
];
477 static __thread
int NextCacheItem
= 0;
479 for (long i
= 0; i
< KERNEL_CACHE_SIZE
; i
++) {
480 // We exploit here the property that all Polly-ACC kernels are allocated
481 // as global constants, hence a pointer comparision is sufficient to
482 // determin equality.
483 if (KernelCache
[i
] &&
484 ((OpenCLKernel
*)KernelCache
[i
]->Kernel
)->BinaryString
==
486 debug_print(" -> using cached kernel\n");
487 return KernelCache
[i
];
491 PollyGPUFunction
*Function
= malloc(sizeof(PollyGPUFunction
));
493 fprintf(stderr
, "Allocate memory for Polly GPU function failed.\n");
496 Function
->Kernel
= (OpenCLKernel
*)malloc(sizeof(OpenCLKernel
));
497 if (Function
->Kernel
== 0) {
498 fprintf(stderr
, "Allocate memory for Polly OpenCL kernel failed.\n");
502 if (!GlobalDeviceID
) {
503 fprintf(stderr
, "GPGPU-code generation not initialized correctly.\n");
509 if (HandleOpenCLBeignet
) {
510 // This is a workaround, since clCreateProgramWithLLVMIntel only
511 // accepts a filename to a valid llvm-ir file as an argument, instead
512 // of accepting the BinaryBuffer directly.
513 char FileName
[] = "/tmp/polly_kernelXXXXXX";
514 int File
= mkstemp(FileName
);
515 write(File
, BinaryBuffer
, strlen(BinaryBuffer
));
517 ((OpenCLKernel
*)Function
->Kernel
)->Program
=
518 clCreateProgramWithLLVMIntelFcnPtr(
519 ((OpenCLContext
*)GlobalContext
->Context
)->Context
, 1,
520 &GlobalDeviceID
, FileName
, &Ret
);
521 checkOpenCLError(Ret
, "Failed to create program from llvm.\n");
525 size_t BinarySize
= strlen(BinaryBuffer
);
526 ((OpenCLKernel
*)Function
->Kernel
)->Program
=
527 clCreateProgramWithBinaryFcnPtr(
528 ((OpenCLContext
*)GlobalContext
->Context
)->Context
, 1,
529 &GlobalDeviceID
, (const size_t *)&BinarySize
,
530 (const unsigned char **)&BinaryBuffer
, NULL
, &Ret
);
531 checkOpenCLError(Ret
, "Failed to create program from binary.\n");
534 Ret
= clBuildProgramFcnPtr(((OpenCLKernel
*)Function
->Kernel
)->Program
, 1,
535 &GlobalDeviceID
, NULL
, NULL
, NULL
);
536 checkOpenCLError(Ret
, "Failed to build program.\n");
538 ((OpenCLKernel
*)Function
->Kernel
)->Kernel
= clCreateKernelFcnPtr(
539 ((OpenCLKernel
*)Function
->Kernel
)->Program
, KernelName
, &Ret
);
540 checkOpenCLError(Ret
, "Failed to create kernel.\n");
542 ((OpenCLKernel
*)Function
->Kernel
)->BinaryString
= BinaryBuffer
;
545 if (KernelCache
[NextCacheItem
])
546 freeKernelCL(KernelCache
[NextCacheItem
]);
548 KernelCache
[NextCacheItem
] = Function
;
550 NextCacheItem
= (NextCacheItem
+ 1) % KERNEL_CACHE_SIZE
;
556 static void copyFromHostToDeviceCL(void *HostData
, PollyGPUDevicePtr
*DevData
,
560 if (!GlobalContext
) {
561 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
566 Ret
= clEnqueueWriteBufferFcnPtr(
567 ((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
,
568 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
, CL_TRUE
, 0, MemSize
,
569 HostData
, 0, NULL
, NULL
);
570 checkOpenCLError(Ret
, "Copying data from host memory to device failed.\n");
573 static void copyFromDeviceToHostCL(PollyGPUDevicePtr
*DevData
, void *HostData
,
577 if (!GlobalContext
) {
578 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
583 Ret
= clEnqueueReadBufferFcnPtr(
584 ((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
,
585 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
, CL_TRUE
, 0, MemSize
,
586 HostData
, 0, NULL
, NULL
);
587 checkOpenCLError(Ret
, "Copying results from device to host memory failed.\n");
590 static void launchKernelCL(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
591 unsigned int GridDimY
, unsigned int BlockDimX
,
592 unsigned int BlockDimY
, unsigned int BlockDimZ
,
599 if (!GlobalContext
) {
600 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
604 OpenCLKernel
*CLKernel
= (OpenCLKernel
*)Kernel
->Kernel
;
605 Ret
= clGetKernelInfoFcnPtr(CLKernel
->Kernel
, CL_KERNEL_NUM_ARGS
,
606 sizeof(cl_uint
), &NumArgs
, NULL
);
607 checkOpenCLError(Ret
, "Failed to get number of kernel arguments.\n");
609 /* Argument sizes are stored at the end of the Parameters array. */
610 for (cl_uint i
= 0; i
< NumArgs
; i
++) {
611 Ret
= clSetKernelArgFcnPtr(CLKernel
->Kernel
, i
,
612 *((int *)Parameters
[NumArgs
+ i
]),
613 (void *)Parameters
[i
]);
614 checkOpenCLError(Ret
, "Failed to set Kernel argument %d.\n", i
);
617 unsigned int GridDimZ
= 1;
618 size_t GlobalWorkSize
[3] = {BlockDimX
* GridDimX
, BlockDimY
* GridDimY
,
619 BlockDimZ
* GridDimZ
};
620 size_t LocalWorkSize
[3] = {BlockDimX
, BlockDimY
, BlockDimZ
};
622 static const int WorkDim
= 3;
623 OpenCLContext
*CLContext
= (OpenCLContext
*)GlobalContext
->Context
;
624 Ret
= clEnqueueNDRangeKernelFcnPtr(CLContext
->CommandQueue
, CLKernel
->Kernel
,
625 WorkDim
, NULL
, GlobalWorkSize
,
626 LocalWorkSize
, 0, NULL
, NULL
);
627 checkOpenCLError(Ret
, "Launching OpenCL kernel failed.\n");
630 static void freeDeviceMemoryCL(PollyGPUDevicePtr
*Allocation
) {
633 OpenCLDevicePtr
*DevPtr
= (OpenCLDevicePtr
*)Allocation
->DevicePtr
;
634 cl_int Ret
= clReleaseMemObjectFcnPtr((cl_mem
)DevPtr
->MemObj
);
635 checkOpenCLError(Ret
, "Failed to free device memory.\n");
641 static PollyGPUDevicePtr
*allocateMemoryForDeviceCL(long MemSize
) {
644 if (!GlobalContext
) {
645 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
649 PollyGPUDevicePtr
*DevData
= malloc(sizeof(PollyGPUDevicePtr
));
651 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
654 DevData
->DevicePtr
= (OpenCLDevicePtr
*)malloc(sizeof(OpenCLDevicePtr
));
655 if (DevData
->DevicePtr
== 0) {
656 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
661 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
=
662 clCreateBufferFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->Context
,
663 CL_MEM_READ_WRITE
, MemSize
, NULL
, &Ret
);
664 checkOpenCLError(Ret
,
665 "Allocate memory for GPU device memory pointer failed.\n");
670 static void *getDevicePtrCL(PollyGPUDevicePtr
*Allocation
) {
673 OpenCLDevicePtr
*DevPtr
= (OpenCLDevicePtr
*)Allocation
->DevicePtr
;
674 return (void *)DevPtr
->MemObj
;
677 static void synchronizeDeviceCL() {
680 if (!GlobalContext
) {
681 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
685 if (clFinishFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
) !=
687 fprintf(stderr
, "Synchronizing device and host memory failed.\n");
692 static void freeContextCL(PollyGPUContext
*Context
) {
697 GlobalContext
= NULL
;
699 OpenCLContext
*Ctx
= (OpenCLContext
*)Context
->Context
;
700 if (Ctx
->CommandQueue
) {
701 Ret
= clReleaseCommandQueueFcnPtr(Ctx
->CommandQueue
);
702 checkOpenCLError(Ret
, "Could not release command queue.\n");
706 Ret
= clReleaseContextFcnPtr(Ctx
->Context
);
707 checkOpenCLError(Ret
, "Could not release context.\n");
714 static void printOpenCLError(int Error
) {
718 // Success, don't print an error.
721 // JIT/Runtime errors.
722 case CL_DEVICE_NOT_FOUND
:
723 fprintf(stderr
, "Device not found.\n");
725 case CL_DEVICE_NOT_AVAILABLE
:
726 fprintf(stderr
, "Device not available.\n");
728 case CL_COMPILER_NOT_AVAILABLE
:
729 fprintf(stderr
, "Compiler not available.\n");
731 case CL_MEM_OBJECT_ALLOCATION_FAILURE
:
732 fprintf(stderr
, "Mem object allocation failure.\n");
734 case CL_OUT_OF_RESOURCES
:
735 fprintf(stderr
, "Out of resources.\n");
737 case CL_OUT_OF_HOST_MEMORY
:
738 fprintf(stderr
, "Out of host memory.\n");
740 case CL_PROFILING_INFO_NOT_AVAILABLE
:
741 fprintf(stderr
, "Profiling info not available.\n");
743 case CL_MEM_COPY_OVERLAP
:
744 fprintf(stderr
, "Mem copy overlap.\n");
746 case CL_IMAGE_FORMAT_MISMATCH
:
747 fprintf(stderr
, "Image format mismatch.\n");
749 case CL_IMAGE_FORMAT_NOT_SUPPORTED
:
750 fprintf(stderr
, "Image format not supported.\n");
752 case CL_BUILD_PROGRAM_FAILURE
:
753 fprintf(stderr
, "Build program failure.\n");
756 fprintf(stderr
, "Map failure.\n");
758 case CL_MISALIGNED_SUB_BUFFER_OFFSET
:
759 fprintf(stderr
, "Misaligned sub buffer offset.\n");
761 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST
:
762 fprintf(stderr
, "Exec status error for events in wait list.\n");
764 case CL_COMPILE_PROGRAM_FAILURE
:
765 fprintf(stderr
, "Compile program failure.\n");
767 case CL_LINKER_NOT_AVAILABLE
:
768 fprintf(stderr
, "Linker not available.\n");
770 case CL_LINK_PROGRAM_FAILURE
:
771 fprintf(stderr
, "Link program failure.\n");
773 case CL_DEVICE_PARTITION_FAILED
:
774 fprintf(stderr
, "Device partition failed.\n");
776 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE
:
777 fprintf(stderr
, "Kernel arg info not available.\n");
781 case CL_INVALID_VALUE
:
782 fprintf(stderr
, "Invalid value.\n");
784 case CL_INVALID_DEVICE_TYPE
:
785 fprintf(stderr
, "Invalid device type.\n");
787 case CL_INVALID_PLATFORM
:
788 fprintf(stderr
, "Invalid platform.\n");
790 case CL_INVALID_DEVICE
:
791 fprintf(stderr
, "Invalid device.\n");
793 case CL_INVALID_CONTEXT
:
794 fprintf(stderr
, "Invalid context.\n");
796 case CL_INVALID_QUEUE_PROPERTIES
:
797 fprintf(stderr
, "Invalid queue properties.\n");
799 case CL_INVALID_COMMAND_QUEUE
:
800 fprintf(stderr
, "Invalid command queue.\n");
802 case CL_INVALID_HOST_PTR
:
803 fprintf(stderr
, "Invalid host pointer.\n");
805 case CL_INVALID_MEM_OBJECT
:
806 fprintf(stderr
, "Invalid memory object.\n");
808 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR
:
809 fprintf(stderr
, "Invalid image format descriptor.\n");
811 case CL_INVALID_IMAGE_SIZE
:
812 fprintf(stderr
, "Invalid image size.\n");
814 case CL_INVALID_SAMPLER
:
815 fprintf(stderr
, "Invalid sampler.\n");
817 case CL_INVALID_BINARY
:
818 fprintf(stderr
, "Invalid binary.\n");
820 case CL_INVALID_BUILD_OPTIONS
:
821 fprintf(stderr
, "Invalid build options.\n");
823 case CL_INVALID_PROGRAM
:
824 fprintf(stderr
, "Invalid program.\n");
826 case CL_INVALID_PROGRAM_EXECUTABLE
:
827 fprintf(stderr
, "Invalid program executable.\n");
829 case CL_INVALID_KERNEL_NAME
:
830 fprintf(stderr
, "Invalid kernel name.\n");
832 case CL_INVALID_KERNEL_DEFINITION
:
833 fprintf(stderr
, "Invalid kernel definition.\n");
835 case CL_INVALID_KERNEL
:
836 fprintf(stderr
, "Invalid kernel.\n");
838 case CL_INVALID_ARG_INDEX
:
839 fprintf(stderr
, "Invalid arg index.\n");
841 case CL_INVALID_ARG_VALUE
:
842 fprintf(stderr
, "Invalid arg value.\n");
844 case CL_INVALID_ARG_SIZE
:
845 fprintf(stderr
, "Invalid arg size.\n");
847 case CL_INVALID_KERNEL_ARGS
:
848 fprintf(stderr
, "Invalid kernel args.\n");
850 case CL_INVALID_WORK_DIMENSION
:
851 fprintf(stderr
, "Invalid work dimension.\n");
853 case CL_INVALID_WORK_GROUP_SIZE
:
854 fprintf(stderr
, "Invalid work group size.\n");
856 case CL_INVALID_WORK_ITEM_SIZE
:
857 fprintf(stderr
, "Invalid work item size.\n");
859 case CL_INVALID_GLOBAL_OFFSET
:
860 fprintf(stderr
, "Invalid global offset.\n");
862 case CL_INVALID_EVENT_WAIT_LIST
:
863 fprintf(stderr
, "Invalid event wait list.\n");
865 case CL_INVALID_EVENT
:
866 fprintf(stderr
, "Invalid event.\n");
868 case CL_INVALID_OPERATION
:
869 fprintf(stderr
, "Invalid operation.\n");
871 case CL_INVALID_GL_OBJECT
:
872 fprintf(stderr
, "Invalid GL object.\n");
874 case CL_INVALID_BUFFER_SIZE
:
875 fprintf(stderr
, "Invalid buffer size.\n");
877 case CL_INVALID_MIP_LEVEL
:
878 fprintf(stderr
, "Invalid mip level.\n");
880 case CL_INVALID_GLOBAL_WORK_SIZE
:
881 fprintf(stderr
, "Invalid global work size.\n");
883 case CL_INVALID_PROPERTY
:
884 fprintf(stderr
, "Invalid property.\n");
886 case CL_INVALID_IMAGE_DESCRIPTOR
:
887 fprintf(stderr
, "Invalid image descriptor.\n");
889 case CL_INVALID_COMPILER_OPTIONS
:
890 fprintf(stderr
, "Invalid compiler options.\n");
892 case CL_INVALID_LINKER_OPTIONS
:
893 fprintf(stderr
, "Invalid linker options.\n");
895 case CL_INVALID_DEVICE_PARTITION_COUNT
:
896 fprintf(stderr
, "Invalid device partition count.\n");
898 case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
899 fprintf(stderr
, "Invalid pipe size.\n");
901 case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
902 fprintf(stderr
, "Invalid device queue.\n");
905 // NVIDIA specific error.
907 fprintf(stderr
, "NVIDIA invalid read or write buffer.\n");
911 fprintf(stderr
, "Unknown error code!\n");
916 #endif /* HAS_LIBOPENCL */
917 /******************************************************************************/
919 /******************************************************************************/
922 struct CUDAContextT
{
929 const char *BinaryString
;
932 struct CUDADevicePtrT
{
936 /* Dynamic library handles for the CUDA and CUDA runtime library. */
937 static void *HandleCuda
;
938 static void *HandleCudaRT
;
940 /* Type-defines of function pointer to CUDA driver APIs. */
941 typedef CUresult CUDAAPI
CuMemAllocFcnTy(CUdeviceptr
*, size_t);
942 static CuMemAllocFcnTy
*CuMemAllocFcnPtr
;
944 typedef CUresult CUDAAPI
CuMemAllocManagedFcnTy(CUdeviceptr
*, size_t,
946 static CuMemAllocManagedFcnTy
*CuMemAllocManagedFcnPtr
;
948 typedef CUresult CUDAAPI
CuLaunchKernelFcnTy(
949 CUfunction F
, unsigned int GridDimX
, unsigned int GridDimY
,
950 unsigned int gridDimZ
, unsigned int blockDimX
, unsigned int BlockDimY
,
951 unsigned int BlockDimZ
, unsigned int SharedMemBytes
, CUstream HStream
,
952 void **KernelParams
, void **Extra
);
953 static CuLaunchKernelFcnTy
*CuLaunchKernelFcnPtr
;
955 typedef CUresult CUDAAPI
CuMemcpyDtoHFcnTy(void *, CUdeviceptr
, size_t);
956 static CuMemcpyDtoHFcnTy
*CuMemcpyDtoHFcnPtr
;
958 typedef CUresult CUDAAPI
CuMemcpyHtoDFcnTy(CUdeviceptr
, const void *, size_t);
959 static CuMemcpyHtoDFcnTy
*CuMemcpyHtoDFcnPtr
;
961 typedef CUresult CUDAAPI
CuMemFreeFcnTy(CUdeviceptr
);
962 static CuMemFreeFcnTy
*CuMemFreeFcnPtr
;
964 typedef CUresult CUDAAPI
CuModuleUnloadFcnTy(CUmodule
);
965 static CuModuleUnloadFcnTy
*CuModuleUnloadFcnPtr
;
967 typedef CUresult CUDAAPI
CuProfilerStopFcnTy();
968 static CuProfilerStopFcnTy
*CuProfilerStopFcnPtr
;
970 typedef CUresult CUDAAPI
CuCtxDestroyFcnTy(CUcontext
);
971 static CuCtxDestroyFcnTy
*CuCtxDestroyFcnPtr
;
973 typedef CUresult CUDAAPI
CuInitFcnTy(unsigned int);
974 static CuInitFcnTy
*CuInitFcnPtr
;
976 typedef CUresult CUDAAPI
CuDeviceGetCountFcnTy(int *);
977 static CuDeviceGetCountFcnTy
*CuDeviceGetCountFcnPtr
;
979 typedef CUresult CUDAAPI
CuCtxCreateFcnTy(CUcontext
*, unsigned int, CUdevice
);
980 static CuCtxCreateFcnTy
*CuCtxCreateFcnPtr
;
982 typedef CUresult CUDAAPI
CuCtxGetCurrentFcnTy(CUcontext
*);
983 static CuCtxGetCurrentFcnTy
*CuCtxGetCurrentFcnPtr
;
985 typedef CUresult CUDAAPI
CuDeviceGetFcnTy(CUdevice
*, int);
986 static CuDeviceGetFcnTy
*CuDeviceGetFcnPtr
;
988 typedef CUresult CUDAAPI
CuModuleLoadDataExFcnTy(CUmodule
*, const void *,
989 unsigned int, CUjit_option
*,
991 static CuModuleLoadDataExFcnTy
*CuModuleLoadDataExFcnPtr
;
993 typedef CUresult CUDAAPI
CuModuleLoadDataFcnTy(CUmodule
*Module
,
995 static CuModuleLoadDataFcnTy
*CuModuleLoadDataFcnPtr
;
997 typedef CUresult CUDAAPI
CuModuleGetFunctionFcnTy(CUfunction
*, CUmodule
,
999 static CuModuleGetFunctionFcnTy
*CuModuleGetFunctionFcnPtr
;
1001 typedef CUresult CUDAAPI
CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice
);
1002 static CuDeviceComputeCapabilityFcnTy
*CuDeviceComputeCapabilityFcnPtr
;
1004 typedef CUresult CUDAAPI
CuDeviceGetNameFcnTy(char *, int, CUdevice
);
1005 static CuDeviceGetNameFcnTy
*CuDeviceGetNameFcnPtr
;
1007 typedef CUresult CUDAAPI
CuLinkAddDataFcnTy(CUlinkState State
,
1008 CUjitInputType Type
, void *Data
,
1009 size_t Size
, const char *Name
,
1010 unsigned int NumOptions
,
1011 CUjit_option
*Options
,
1012 void **OptionValues
);
1013 static CuLinkAddDataFcnTy
*CuLinkAddDataFcnPtr
;
1015 typedef CUresult CUDAAPI
CuLinkCreateFcnTy(unsigned int NumOptions
,
1016 CUjit_option
*Options
,
1017 void **OptionValues
,
1018 CUlinkState
*StateOut
);
1019 static CuLinkCreateFcnTy
*CuLinkCreateFcnPtr
;
1021 typedef CUresult CUDAAPI
CuLinkCompleteFcnTy(CUlinkState State
, void **CubinOut
,
1023 static CuLinkCompleteFcnTy
*CuLinkCompleteFcnPtr
;
1025 typedef CUresult CUDAAPI
CuLinkDestroyFcnTy(CUlinkState State
);
1026 static CuLinkDestroyFcnTy
*CuLinkDestroyFcnPtr
;
1028 typedef CUresult CUDAAPI
CuCtxSynchronizeFcnTy();
1029 static CuCtxSynchronizeFcnTy
*CuCtxSynchronizeFcnPtr
;
1031 /* Type-defines of function pointer ot CUDA runtime APIs. */
1032 typedef cudaError_t CUDARTAPI
CudaThreadSynchronizeFcnTy(void);
1033 static CudaThreadSynchronizeFcnTy
*CudaThreadSynchronizeFcnPtr
;
1035 static void *getAPIHandleCUDA(void *Handle
, const char *FuncName
) {
1039 FuncPtr
= dlsym(Handle
, FuncName
);
1040 if ((Err
= dlerror()) != 0) {
1041 fprintf(stderr
, "Load CUDA driver API failed: %s. \n", Err
);
1047 static int initialDeviceAPILibrariesCUDA() {
1048 HandleCuda
= dlopen("libcuda.so", RTLD_LAZY
);
1050 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
1054 HandleCudaRT
= dlopen("libcudart.so", RTLD_LAZY
);
1055 if (!HandleCudaRT
) {
1056 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
1063 /* Get function pointer to CUDA Driver APIs.
1065 * Note that compilers conforming to the ISO C standard are required to
1066 * generate a warning if a conversion from a void * pointer to a function
1067 * pointer is attempted as in the following statements. The warning
1068 * of this kind of cast may not be emitted by clang and new versions of gcc
1069 * as it is valid on POSIX 2008. For compilers required to generate a warning,
1070 * we temporarily disable -Wpedantic, to avoid bloating the output with
1071 * unnecessary warnings.
1074 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1076 #pragma GCC diagnostic push
1077 #pragma GCC diagnostic ignored "-Wpedantic"
1078 static int initialDeviceAPIsCUDA() {
1079 if (initialDeviceAPILibrariesCUDA() == 0)
1082 CuLaunchKernelFcnPtr
=
1083 (CuLaunchKernelFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLaunchKernel");
1086 (CuMemAllocFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemAlloc_v2");
1088 CuMemAllocManagedFcnPtr
= (CuMemAllocManagedFcnTy
*)getAPIHandleCUDA(
1089 HandleCuda
, "cuMemAllocManaged");
1092 (CuMemFreeFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemFree_v2");
1094 CuMemcpyDtoHFcnPtr
=
1095 (CuMemcpyDtoHFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemcpyDtoH_v2");
1097 CuMemcpyHtoDFcnPtr
=
1098 (CuMemcpyHtoDFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemcpyHtoD_v2");
1100 CuModuleUnloadFcnPtr
=
1101 (CuModuleUnloadFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuModuleUnload");
1103 CuProfilerStopFcnPtr
=
1104 (CuProfilerStopFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuProfilerStop");
1106 CuCtxDestroyFcnPtr
=
1107 (CuCtxDestroyFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxDestroy");
1109 CuInitFcnPtr
= (CuInitFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuInit");
1111 CuDeviceGetCountFcnPtr
=
1112 (CuDeviceGetCountFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGetCount");
1115 (CuDeviceGetFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGet");
1118 (CuCtxCreateFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxCreate_v2");
1120 CuCtxGetCurrentFcnPtr
=
1121 (CuCtxGetCurrentFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxGetCurrent");
1123 CuModuleLoadDataExFcnPtr
= (CuModuleLoadDataExFcnTy
*)getAPIHandleCUDA(
1124 HandleCuda
, "cuModuleLoadDataEx");
1126 CuModuleLoadDataFcnPtr
=
1127 (CuModuleLoadDataFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuModuleLoadData");
1129 CuModuleGetFunctionFcnPtr
= (CuModuleGetFunctionFcnTy
*)getAPIHandleCUDA(
1130 HandleCuda
, "cuModuleGetFunction");
1132 CuDeviceComputeCapabilityFcnPtr
=
1133 (CuDeviceComputeCapabilityFcnTy
*)getAPIHandleCUDA(
1134 HandleCuda
, "cuDeviceComputeCapability");
1136 CuDeviceGetNameFcnPtr
=
1137 (CuDeviceGetNameFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGetName");
1139 CuLinkAddDataFcnPtr
=
1140 (CuLinkAddDataFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkAddData");
1142 CuLinkCreateFcnPtr
=
1143 (CuLinkCreateFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkCreate");
1145 CuLinkCompleteFcnPtr
=
1146 (CuLinkCompleteFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkComplete");
1148 CuLinkDestroyFcnPtr
=
1149 (CuLinkDestroyFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkDestroy");
1151 CuCtxSynchronizeFcnPtr
=
1152 (CuCtxSynchronizeFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxSynchronize");
1154 /* Get function pointer to CUDA Runtime APIs. */
1155 CudaThreadSynchronizeFcnPtr
= (CudaThreadSynchronizeFcnTy
*)getAPIHandleCUDA(
1156 HandleCudaRT
, "cudaThreadSynchronize");
1160 #pragma GCC diagnostic pop
1162 static PollyGPUContext
*initContextCUDA() {
1164 PollyGPUContext
*Context
;
1167 int Major
= 0, Minor
= 0, DeviceID
= 0;
1168 char DeviceName
[256];
1169 int DeviceCount
= 0;
1171 static __thread PollyGPUContext
*CurrentContext
= NULL
;
1174 return CurrentContext
;
1176 /* Get API handles. */
1177 if (initialDeviceAPIsCUDA() == 0) {
1178 fprintf(stderr
, "Getting the \"handle\" for the CUDA driver API failed.\n");
1182 if (CuInitFcnPtr(0) != CUDA_SUCCESS
) {
1183 fprintf(stderr
, "Initializing the CUDA driver API failed.\n");
1187 /* Get number of devices that supports CUDA. */
1188 CuDeviceGetCountFcnPtr(&DeviceCount
);
1189 if (DeviceCount
== 0) {
1190 fprintf(stderr
, "There is no device supporting CUDA.\n");
1194 CuDeviceGetFcnPtr(&Device
, 0);
1196 /* Get compute capabilities and the device name. */
1197 CuDeviceComputeCapabilityFcnPtr(&Major
, &Minor
, Device
);
1198 CuDeviceGetNameFcnPtr(DeviceName
, 256, Device
);
1199 debug_print("> Running on GPU device %d : %s.\n", DeviceID
, DeviceName
);
1201 /* Create context on the device. */
1202 Context
= (PollyGPUContext
*)malloc(sizeof(PollyGPUContext
));
1204 fprintf(stderr
, "Allocate memory for Polly GPU context failed.\n");
1207 Context
->Context
= malloc(sizeof(CUDAContext
));
1208 if (Context
->Context
== 0) {
1209 fprintf(stderr
, "Allocate memory for Polly CUDA context failed.\n");
1213 // In cases where managed memory is used, it is quite likely that
1214 // `cudaMallocManaged` / `polly_mallocManaged` was called before
1215 // `polly_initContext` was called.
1217 // If `polly_initContext` calls `CuCtxCreate` when there already was a
1218 // pre-existing context created by the runtime API, this causes code running
1219 // on P100 to hang. So, we query for a pre-existing context to try and use.
1220 // If there is no pre-existing context, we create a new context
1222 // The possible pre-existing context from previous runtime API calls.
1223 CUcontext MaybeRuntimeAPIContext
;
1224 if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext
) != CUDA_SUCCESS
) {
1225 fprintf(stderr
, "cuCtxGetCurrent failed.\n");
1229 // There was no previous context, initialise it.
1230 if (MaybeRuntimeAPIContext
== NULL
) {
1231 if (CuCtxCreateFcnPtr(&(((CUDAContext
*)Context
->Context
)->Cuda
), 0,
1232 Device
) != CUDA_SUCCESS
) {
1233 fprintf(stderr
, "cuCtxCreateFcnPtr failed.\n");
1237 ((CUDAContext
*)Context
->Context
)->Cuda
= MaybeRuntimeAPIContext
;
1241 CurrentContext
= Context
;
1246 static void freeKernelCUDA(PollyGPUFunction
*Kernel
) {
1252 if (((CUDAKernel
*)Kernel
->Kernel
)->CudaModule
)
1253 CuModuleUnloadFcnPtr(((CUDAKernel
*)Kernel
->Kernel
)->CudaModule
);
1256 free((CUDAKernel
*)Kernel
->Kernel
);
1262 static PollyGPUFunction
*getKernelCUDA(const char *BinaryBuffer
,
1263 const char *KernelName
) {
1266 static __thread PollyGPUFunction
*KernelCache
[KERNEL_CACHE_SIZE
];
1267 static __thread
int NextCacheItem
= 0;
1269 for (long i
= 0; i
< KERNEL_CACHE_SIZE
; i
++) {
1270 // We exploit here the property that all Polly-ACC kernels are allocated
1271 // as global constants, hence a pointer comparision is sufficient to
1272 // determin equality.
1273 if (KernelCache
[i
] &&
1274 ((CUDAKernel
*)KernelCache
[i
]->Kernel
)->BinaryString
== BinaryBuffer
) {
1275 debug_print(" -> using cached kernel\n");
1276 return KernelCache
[i
];
1280 PollyGPUFunction
*Function
= malloc(sizeof(PollyGPUFunction
));
1281 if (Function
== 0) {
1282 fprintf(stderr
, "Allocate memory for Polly GPU function failed.\n");
1285 Function
->Kernel
= (CUDAKernel
*)malloc(sizeof(CUDAKernel
));
1286 if (Function
->Kernel
== 0) {
1287 fprintf(stderr
, "Allocate memory for Polly CUDA function failed.\n");
1293 CUjit_option Options
[6];
1294 void *OptionVals
[6];
1296 unsigned long LogSize
= 8192;
1297 char ErrorLog
[8192], InfoLog
[8192];
1301 // Setup linker options
1302 // Return walltime from JIT compilation
1303 Options
[0] = CU_JIT_WALL_TIME
;
1304 OptionVals
[0] = (void *)&Walltime
;
1305 // Pass a buffer for info messages
1306 Options
[1] = CU_JIT_INFO_LOG_BUFFER
;
1307 OptionVals
[1] = (void *)InfoLog
;
1308 // Pass the size of the info buffer
1309 Options
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
1310 OptionVals
[2] = (void *)LogSize
;
1311 // Pass a buffer for error message
1312 Options
[3] = CU_JIT_ERROR_LOG_BUFFER
;
1313 OptionVals
[3] = (void *)ErrorLog
;
1314 // Pass the size of the error buffer
1315 Options
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
1316 OptionVals
[4] = (void *)LogSize
;
1317 // Make the linker verbose
1318 Options
[5] = CU_JIT_LOG_VERBOSE
;
1319 OptionVals
[5] = (void *)1;
1321 memset(ErrorLog
, 0, sizeof(ErrorLog
));
1323 CuLinkCreateFcnPtr(6, Options
, OptionVals
, &LState
);
1324 Res
= CuLinkAddDataFcnPtr(LState
, CU_JIT_INPUT_PTX
, (void *)BinaryBuffer
,
1325 strlen(BinaryBuffer
) + 1, 0, 0, 0, 0);
1326 if (Res
!= CUDA_SUCCESS
) {
1327 fprintf(stderr
, "PTX Linker Error:\n%s\n%s", ErrorLog
, InfoLog
);
1331 Res
= CuLinkCompleteFcnPtr(LState
, &CuOut
, &OutSize
);
1332 if (Res
!= CUDA_SUCCESS
) {
1333 fprintf(stderr
, "Complete ptx linker step failed.\n");
1334 fprintf(stderr
, "\n%s\n", ErrorLog
);
1338 debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime
,
1341 Res
= CuModuleLoadDataFcnPtr(&(((CUDAKernel
*)Function
->Kernel
)->CudaModule
),
1343 if (Res
!= CUDA_SUCCESS
) {
1344 fprintf(stderr
, "Loading ptx assembly text failed.\n");
1348 Res
= CuModuleGetFunctionFcnPtr(&(((CUDAKernel
*)Function
->Kernel
)->Cuda
),
1349 ((CUDAKernel
*)Function
->Kernel
)->CudaModule
,
1351 if (Res
!= CUDA_SUCCESS
) {
1352 fprintf(stderr
, "Loading kernel function failed.\n");
1356 CuLinkDestroyFcnPtr(LState
);
1358 ((CUDAKernel
*)Function
->Kernel
)->BinaryString
= BinaryBuffer
;
1361 if (KernelCache
[NextCacheItem
])
1362 freeKernelCUDA(KernelCache
[NextCacheItem
]);
1364 KernelCache
[NextCacheItem
] = Function
;
1366 NextCacheItem
= (NextCacheItem
+ 1) % KERNEL_CACHE_SIZE
;
1372 static void synchronizeDeviceCUDA() {
1374 if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS
) {
1375 fprintf(stderr
, "Synchronizing device and host memory failed.\n");
1380 static void copyFromHostToDeviceCUDA(void *HostData
, PollyGPUDevicePtr
*DevData
,
1384 CUdeviceptr CuDevData
= ((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
;
1385 CuMemcpyHtoDFcnPtr(CuDevData
, HostData
, MemSize
);
1388 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr
*DevData
, void *HostData
,
1392 if (CuMemcpyDtoHFcnPtr(HostData
, ((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
,
1393 MemSize
) != CUDA_SUCCESS
) {
1394 fprintf(stderr
, "Copying results from device to host memory failed.\n");
1399 static void launchKernelCUDA(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
1400 unsigned int GridDimY
, unsigned int BlockDimX
,
1401 unsigned int BlockDimY
, unsigned int BlockDimZ
,
1402 void **Parameters
) {
1405 unsigned GridDimZ
= 1;
1406 unsigned int SharedMemBytes
= CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE
;
1407 CUstream Stream
= 0;
1412 CuLaunchKernelFcnPtr(((CUDAKernel
*)Kernel
->Kernel
)->Cuda
, GridDimX
,
1413 GridDimY
, GridDimZ
, BlockDimX
, BlockDimY
, BlockDimZ
,
1414 SharedMemBytes
, Stream
, Parameters
, Extra
);
1415 if (Res
!= CUDA_SUCCESS
) {
1416 fprintf(stderr
, "Launching CUDA kernel failed.\n");
1421 // Maximum number of managed memory pointers.
1422 #define DEFAULT_MAX_POINTERS 4000
1423 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1424 void **g_managedptrs
;
1425 unsigned long long g_nmanagedptrs
= 0;
1426 unsigned long long g_maxmanagedptrs
= 0;
1428 __attribute__((constructor
)) static void initManagedPtrsBuffer() {
1429 g_maxmanagedptrs
= DEFAULT_MAX_POINTERS
;
1430 const char *maxManagedPointersString
= getenv("POLLY_MAX_MANAGED_POINTERS");
1431 if (maxManagedPointersString
)
1432 g_maxmanagedptrs
= atoll(maxManagedPointersString
);
1434 g_managedptrs
= (void **)malloc(sizeof(void *) * g_maxmanagedptrs
);
1437 // Add a pointer as being allocated by cuMallocManaged
1438 void addManagedPtr(void *mem
) {
1439 assert(g_maxmanagedptrs
> 0 && "g_maxmanagedptrs was set to 0!");
1440 assert(g_nmanagedptrs
< g_maxmanagedptrs
&&
1441 "We have hit the maximum number of "
1442 "managed pointers allowed. Set the "
1443 "POLLY_MAX_MANAGED_POINTERS environment variable. ");
1444 g_managedptrs
[g_nmanagedptrs
++] = mem
;
1447 int isManagedPtr(void *mem
) {
1448 for (unsigned long long i
= 0; i
< g_nmanagedptrs
; i
++) {
1449 if (g_managedptrs
[i
] == mem
)
1455 void freeManagedCUDA(void *mem
) {
1458 // In a real-world program this was used (COSMO), there were more `free`
1459 // calls in the original source than `malloc` calls. Hence, replacing all
1460 // `free`s with `cudaFree` does not work, since we would try to free
1461 // 'illegal' memory.
1462 // As a quick fix, we keep a free list and check if `mem` is a managed memory
1463 // pointer. If it is, we call `cudaFree`.
1464 // If not, we pass it along to the underlying allocator.
1465 // This is a hack, and can be removed if the underlying issue is fixed.
1466 if (isManagedPtr(mem
)) {
1467 if (CuMemFreeFcnPtr((size_t)mem
) != CUDA_SUCCESS
) {
1468 fprintf(stderr
, "cudaFree failed.\n");
1477 void *mallocManagedCUDA(size_t size
) {
1478 // Note: [Size 0 allocations]
1479 // Sometimes, some runtime computation of size could create a size of 0
1480 // for an allocation. In these cases, we do not wish to fail.
1481 // The CUDA API fails on size 0 allocations.
1482 // So, we allocate size a minimum of size 1.
1483 if (!size
&& DebugMode
)
1484 fprintf(stderr
, "cudaMallocManaged called with size 0. "
1485 "Promoting to size 1");
1486 size
= max(size
, 1);
1487 PollyGPUContext
*_
= polly_initContextCUDA();
1488 assert(_
&& "polly_initContextCUDA failed");
1491 const CUresult Res
= CuMemAllocManagedFcnPtr((CUdeviceptr
*)&newMemPtr
, size
,
1492 CU_MEM_ATTACH_GLOBAL
);
1493 if (Res
!= CUDA_SUCCESS
) {
1494 fprintf(stderr
, "cudaMallocManaged failed for size: %zu\n", size
);
1497 addManagedPtr(newMemPtr
);
1501 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr
*Allocation
) {
1503 CUDADevicePtr
*DevPtr
= (CUDADevicePtr
*)Allocation
->DevicePtr
;
1504 CuMemFreeFcnPtr((CUdeviceptr
)DevPtr
->Cuda
);
1509 static PollyGPUDevicePtr
*allocateMemoryForDeviceCUDA(long MemSize
) {
1510 if (!MemSize
&& DebugMode
)
1511 fprintf(stderr
, "allocateMemoryForDeviceCUDA called with size 0. "
1512 "Promoting to size 1");
1513 // see: [Size 0 allocations]
1514 MemSize
= max(MemSize
, 1);
1517 PollyGPUDevicePtr
*DevData
= malloc(sizeof(PollyGPUDevicePtr
));
1520 "Allocate memory for GPU device memory pointer failed."
1521 " Line: %d | Size: %ld\n",
1525 DevData
->DevicePtr
= (CUDADevicePtr
*)malloc(sizeof(CUDADevicePtr
));
1526 if (DevData
->DevicePtr
== 0) {
1528 "Allocate memory for GPU device memory pointer failed."
1529 " Line: %d | Size: %ld\n",
1535 CuMemAllocFcnPtr(&(((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
), MemSize
);
1537 if (Res
!= CUDA_SUCCESS
) {
1539 "Allocate memory for GPU device memory pointer failed."
1540 " Line: %d | Size: %ld\n",
1548 static void *getDevicePtrCUDA(PollyGPUDevicePtr
*Allocation
) {
1551 CUDADevicePtr
*DevPtr
= (CUDADevicePtr
*)Allocation
->DevicePtr
;
1552 return (void *)DevPtr
->Cuda
;
1555 static void freeContextCUDA(PollyGPUContext
*Context
) {
1558 CUDAContext
*Ctx
= (CUDAContext
*)Context
->Context
;
1560 CuProfilerStopFcnPtr();
1561 CuCtxDestroyFcnPtr(Ctx
->Cuda
);
1566 dlclose(HandleCuda
);
1567 dlclose(HandleCudaRT
);
1570 #endif /* HAS_LIBCUDART */
1571 /******************************************************************************/
1573 /******************************************************************************/
1575 PollyGPUContext
*polly_initContext() {
1576 DebugMode
= getenv("POLLY_DEBUG") != 0;
1577 CacheMode
= getenv("POLLY_NOCACHE") == 0;
1581 PollyGPUContext
*Context
;
1584 #ifdef HAS_LIBCUDART
1586 Context
= initContextCUDA();
1588 #endif /* HAS_LIBCUDART */
1589 #ifdef HAS_LIBOPENCL
1591 Context
= initContextCL();
1593 #endif /* HAS_LIBOPENCL */
1601 void polly_freeKernel(PollyGPUFunction
*Kernel
) {
1605 #ifdef HAS_LIBCUDART
1607 freeKernelCUDA(Kernel
);
1609 #endif /* HAS_LIBCUDART */
1610 #ifdef HAS_LIBOPENCL
1612 freeKernelCL(Kernel
);
1614 #endif /* HAS_LIBOPENCL */
1620 PollyGPUFunction
*polly_getKernel(const char *BinaryBuffer
,
1621 const char *KernelName
) {
1624 PollyGPUFunction
*Function
;
1627 #ifdef HAS_LIBCUDART
1629 Function
= getKernelCUDA(BinaryBuffer
, KernelName
);
1631 #endif /* HAS_LIBCUDART */
1632 #ifdef HAS_LIBOPENCL
1634 Function
= getKernelCL(BinaryBuffer
, KernelName
);
1636 #endif /* HAS_LIBOPENCL */
1644 void polly_copyFromHostToDevice(void *HostData
, PollyGPUDevicePtr
*DevData
,
1649 #ifdef HAS_LIBCUDART
1651 copyFromHostToDeviceCUDA(HostData
, DevData
, MemSize
);
1653 #endif /* HAS_LIBCUDART */
1654 #ifdef HAS_LIBOPENCL
1656 copyFromHostToDeviceCL(HostData
, DevData
, MemSize
);
1658 #endif /* HAS_LIBOPENCL */
1664 void polly_copyFromDeviceToHost(PollyGPUDevicePtr
*DevData
, void *HostData
,
1669 #ifdef HAS_LIBCUDART
1671 copyFromDeviceToHostCUDA(DevData
, HostData
, MemSize
);
1673 #endif /* HAS_LIBCUDART */
1674 #ifdef HAS_LIBOPENCL
1676 copyFromDeviceToHostCL(DevData
, HostData
, MemSize
);
1678 #endif /* HAS_LIBOPENCL */
1684 void polly_launchKernel(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
1685 unsigned int GridDimY
, unsigned int BlockDimX
,
1686 unsigned int BlockDimY
, unsigned int BlockDimZ
,
1687 void **Parameters
) {
1691 #ifdef HAS_LIBCUDART
1693 launchKernelCUDA(Kernel
, GridDimX
, GridDimY
, BlockDimX
, BlockDimY
,
1694 BlockDimZ
, Parameters
);
1696 #endif /* HAS_LIBCUDART */
1697 #ifdef HAS_LIBOPENCL
1699 launchKernelCL(Kernel
, GridDimX
, GridDimY
, BlockDimX
, BlockDimY
, BlockDimZ
,
1702 #endif /* HAS_LIBOPENCL */
1708 void polly_freeDeviceMemory(PollyGPUDevicePtr
*Allocation
) {
1712 #ifdef HAS_LIBCUDART
1714 freeDeviceMemoryCUDA(Allocation
);
1716 #endif /* HAS_LIBCUDART */
1717 #ifdef HAS_LIBOPENCL
1719 freeDeviceMemoryCL(Allocation
);
1721 #endif /* HAS_LIBOPENCL */
1727 PollyGPUDevicePtr
*polly_allocateMemoryForDevice(long MemSize
) {
1730 PollyGPUDevicePtr
*DevData
;
1733 #ifdef HAS_LIBCUDART
1735 DevData
= allocateMemoryForDeviceCUDA(MemSize
);
1737 #endif /* HAS_LIBCUDART */
1738 #ifdef HAS_LIBOPENCL
1740 DevData
= allocateMemoryForDeviceCL(MemSize
);
1742 #endif /* HAS_LIBOPENCL */
1750 void *polly_getDevicePtr(PollyGPUDevicePtr
*Allocation
) {
1756 #ifdef HAS_LIBCUDART
1758 DevPtr
= getDevicePtrCUDA(Allocation
);
1760 #endif /* HAS_LIBCUDART */
1761 #ifdef HAS_LIBOPENCL
1763 DevPtr
= getDevicePtrCL(Allocation
);
1765 #endif /* HAS_LIBOPENCL */
1773 void polly_synchronizeDevice() {
1777 #ifdef HAS_LIBCUDART
1779 synchronizeDeviceCUDA();
1781 #endif /* HAS_LIBCUDART */
1782 #ifdef HAS_LIBOPENCL
1784 synchronizeDeviceCL();
1786 #endif /* HAS_LIBOPENCL */
1792 void polly_freeContext(PollyGPUContext
*Context
) {
1799 #ifdef HAS_LIBCUDART
1801 freeContextCUDA(Context
);
1803 #endif /* HAS_LIBCUDART */
1804 #ifdef HAS_LIBOPENCL
1806 freeContextCL(Context
);
1808 #endif /* HAS_LIBOPENCL */
1814 void polly_freeManaged(void *mem
) {
1817 #ifdef HAS_LIBCUDART
1818 freeManagedCUDA(mem
);
1820 fprintf(stderr
, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1825 void *polly_mallocManaged(size_t size
) {
1828 #ifdef HAS_LIBCUDART
1829 return mallocManagedCUDA(size
);
1831 fprintf(stderr
, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1836 /* Initialize GPUJIT with CUDA as runtime library. */
1837 PollyGPUContext
*polly_initContextCUDA() {
1838 #ifdef HAS_LIBCUDART
1839 Runtime
= RUNTIME_CUDA
;
1840 return polly_initContext();
1842 fprintf(stderr
, "GPU Runtime was built without CUDA support.\n");
1844 #endif /* HAS_LIBCUDART */
1847 /* Initialize GPUJIT with OpenCL as runtime library. */
1848 PollyGPUContext
*polly_initContextCL() {
1849 #ifdef HAS_LIBOPENCL
1850 Runtime
= RUNTIME_CL
;
1851 return polly_initContext();
1853 fprintf(stderr
, "GPU Runtime was built without OpenCL support.\n");
1855 #endif /* HAS_LIBOPENCL */