1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
3 /* The LLVM Compiler Infrastructure */
5 /* This file is dual licensed under the MIT and the University of Illinois */
6 /* Open Source License. See LICENSE.TXT for details. */
8 /******************************************************************************/
10 /* This file implements GPUJIT, a ptx string execution engine for GPU. */
12 /******************************************************************************/
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
23 #include <OpenCL/opencl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
39 static PollyGPURuntime Runtime
= RUNTIME_NONE
;
41 static void debug_print(const char *format
, ...) {
46 va_start(args
, format
);
47 vfprintf(stderr
, format
, args
);
50 #define dump_function() debug_print("-> %s\n", __func__)
52 #define KERNEL_CACHE_SIZE 10
54 static void err_runtime() __attribute__((noreturn
));
55 static void err_runtime() {
56 fprintf(stderr
, "Runtime not correctly initialized.\n");
60 struct PollyGPUContextT
{
64 struct PollyGPUFunctionT
{
68 struct PollyGPUDevicePtrT
{
72 /******************************************************************************/
74 /******************************************************************************/
77 struct OpenCLContextT
{
79 cl_command_queue CommandQueue
;
82 struct OpenCLKernelT
{
85 const char *BinaryString
;
88 struct OpenCLDevicePtrT
{
92 /* Dynamic library handles for the OpenCL runtime library. */
93 static void *HandleOpenCL
;
94 static void *HandleOpenCLBeignet
;
96 /* Type-defines of function pointer to OpenCL Runtime API. */
97 typedef cl_int
clGetPlatformIDsFcnTy(cl_uint NumEntries
,
98 cl_platform_id
*Platforms
,
99 cl_uint
*NumPlatforms
);
100 static clGetPlatformIDsFcnTy
*clGetPlatformIDsFcnPtr
;
102 typedef cl_int
clGetDeviceIDsFcnTy(cl_platform_id Platform
,
103 cl_device_type DeviceType
,
104 cl_uint NumEntries
, cl_device_id
*Devices
,
105 cl_uint
*NumDevices
);
106 static clGetDeviceIDsFcnTy
*clGetDeviceIDsFcnPtr
;
108 typedef cl_int
clGetDeviceInfoFcnTy(cl_device_id Device
,
109 cl_device_info ParamName
,
110 size_t ParamValueSize
, void *ParamValue
,
111 size_t *ParamValueSizeRet
);
112 static clGetDeviceInfoFcnTy
*clGetDeviceInfoFcnPtr
;
114 typedef cl_int
clGetKernelInfoFcnTy(cl_kernel Kernel
, cl_kernel_info ParamName
,
115 size_t ParamValueSize
, void *ParamValue
,
116 size_t *ParamValueSizeRet
);
117 static clGetKernelInfoFcnTy
*clGetKernelInfoFcnPtr
;
119 typedef cl_context
clCreateContextFcnTy(
120 const cl_context_properties
*Properties
, cl_uint NumDevices
,
121 const cl_device_id
*Devices
,
122 void CL_CALLBACK
*pfn_notify(const char *Errinfo
, const void *PrivateInfo
,
123 size_t CB
, void *UserData
),
124 void *UserData
, cl_int
*ErrcodeRet
);
125 static clCreateContextFcnTy
*clCreateContextFcnPtr
;
127 typedef cl_command_queue
128 clCreateCommandQueueFcnTy(cl_context Context
, cl_device_id Device
,
129 cl_command_queue_properties Properties
,
131 static clCreateCommandQueueFcnTy
*clCreateCommandQueueFcnPtr
;
133 typedef cl_mem
clCreateBufferFcnTy(cl_context Context
, cl_mem_flags Flags
,
134 size_t Size
, void *HostPtr
,
136 static clCreateBufferFcnTy
*clCreateBufferFcnPtr
;
139 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue
, cl_mem Buffer
,
140 cl_bool BlockingWrite
, size_t Offset
, size_t Size
,
141 const void *Ptr
, cl_uint NumEventsInWaitList
,
142 const cl_event
*EventWaitList
, cl_event
*Event
);
143 static clEnqueueWriteBufferFcnTy
*clEnqueueWriteBufferFcnPtr
;
146 clCreateProgramWithLLVMIntelFcnTy(cl_context Context
, cl_uint NumDevices
,
147 const cl_device_id
*DeviceList
,
148 const char *Filename
, cl_int
*ErrcodeRet
);
149 static clCreateProgramWithLLVMIntelFcnTy
*clCreateProgramWithLLVMIntelFcnPtr
;
151 typedef cl_program
clCreateProgramWithBinaryFcnTy(
152 cl_context Context
, cl_uint NumDevices
, const cl_device_id
*DeviceList
,
153 const size_t *Lengths
, const unsigned char **Binaries
, cl_int
*BinaryStatus
,
155 static clCreateProgramWithBinaryFcnTy
*clCreateProgramWithBinaryFcnPtr
;
157 typedef cl_int
clBuildProgramFcnTy(
158 cl_program Program
, cl_uint NumDevices
, const cl_device_id
*DeviceList
,
160 void(CL_CALLBACK
*pfn_notify
)(cl_program Program
, void *UserData
),
162 static clBuildProgramFcnTy
*clBuildProgramFcnPtr
;
164 typedef cl_kernel
clCreateKernelFcnTy(cl_program Program
,
165 const char *KernelName
,
167 static clCreateKernelFcnTy
*clCreateKernelFcnPtr
;
169 typedef cl_int
clSetKernelArgFcnTy(cl_kernel Kernel
, cl_uint ArgIndex
,
170 size_t ArgSize
, const void *ArgValue
);
171 static clSetKernelArgFcnTy
*clSetKernelArgFcnPtr
;
173 typedef cl_int
clEnqueueNDRangeKernelFcnTy(
174 cl_command_queue CommandQueue
, cl_kernel Kernel
, cl_uint WorkDim
,
175 const size_t *GlobalWorkOffset
, const size_t *GlobalWorkSize
,
176 const size_t *LocalWorkSize
, cl_uint NumEventsInWaitList
,
177 const cl_event
*EventWaitList
, cl_event
*Event
);
178 static clEnqueueNDRangeKernelFcnTy
*clEnqueueNDRangeKernelFcnPtr
;
180 typedef cl_int
clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue
,
181 cl_mem Buffer
, cl_bool BlockingRead
,
182 size_t Offset
, size_t Size
, void *Ptr
,
183 cl_uint NumEventsInWaitList
,
184 const cl_event
*EventWaitList
,
186 static clEnqueueReadBufferFcnTy
*clEnqueueReadBufferFcnPtr
;
188 typedef cl_int
clFlushFcnTy(cl_command_queue CommandQueue
);
189 static clFlushFcnTy
*clFlushFcnPtr
;
191 typedef cl_int
clFinishFcnTy(cl_command_queue CommandQueue
);
192 static clFinishFcnTy
*clFinishFcnPtr
;
194 typedef cl_int
clReleaseKernelFcnTy(cl_kernel Kernel
);
195 static clReleaseKernelFcnTy
*clReleaseKernelFcnPtr
;
197 typedef cl_int
clReleaseProgramFcnTy(cl_program Program
);
198 static clReleaseProgramFcnTy
*clReleaseProgramFcnPtr
;
200 typedef cl_int
clReleaseMemObjectFcnTy(cl_mem Memobject
);
201 static clReleaseMemObjectFcnTy
*clReleaseMemObjectFcnPtr
;
203 typedef cl_int
clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue
);
204 static clReleaseCommandQueueFcnTy
*clReleaseCommandQueueFcnPtr
;
206 typedef cl_int
clReleaseContextFcnTy(cl_context Context
);
207 static clReleaseContextFcnTy
*clReleaseContextFcnPtr
;
209 static void *getAPIHandleCL(void *Handle
, const char *FuncName
) {
213 FuncPtr
= dlsym(Handle
, FuncName
);
214 if ((Err
= dlerror()) != 0) {
215 fprintf(stderr
, "Load OpenCL Runtime API failed: %s. \n", Err
);
221 static int initialDeviceAPILibrariesCL() {
222 HandleOpenCLBeignet
= dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY
);
223 HandleOpenCL
= dlopen("libOpenCL.so", RTLD_LAZY
);
225 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
231 /* Get function pointer to OpenCL Runtime API.
233 * Note that compilers conforming to the ISO C standard are required to
234 * generate a warning if a conversion from a void * pointer to a function
235 * pointer is attempted as in the following statements. The warning
236 * of this kind of cast may not be emitted by clang and new versions of gcc
237 * as it is valid on POSIX 2008. For compilers required to generate a warning,
238 * we temporarily disable -Wpedantic, to avoid bloating the output with
239 * unnecessary warnings.
242 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
244 #pragma GCC diagnostic push
245 #pragma GCC diagnostic ignored "-Wpedantic"
246 static int initialDeviceAPIsCL() {
247 if (initialDeviceAPILibrariesCL() == 0)
250 // FIXME: We are now always selecting the Intel Beignet driver if it is
251 // available on the system, instead of a possible NVIDIA or AMD OpenCL
252 // API. This selection should occurr based on the target architecture
253 // chosen when compiling.
255 (HandleOpenCLBeignet
!= NULL
? HandleOpenCLBeignet
: HandleOpenCL
);
257 clGetPlatformIDsFcnPtr
=
258 (clGetPlatformIDsFcnTy
*)getAPIHandleCL(Handle
, "clGetPlatformIDs");
260 clGetDeviceIDsFcnPtr
=
261 (clGetDeviceIDsFcnTy
*)getAPIHandleCL(Handle
, "clGetDeviceIDs");
263 clGetDeviceInfoFcnPtr
=
264 (clGetDeviceInfoFcnTy
*)getAPIHandleCL(Handle
, "clGetDeviceInfo");
266 clGetKernelInfoFcnPtr
=
267 (clGetKernelInfoFcnTy
*)getAPIHandleCL(Handle
, "clGetKernelInfo");
269 clCreateContextFcnPtr
=
270 (clCreateContextFcnTy
*)getAPIHandleCL(Handle
, "clCreateContext");
272 clCreateCommandQueueFcnPtr
= (clCreateCommandQueueFcnTy
*)getAPIHandleCL(
273 Handle
, "clCreateCommandQueue");
275 clCreateBufferFcnPtr
=
276 (clCreateBufferFcnTy
*)getAPIHandleCL(Handle
, "clCreateBuffer");
278 clEnqueueWriteBufferFcnPtr
= (clEnqueueWriteBufferFcnTy
*)getAPIHandleCL(
279 Handle
, "clEnqueueWriteBuffer");
281 if (HandleOpenCLBeignet
)
282 clCreateProgramWithLLVMIntelFcnPtr
=
283 (clCreateProgramWithLLVMIntelFcnTy
*)getAPIHandleCL(
284 Handle
, "clCreateProgramWithLLVMIntel");
286 clCreateProgramWithBinaryFcnPtr
=
287 (clCreateProgramWithBinaryFcnTy
*)getAPIHandleCL(
288 Handle
, "clCreateProgramWithBinary");
290 clBuildProgramFcnPtr
=
291 (clBuildProgramFcnTy
*)getAPIHandleCL(Handle
, "clBuildProgram");
293 clCreateKernelFcnPtr
=
294 (clCreateKernelFcnTy
*)getAPIHandleCL(Handle
, "clCreateKernel");
296 clSetKernelArgFcnPtr
=
297 (clSetKernelArgFcnTy
*)getAPIHandleCL(Handle
, "clSetKernelArg");
299 clEnqueueNDRangeKernelFcnPtr
= (clEnqueueNDRangeKernelFcnTy
*)getAPIHandleCL(
300 Handle
, "clEnqueueNDRangeKernel");
302 clEnqueueReadBufferFcnPtr
=
303 (clEnqueueReadBufferFcnTy
*)getAPIHandleCL(Handle
, "clEnqueueReadBuffer");
305 clFlushFcnPtr
= (clFlushFcnTy
*)getAPIHandleCL(Handle
, "clFlush");
307 clFinishFcnPtr
= (clFinishFcnTy
*)getAPIHandleCL(Handle
, "clFinish");
309 clReleaseKernelFcnPtr
=
310 (clReleaseKernelFcnTy
*)getAPIHandleCL(Handle
, "clReleaseKernel");
312 clReleaseProgramFcnPtr
=
313 (clReleaseProgramFcnTy
*)getAPIHandleCL(Handle
, "clReleaseProgram");
315 clReleaseMemObjectFcnPtr
=
316 (clReleaseMemObjectFcnTy
*)getAPIHandleCL(Handle
, "clReleaseMemObject");
318 clReleaseCommandQueueFcnPtr
= (clReleaseCommandQueueFcnTy
*)getAPIHandleCL(
319 Handle
, "clReleaseCommandQueue");
321 clReleaseContextFcnPtr
=
322 (clReleaseContextFcnTy
*)getAPIHandleCL(Handle
, "clReleaseContext");
326 #pragma GCC diagnostic pop
328 /* Context and Device. */
329 static PollyGPUContext
*GlobalContext
= NULL
;
330 static cl_device_id GlobalDeviceID
= NULL
;
332 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
333 static void printOpenCLError(int Error
);
335 static void checkOpenCLError(int Ret
, const char *format
, ...) {
336 if (Ret
== CL_SUCCESS
)
339 printOpenCLError(Ret
);
341 va_start(args
, format
);
342 vfprintf(stderr
, format
, args
);
347 static PollyGPUContext
*initContextCL() {
350 PollyGPUContext
*Context
;
352 cl_platform_id PlatformID
= NULL
;
353 cl_device_id DeviceID
= NULL
;
354 cl_uint NumDevicesRet
;
357 char DeviceRevision
[256];
358 char DeviceName
[256];
359 size_t DeviceRevisionRetSize
, DeviceNameRetSize
;
361 static __thread PollyGPUContext
*CurrentContext
= NULL
;
364 return CurrentContext
;
366 /* Get API handles. */
367 if (initialDeviceAPIsCL() == 0) {
368 fprintf(stderr
, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
372 /* Get number of devices that support OpenCL. */
373 static const int NumberOfPlatforms
= 1;
374 Ret
= clGetPlatformIDsFcnPtr(NumberOfPlatforms
, &PlatformID
, NULL
);
375 checkOpenCLError(Ret
, "Failed to get platform IDs.\n");
376 // TODO: Extend to CL_DEVICE_TYPE_ALL?
377 static const int NumberOfDevices
= 1;
378 Ret
= clGetDeviceIDsFcnPtr(PlatformID
, CL_DEVICE_TYPE_GPU
, NumberOfDevices
,
379 &DeviceID
, &NumDevicesRet
);
380 checkOpenCLError(Ret
, "Failed to get device IDs.\n");
382 GlobalDeviceID
= DeviceID
;
383 if (NumDevicesRet
== 0) {
384 fprintf(stderr
, "There is no device supporting OpenCL.\n");
388 /* Get device revision. */
390 clGetDeviceInfoFcnPtr(DeviceID
, CL_DEVICE_VERSION
, sizeof(DeviceRevision
),
391 DeviceRevision
, &DeviceRevisionRetSize
);
392 checkOpenCLError(Ret
, "Failed to fetch device revision.\n");
394 /* Get device name. */
395 Ret
= clGetDeviceInfoFcnPtr(DeviceID
, CL_DEVICE_NAME
, sizeof(DeviceName
),
396 DeviceName
, &DeviceNameRetSize
);
397 checkOpenCLError(Ret
, "Failed to fetch device name.\n");
399 debug_print("> Running on GPU device %d : %s.\n", DeviceID
, DeviceName
);
401 /* Create context on the device. */
402 Context
= (PollyGPUContext
*)malloc(sizeof(PollyGPUContext
));
404 fprintf(stderr
, "Allocate memory for Polly GPU context failed.\n");
407 Context
->Context
= (OpenCLContext
*)malloc(sizeof(OpenCLContext
));
408 if (Context
->Context
== 0) {
409 fprintf(stderr
, "Allocate memory for Polly OpenCL context failed.\n");
412 ((OpenCLContext
*)Context
->Context
)->Context
=
413 clCreateContextFcnPtr(NULL
, NumDevicesRet
, &DeviceID
, NULL
, NULL
, &Ret
);
414 checkOpenCLError(Ret
, "Failed to create context.\n");
416 static const int ExtraProperties
= 0;
417 ((OpenCLContext
*)Context
->Context
)->CommandQueue
=
418 clCreateCommandQueueFcnPtr(((OpenCLContext
*)Context
->Context
)->Context
,
419 DeviceID
, ExtraProperties
, &Ret
);
420 checkOpenCLError(Ret
, "Failed to create command queue.\n");
423 CurrentContext
= Context
;
425 GlobalContext
= Context
;
429 static void freeKernelCL(PollyGPUFunction
*Kernel
) {
435 if (!GlobalContext
) {
436 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
441 Ret
= clFlushFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
);
442 checkOpenCLError(Ret
, "Failed to flush command queue.\n");
443 Ret
= clFinishFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
);
444 checkOpenCLError(Ret
, "Failed to finish command queue.\n");
446 if (((OpenCLKernel
*)Kernel
->Kernel
)->Kernel
) {
448 clReleaseKernelFcnPtr(((OpenCLKernel
*)Kernel
->Kernel
)->Kernel
);
449 checkOpenCLError(Ret
, "Failed to release kernel.\n");
452 if (((OpenCLKernel
*)Kernel
->Kernel
)->Program
) {
454 clReleaseProgramFcnPtr(((OpenCLKernel
*)Kernel
->Kernel
)->Program
);
455 checkOpenCLError(Ret
, "Failed to release program.\n");
459 free((OpenCLKernel
*)Kernel
->Kernel
);
465 static PollyGPUFunction
*getKernelCL(const char *BinaryBuffer
,
466 const char *KernelName
) {
469 if (!GlobalContext
) {
470 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
474 static __thread PollyGPUFunction
*KernelCache
[KERNEL_CACHE_SIZE
];
475 static __thread
int NextCacheItem
= 0;
477 for (long i
= 0; i
< KERNEL_CACHE_SIZE
; i
++) {
478 // We exploit here the property that all Polly-ACC kernels are allocated
479 // as global constants, hence a pointer comparision is sufficient to
480 // determin equality.
481 if (KernelCache
[i
] &&
482 ((OpenCLKernel
*)KernelCache
[i
]->Kernel
)->BinaryString
==
484 debug_print(" -> using cached kernel\n");
485 return KernelCache
[i
];
489 PollyGPUFunction
*Function
= malloc(sizeof(PollyGPUFunction
));
491 fprintf(stderr
, "Allocate memory for Polly GPU function failed.\n");
494 Function
->Kernel
= (OpenCLKernel
*)malloc(sizeof(OpenCLKernel
));
495 if (Function
->Kernel
== 0) {
496 fprintf(stderr
, "Allocate memory for Polly OpenCL kernel failed.\n");
500 if (!GlobalDeviceID
) {
501 fprintf(stderr
, "GPGPU-code generation not initialized correctly.\n");
507 if (HandleOpenCLBeignet
) {
508 // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
509 // accepts a filename to a valid llvm-ir file as an argument, instead
510 // of accepting the BinaryBuffer directly.
511 FILE *fp
= fopen("kernel.ll", "wb");
513 fputs(BinaryBuffer
, fp
);
517 ((OpenCLKernel
*)Function
->Kernel
)->Program
=
518 clCreateProgramWithLLVMIntelFcnPtr(
519 ((OpenCLContext
*)GlobalContext
->Context
)->Context
, 1,
520 &GlobalDeviceID
, "kernel.ll", &Ret
);
521 checkOpenCLError(Ret
, "Failed to create program from llvm.\n");
524 size_t BinarySize
= strlen(BinaryBuffer
);
525 ((OpenCLKernel
*)Function
->Kernel
)->Program
=
526 clCreateProgramWithBinaryFcnPtr(
527 ((OpenCLContext
*)GlobalContext
->Context
)->Context
, 1,
528 &GlobalDeviceID
, (const size_t *)&BinarySize
,
529 (const unsigned char **)&BinaryBuffer
, NULL
, &Ret
);
530 checkOpenCLError(Ret
, "Failed to create program from binary.\n");
533 Ret
= clBuildProgramFcnPtr(((OpenCLKernel
*)Function
->Kernel
)->Program
, 1,
534 &GlobalDeviceID
, NULL
, NULL
, NULL
);
535 checkOpenCLError(Ret
, "Failed to build program.\n");
537 ((OpenCLKernel
*)Function
->Kernel
)->Kernel
= clCreateKernelFcnPtr(
538 ((OpenCLKernel
*)Function
->Kernel
)->Program
, KernelName
, &Ret
);
539 checkOpenCLError(Ret
, "Failed to create kernel.\n");
541 ((OpenCLKernel
*)Function
->Kernel
)->BinaryString
= BinaryBuffer
;
544 if (KernelCache
[NextCacheItem
])
545 freeKernelCL(KernelCache
[NextCacheItem
]);
547 KernelCache
[NextCacheItem
] = Function
;
549 NextCacheItem
= (NextCacheItem
+ 1) % KERNEL_CACHE_SIZE
;
555 static void copyFromHostToDeviceCL(void *HostData
, PollyGPUDevicePtr
*DevData
,
559 if (!GlobalContext
) {
560 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
565 Ret
= clEnqueueWriteBufferFcnPtr(
566 ((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
,
567 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
, CL_TRUE
, 0, MemSize
,
568 HostData
, 0, NULL
, NULL
);
569 checkOpenCLError(Ret
, "Copying data from host memory to device failed.\n");
572 static void copyFromDeviceToHostCL(PollyGPUDevicePtr
*DevData
, void *HostData
,
576 if (!GlobalContext
) {
577 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
582 Ret
= clEnqueueReadBufferFcnPtr(
583 ((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
,
584 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
, CL_TRUE
, 0, MemSize
,
585 HostData
, 0, NULL
, NULL
);
586 checkOpenCLError(Ret
, "Copying results from device to host memory failed.\n");
589 static void launchKernelCL(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
590 unsigned int GridDimY
, unsigned int BlockDimX
,
591 unsigned int BlockDimY
, unsigned int BlockDimZ
,
598 if (!GlobalContext
) {
599 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
603 OpenCLKernel
*CLKernel
= (OpenCLKernel
*)Kernel
->Kernel
;
604 Ret
= clGetKernelInfoFcnPtr(CLKernel
->Kernel
, CL_KERNEL_NUM_ARGS
,
605 sizeof(cl_uint
), &NumArgs
, NULL
);
606 checkOpenCLError(Ret
, "Failed to get number of kernel arguments.\n");
608 /* Argument sizes are stored at the end of the Parameters array. */
609 for (cl_uint i
= 0; i
< NumArgs
; i
++) {
610 Ret
= clSetKernelArgFcnPtr(CLKernel
->Kernel
, i
,
611 *((int *)Parameters
[NumArgs
+ i
]),
612 (void *)Parameters
[i
]);
613 checkOpenCLError(Ret
, "Failed to set Kernel argument %d.\n", i
);
616 unsigned int GridDimZ
= 1;
617 size_t GlobalWorkSize
[3] = {BlockDimX
* GridDimX
, BlockDimY
* GridDimY
,
618 BlockDimZ
* GridDimZ
};
619 size_t LocalWorkSize
[3] = {BlockDimX
, BlockDimY
, BlockDimZ
};
621 static const int WorkDim
= 3;
622 OpenCLContext
*CLContext
= (OpenCLContext
*)GlobalContext
->Context
;
623 Ret
= clEnqueueNDRangeKernelFcnPtr(CLContext
->CommandQueue
, CLKernel
->Kernel
,
624 WorkDim
, NULL
, GlobalWorkSize
,
625 LocalWorkSize
, 0, NULL
, NULL
);
626 checkOpenCLError(Ret
, "Launching OpenCL kernel failed.\n");
629 static void freeDeviceMemoryCL(PollyGPUDevicePtr
*Allocation
) {
632 OpenCLDevicePtr
*DevPtr
= (OpenCLDevicePtr
*)Allocation
->DevicePtr
;
633 cl_int Ret
= clReleaseMemObjectFcnPtr((cl_mem
)DevPtr
->MemObj
);
634 checkOpenCLError(Ret
, "Failed to free device memory.\n");
640 static PollyGPUDevicePtr
*allocateMemoryForDeviceCL(long MemSize
) {
643 if (!GlobalContext
) {
644 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
648 PollyGPUDevicePtr
*DevData
= malloc(sizeof(PollyGPUDevicePtr
));
650 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
653 DevData
->DevicePtr
= (OpenCLDevicePtr
*)malloc(sizeof(OpenCLDevicePtr
));
654 if (DevData
->DevicePtr
== 0) {
655 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
660 ((OpenCLDevicePtr
*)DevData
->DevicePtr
)->MemObj
=
661 clCreateBufferFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->Context
,
662 CL_MEM_READ_WRITE
, MemSize
, NULL
, &Ret
);
663 checkOpenCLError(Ret
,
664 "Allocate memory for GPU device memory pointer failed.\n");
669 static void *getDevicePtrCL(PollyGPUDevicePtr
*Allocation
) {
672 OpenCLDevicePtr
*DevPtr
= (OpenCLDevicePtr
*)Allocation
->DevicePtr
;
673 return (void *)DevPtr
->MemObj
;
676 static void synchronizeDeviceCL() {
679 if (!GlobalContext
) {
680 fprintf(stderr
, "GPGPU-code generation not correctly initialized.\n");
684 if (clFinishFcnPtr(((OpenCLContext
*)GlobalContext
->Context
)->CommandQueue
) !=
686 fprintf(stderr
, "Synchronizing device and host memory failed.\n");
691 static void freeContextCL(PollyGPUContext
*Context
) {
696 GlobalContext
= NULL
;
698 OpenCLContext
*Ctx
= (OpenCLContext
*)Context
->Context
;
699 if (Ctx
->CommandQueue
) {
700 Ret
= clReleaseCommandQueueFcnPtr(Ctx
->CommandQueue
);
701 checkOpenCLError(Ret
, "Could not release command queue.\n");
705 Ret
= clReleaseContextFcnPtr(Ctx
->Context
);
706 checkOpenCLError(Ret
, "Could not release context.\n");
713 static void printOpenCLError(int Error
) {
717 // Success, don't print an error.
720 // JIT/Runtime errors.
721 case CL_DEVICE_NOT_FOUND
:
722 fprintf(stderr
, "Device not found.\n");
724 case CL_DEVICE_NOT_AVAILABLE
:
725 fprintf(stderr
, "Device not available.\n");
727 case CL_COMPILER_NOT_AVAILABLE
:
728 fprintf(stderr
, "Compiler not available.\n");
730 case CL_MEM_OBJECT_ALLOCATION_FAILURE
:
731 fprintf(stderr
, "Mem object allocation failure.\n");
733 case CL_OUT_OF_RESOURCES
:
734 fprintf(stderr
, "Out of resources.\n");
736 case CL_OUT_OF_HOST_MEMORY
:
737 fprintf(stderr
, "Out of host memory.\n");
739 case CL_PROFILING_INFO_NOT_AVAILABLE
:
740 fprintf(stderr
, "Profiling info not available.\n");
742 case CL_MEM_COPY_OVERLAP
:
743 fprintf(stderr
, "Mem copy overlap.\n");
745 case CL_IMAGE_FORMAT_MISMATCH
:
746 fprintf(stderr
, "Image format mismatch.\n");
748 case CL_IMAGE_FORMAT_NOT_SUPPORTED
:
749 fprintf(stderr
, "Image format not supported.\n");
751 case CL_BUILD_PROGRAM_FAILURE
:
752 fprintf(stderr
, "Build program failure.\n");
755 fprintf(stderr
, "Map failure.\n");
757 case CL_MISALIGNED_SUB_BUFFER_OFFSET
:
758 fprintf(stderr
, "Misaligned sub buffer offset.\n");
760 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST
:
761 fprintf(stderr
, "Exec status error for events in wait list.\n");
763 case CL_COMPILE_PROGRAM_FAILURE
:
764 fprintf(stderr
, "Compile program failure.\n");
766 case CL_LINKER_NOT_AVAILABLE
:
767 fprintf(stderr
, "Linker not available.\n");
769 case CL_LINK_PROGRAM_FAILURE
:
770 fprintf(stderr
, "Link program failure.\n");
772 case CL_DEVICE_PARTITION_FAILED
:
773 fprintf(stderr
, "Device partition failed.\n");
775 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE
:
776 fprintf(stderr
, "Kernel arg info not available.\n");
780 case CL_INVALID_VALUE
:
781 fprintf(stderr
, "Invalid value.\n");
783 case CL_INVALID_DEVICE_TYPE
:
784 fprintf(stderr
, "Invalid device type.\n");
786 case CL_INVALID_PLATFORM
:
787 fprintf(stderr
, "Invalid platform.\n");
789 case CL_INVALID_DEVICE
:
790 fprintf(stderr
, "Invalid device.\n");
792 case CL_INVALID_CONTEXT
:
793 fprintf(stderr
, "Invalid context.\n");
795 case CL_INVALID_QUEUE_PROPERTIES
:
796 fprintf(stderr
, "Invalid queue properties.\n");
798 case CL_INVALID_COMMAND_QUEUE
:
799 fprintf(stderr
, "Invalid command queue.\n");
801 case CL_INVALID_HOST_PTR
:
802 fprintf(stderr
, "Invalid host pointer.\n");
804 case CL_INVALID_MEM_OBJECT
:
805 fprintf(stderr
, "Invalid memory object.\n");
807 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR
:
808 fprintf(stderr
, "Invalid image format descriptor.\n");
810 case CL_INVALID_IMAGE_SIZE
:
811 fprintf(stderr
, "Invalid image size.\n");
813 case CL_INVALID_SAMPLER
:
814 fprintf(stderr
, "Invalid sampler.\n");
816 case CL_INVALID_BINARY
:
817 fprintf(stderr
, "Invalid binary.\n");
819 case CL_INVALID_BUILD_OPTIONS
:
820 fprintf(stderr
, "Invalid build options.\n");
822 case CL_INVALID_PROGRAM
:
823 fprintf(stderr
, "Invalid program.\n");
825 case CL_INVALID_PROGRAM_EXECUTABLE
:
826 fprintf(stderr
, "Invalid program executable.\n");
828 case CL_INVALID_KERNEL_NAME
:
829 fprintf(stderr
, "Invalid kernel name.\n");
831 case CL_INVALID_KERNEL_DEFINITION
:
832 fprintf(stderr
, "Invalid kernel definition.\n");
834 case CL_INVALID_KERNEL
:
835 fprintf(stderr
, "Invalid kernel.\n");
837 case CL_INVALID_ARG_INDEX
:
838 fprintf(stderr
, "Invalid arg index.\n");
840 case CL_INVALID_ARG_VALUE
:
841 fprintf(stderr
, "Invalid arg value.\n");
843 case CL_INVALID_ARG_SIZE
:
844 fprintf(stderr
, "Invalid arg size.\n");
846 case CL_INVALID_KERNEL_ARGS
:
847 fprintf(stderr
, "Invalid kernel args.\n");
849 case CL_INVALID_WORK_DIMENSION
:
850 fprintf(stderr
, "Invalid work dimension.\n");
852 case CL_INVALID_WORK_GROUP_SIZE
:
853 fprintf(stderr
, "Invalid work group size.\n");
855 case CL_INVALID_WORK_ITEM_SIZE
:
856 fprintf(stderr
, "Invalid work item size.\n");
858 case CL_INVALID_GLOBAL_OFFSET
:
859 fprintf(stderr
, "Invalid global offset.\n");
861 case CL_INVALID_EVENT_WAIT_LIST
:
862 fprintf(stderr
, "Invalid event wait list.\n");
864 case CL_INVALID_EVENT
:
865 fprintf(stderr
, "Invalid event.\n");
867 case CL_INVALID_OPERATION
:
868 fprintf(stderr
, "Invalid operation.\n");
870 case CL_INVALID_GL_OBJECT
:
871 fprintf(stderr
, "Invalid GL object.\n");
873 case CL_INVALID_BUFFER_SIZE
:
874 fprintf(stderr
, "Invalid buffer size.\n");
876 case CL_INVALID_MIP_LEVEL
:
877 fprintf(stderr
, "Invalid mip level.\n");
879 case CL_INVALID_GLOBAL_WORK_SIZE
:
880 fprintf(stderr
, "Invalid global work size.\n");
882 case CL_INVALID_PROPERTY
:
883 fprintf(stderr
, "Invalid property.\n");
885 case CL_INVALID_IMAGE_DESCRIPTOR
:
886 fprintf(stderr
, "Invalid image descriptor.\n");
888 case CL_INVALID_COMPILER_OPTIONS
:
889 fprintf(stderr
, "Invalid compiler options.\n");
891 case CL_INVALID_LINKER_OPTIONS
:
892 fprintf(stderr
, "Invalid linker options.\n");
894 case CL_INVALID_DEVICE_PARTITION_COUNT
:
895 fprintf(stderr
, "Invalid device partition count.\n");
897 case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
898 fprintf(stderr
, "Invalid pipe size.\n");
900 case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
901 fprintf(stderr
, "Invalid device queue.\n");
904 // NVIDIA specific error.
906 fprintf(stderr
, "NVIDIA invalid read or write buffer.\n");
910 fprintf(stderr
, "Unknown error code!\n");
915 #endif /* HAS_LIBOPENCL */
916 /******************************************************************************/
918 /******************************************************************************/
921 struct CUDAContextT
{
928 const char *BinaryString
;
931 struct CUDADevicePtrT
{
935 /* Dynamic library handles for the CUDA and CUDA runtime library. */
936 static void *HandleCuda
;
937 static void *HandleCudaRT
;
939 /* Type-defines of function pointer to CUDA driver APIs. */
940 typedef CUresult CUDAAPI
CuMemAllocFcnTy(CUdeviceptr
*, size_t);
941 static CuMemAllocFcnTy
*CuMemAllocFcnPtr
;
943 typedef CUresult CUDAAPI
CuLaunchKernelFcnTy(
944 CUfunction F
, unsigned int GridDimX
, unsigned int GridDimY
,
945 unsigned int gridDimZ
, unsigned int blockDimX
, unsigned int BlockDimY
,
946 unsigned int BlockDimZ
, unsigned int SharedMemBytes
, CUstream HStream
,
947 void **KernelParams
, void **Extra
);
948 static CuLaunchKernelFcnTy
*CuLaunchKernelFcnPtr
;
950 typedef CUresult CUDAAPI
CuMemcpyDtoHFcnTy(void *, CUdeviceptr
, size_t);
951 static CuMemcpyDtoHFcnTy
*CuMemcpyDtoHFcnPtr
;
953 typedef CUresult CUDAAPI
CuMemcpyHtoDFcnTy(CUdeviceptr
, const void *, size_t);
954 static CuMemcpyHtoDFcnTy
*CuMemcpyHtoDFcnPtr
;
956 typedef CUresult CUDAAPI
CuMemFreeFcnTy(CUdeviceptr
);
957 static CuMemFreeFcnTy
*CuMemFreeFcnPtr
;
959 typedef CUresult CUDAAPI
CuModuleUnloadFcnTy(CUmodule
);
960 static CuModuleUnloadFcnTy
*CuModuleUnloadFcnPtr
;
962 typedef CUresult CUDAAPI
CuProfilerStopFcnTy();
963 static CuProfilerStopFcnTy
*CuProfilerStopFcnPtr
;
965 typedef CUresult CUDAAPI
CuCtxDestroyFcnTy(CUcontext
);
966 static CuCtxDestroyFcnTy
*CuCtxDestroyFcnPtr
;
968 typedef CUresult CUDAAPI
CuInitFcnTy(unsigned int);
969 static CuInitFcnTy
*CuInitFcnPtr
;
971 typedef CUresult CUDAAPI
CuDeviceGetCountFcnTy(int *);
972 static CuDeviceGetCountFcnTy
*CuDeviceGetCountFcnPtr
;
974 typedef CUresult CUDAAPI
CuCtxCreateFcnTy(CUcontext
*, unsigned int, CUdevice
);
975 static CuCtxCreateFcnTy
*CuCtxCreateFcnPtr
;
977 typedef CUresult CUDAAPI
CuCtxGetCurrentFcnTy(CUcontext
*);
978 static CuCtxGetCurrentFcnTy
*CuCtxGetCurrentFcnPtr
;
980 typedef CUresult CUDAAPI
CuDeviceGetFcnTy(CUdevice
*, int);
981 static CuDeviceGetFcnTy
*CuDeviceGetFcnPtr
;
983 typedef CUresult CUDAAPI
CuModuleLoadDataExFcnTy(CUmodule
*, const void *,
984 unsigned int, CUjit_option
*,
986 static CuModuleLoadDataExFcnTy
*CuModuleLoadDataExFcnPtr
;
988 typedef CUresult CUDAAPI
CuModuleLoadDataFcnTy(CUmodule
*Module
,
990 static CuModuleLoadDataFcnTy
*CuModuleLoadDataFcnPtr
;
992 typedef CUresult CUDAAPI
CuModuleGetFunctionFcnTy(CUfunction
*, CUmodule
,
994 static CuModuleGetFunctionFcnTy
*CuModuleGetFunctionFcnPtr
;
996 typedef CUresult CUDAAPI
CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice
);
997 static CuDeviceComputeCapabilityFcnTy
*CuDeviceComputeCapabilityFcnPtr
;
999 typedef CUresult CUDAAPI
CuDeviceGetNameFcnTy(char *, int, CUdevice
);
1000 static CuDeviceGetNameFcnTy
*CuDeviceGetNameFcnPtr
;
1002 typedef CUresult CUDAAPI
CuLinkAddDataFcnTy(CUlinkState State
,
1003 CUjitInputType Type
, void *Data
,
1004 size_t Size
, const char *Name
,
1005 unsigned int NumOptions
,
1006 CUjit_option
*Options
,
1007 void **OptionValues
);
1008 static CuLinkAddDataFcnTy
*CuLinkAddDataFcnPtr
;
1010 typedef CUresult CUDAAPI
CuLinkCreateFcnTy(unsigned int NumOptions
,
1011 CUjit_option
*Options
,
1012 void **OptionValues
,
1013 CUlinkState
*StateOut
);
1014 static CuLinkCreateFcnTy
*CuLinkCreateFcnPtr
;
1016 typedef CUresult CUDAAPI
CuLinkCompleteFcnTy(CUlinkState State
, void **CubinOut
,
1018 static CuLinkCompleteFcnTy
*CuLinkCompleteFcnPtr
;
1020 typedef CUresult CUDAAPI
CuLinkDestroyFcnTy(CUlinkState State
);
1021 static CuLinkDestroyFcnTy
*CuLinkDestroyFcnPtr
;
1023 typedef CUresult CUDAAPI
CuCtxSynchronizeFcnTy();
1024 static CuCtxSynchronizeFcnTy
*CuCtxSynchronizeFcnPtr
;
1026 /* Type-defines of function pointer ot CUDA runtime APIs. */
1027 typedef cudaError_t CUDARTAPI
CudaThreadSynchronizeFcnTy(void);
1028 static CudaThreadSynchronizeFcnTy
*CudaThreadSynchronizeFcnPtr
;
1030 static void *getAPIHandleCUDA(void *Handle
, const char *FuncName
) {
1034 FuncPtr
= dlsym(Handle
, FuncName
);
1035 if ((Err
= dlerror()) != 0) {
1036 fprintf(stderr
, "Load CUDA driver API failed: %s. \n", Err
);
1042 static int initialDeviceAPILibrariesCUDA() {
1043 HandleCuda
= dlopen("libcuda.so", RTLD_LAZY
);
1045 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
1049 HandleCudaRT
= dlopen("libcudart.so", RTLD_LAZY
);
1050 if (!HandleCudaRT
) {
1051 fprintf(stderr
, "Cannot open library: %s. \n", dlerror());
1058 /* Get function pointer to CUDA Driver APIs.
1060 * Note that compilers conforming to the ISO C standard are required to
1061 * generate a warning if a conversion from a void * pointer to a function
1062 * pointer is attempted as in the following statements. The warning
1063 * of this kind of cast may not be emitted by clang and new versions of gcc
1064 * as it is valid on POSIX 2008. For compilers required to generate a warning,
1065 * we temporarily disable -Wpedantic, to avoid bloating the output with
1066 * unnecessary warnings.
1069 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1071 #pragma GCC diagnostic push
1072 #pragma GCC diagnostic ignored "-Wpedantic"
1073 static int initialDeviceAPIsCUDA() {
1074 if (initialDeviceAPILibrariesCUDA() == 0)
1077 CuLaunchKernelFcnPtr
=
1078 (CuLaunchKernelFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLaunchKernel");
1081 (CuMemAllocFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemAlloc_v2");
1084 (CuMemFreeFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemFree_v2");
1086 CuMemcpyDtoHFcnPtr
=
1087 (CuMemcpyDtoHFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemcpyDtoH_v2");
1089 CuMemcpyHtoDFcnPtr
=
1090 (CuMemcpyHtoDFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuMemcpyHtoD_v2");
1092 CuModuleUnloadFcnPtr
=
1093 (CuModuleUnloadFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuModuleUnload");
1095 CuProfilerStopFcnPtr
=
1096 (CuProfilerStopFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuProfilerStop");
1098 CuCtxDestroyFcnPtr
=
1099 (CuCtxDestroyFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxDestroy");
1101 CuInitFcnPtr
= (CuInitFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuInit");
1103 CuDeviceGetCountFcnPtr
=
1104 (CuDeviceGetCountFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGetCount");
1107 (CuDeviceGetFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGet");
1110 (CuCtxCreateFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxCreate_v2");
1112 CuCtxGetCurrentFcnPtr
=
1113 (CuCtxGetCurrentFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxGetCurrent");
1115 CuModuleLoadDataExFcnPtr
= (CuModuleLoadDataExFcnTy
*)getAPIHandleCUDA(
1116 HandleCuda
, "cuModuleLoadDataEx");
1118 CuModuleLoadDataFcnPtr
=
1119 (CuModuleLoadDataFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuModuleLoadData");
1121 CuModuleGetFunctionFcnPtr
= (CuModuleGetFunctionFcnTy
*)getAPIHandleCUDA(
1122 HandleCuda
, "cuModuleGetFunction");
1124 CuDeviceComputeCapabilityFcnPtr
=
1125 (CuDeviceComputeCapabilityFcnTy
*)getAPIHandleCUDA(
1126 HandleCuda
, "cuDeviceComputeCapability");
1128 CuDeviceGetNameFcnPtr
=
1129 (CuDeviceGetNameFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuDeviceGetName");
1131 CuLinkAddDataFcnPtr
=
1132 (CuLinkAddDataFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkAddData");
1134 CuLinkCreateFcnPtr
=
1135 (CuLinkCreateFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkCreate");
1137 CuLinkCompleteFcnPtr
=
1138 (CuLinkCompleteFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkComplete");
1140 CuLinkDestroyFcnPtr
=
1141 (CuLinkDestroyFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuLinkDestroy");
1143 CuCtxSynchronizeFcnPtr
=
1144 (CuCtxSynchronizeFcnTy
*)getAPIHandleCUDA(HandleCuda
, "cuCtxSynchronize");
1146 /* Get function pointer to CUDA Runtime APIs. */
1147 CudaThreadSynchronizeFcnPtr
= (CudaThreadSynchronizeFcnTy
*)getAPIHandleCUDA(
1148 HandleCudaRT
, "cudaThreadSynchronize");
1152 #pragma GCC diagnostic pop
1154 static PollyGPUContext
*initContextCUDA() {
1156 PollyGPUContext
*Context
;
1159 int Major
= 0, Minor
= 0, DeviceID
= 0;
1160 char DeviceName
[256];
1161 int DeviceCount
= 0;
1163 static __thread PollyGPUContext
*CurrentContext
= NULL
;
1166 return CurrentContext
;
1168 /* Get API handles. */
1169 if (initialDeviceAPIsCUDA() == 0) {
1170 fprintf(stderr
, "Getting the \"handle\" for the CUDA driver API failed.\n");
1174 if (CuInitFcnPtr(0) != CUDA_SUCCESS
) {
1175 fprintf(stderr
, "Initializing the CUDA driver API failed.\n");
1179 /* Get number of devices that supports CUDA. */
1180 CuDeviceGetCountFcnPtr(&DeviceCount
);
1181 if (DeviceCount
== 0) {
1182 fprintf(stderr
, "There is no device supporting CUDA.\n");
1186 CuDeviceGetFcnPtr(&Device
, 0);
1188 /* Get compute capabilities and the device name. */
1189 CuDeviceComputeCapabilityFcnPtr(&Major
, &Minor
, Device
);
1190 CuDeviceGetNameFcnPtr(DeviceName
, 256, Device
);
1191 debug_print("> Running on GPU device %d : %s.\n", DeviceID
, DeviceName
);
1193 /* Create context on the device. */
1194 Context
= (PollyGPUContext
*)malloc(sizeof(PollyGPUContext
));
1196 fprintf(stderr
, "Allocate memory for Polly GPU context failed.\n");
1199 Context
->Context
= malloc(sizeof(CUDAContext
));
1200 if (Context
->Context
== 0) {
1201 fprintf(stderr
, "Allocate memory for Polly CUDA context failed.\n");
1205 // In cases where managed memory is used, it is quite likely that
1206 // `cudaMallocManaged` / `polly_mallocManaged` was called before
1207 // `polly_initContext` was called.
1209 // If `polly_initContext` calls `CuCtxCreate` when there already was a
1210 // pre-existing context created by the runtime API, this causes code running
1211 // on P100 to hang. So, we query for a pre-existing context to try and use.
1212 // If there is no pre-existing context, we create a new context
1214 // The possible pre-existing context from previous runtime API calls.
1215 CUcontext MaybeRuntimeAPIContext
;
1216 if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext
) != CUDA_SUCCESS
) {
1217 fprintf(stderr
, "cuCtxGetCurrent failed.\n");
1221 // There was no previous context, initialise it.
1222 if (MaybeRuntimeAPIContext
== NULL
) {
1223 if (CuCtxCreateFcnPtr(&(((CUDAContext
*)Context
->Context
)->Cuda
), 0,
1224 Device
) != CUDA_SUCCESS
) {
1225 fprintf(stderr
, "cuCtxCreateFcnPtr failed.\n");
1229 ((CUDAContext
*)Context
->Context
)->Cuda
= MaybeRuntimeAPIContext
;
1233 CurrentContext
= Context
;
1238 static void freeKernelCUDA(PollyGPUFunction
*Kernel
) {
1244 if (((CUDAKernel
*)Kernel
->Kernel
)->CudaModule
)
1245 CuModuleUnloadFcnPtr(((CUDAKernel
*)Kernel
->Kernel
)->CudaModule
);
1248 free((CUDAKernel
*)Kernel
->Kernel
);
1254 static PollyGPUFunction
*getKernelCUDA(const char *BinaryBuffer
,
1255 const char *KernelName
) {
1258 static __thread PollyGPUFunction
*KernelCache
[KERNEL_CACHE_SIZE
];
1259 static __thread
int NextCacheItem
= 0;
1261 for (long i
= 0; i
< KERNEL_CACHE_SIZE
; i
++) {
1262 // We exploit here the property that all Polly-ACC kernels are allocated
1263 // as global constants, hence a pointer comparision is sufficient to
1264 // determin equality.
1265 if (KernelCache
[i
] &&
1266 ((CUDAKernel
*)KernelCache
[i
]->Kernel
)->BinaryString
== BinaryBuffer
) {
1267 debug_print(" -> using cached kernel\n");
1268 return KernelCache
[i
];
1272 PollyGPUFunction
*Function
= malloc(sizeof(PollyGPUFunction
));
1273 if (Function
== 0) {
1274 fprintf(stderr
, "Allocate memory for Polly GPU function failed.\n");
1277 Function
->Kernel
= (CUDAKernel
*)malloc(sizeof(CUDAKernel
));
1278 if (Function
->Kernel
== 0) {
1279 fprintf(stderr
, "Allocate memory for Polly CUDA function failed.\n");
1285 CUjit_option Options
[6];
1286 void *OptionVals
[6];
1288 unsigned long LogSize
= 8192;
1289 char ErrorLog
[8192], InfoLog
[8192];
1293 // Setup linker options
1294 // Return walltime from JIT compilation
1295 Options
[0] = CU_JIT_WALL_TIME
;
1296 OptionVals
[0] = (void *)&Walltime
;
1297 // Pass a buffer for info messages
1298 Options
[1] = CU_JIT_INFO_LOG_BUFFER
;
1299 OptionVals
[1] = (void *)InfoLog
;
1300 // Pass the size of the info buffer
1301 Options
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
1302 OptionVals
[2] = (void *)LogSize
;
1303 // Pass a buffer for error message
1304 Options
[3] = CU_JIT_ERROR_LOG_BUFFER
;
1305 OptionVals
[3] = (void *)ErrorLog
;
1306 // Pass the size of the error buffer
1307 Options
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
1308 OptionVals
[4] = (void *)LogSize
;
1309 // Make the linker verbose
1310 Options
[5] = CU_JIT_LOG_VERBOSE
;
1311 OptionVals
[5] = (void *)1;
1313 memset(ErrorLog
, 0, sizeof(ErrorLog
));
1315 CuLinkCreateFcnPtr(6, Options
, OptionVals
, &LState
);
1316 Res
= CuLinkAddDataFcnPtr(LState
, CU_JIT_INPUT_PTX
, (void *)BinaryBuffer
,
1317 strlen(BinaryBuffer
) + 1, 0, 0, 0, 0);
1318 if (Res
!= CUDA_SUCCESS
) {
1319 fprintf(stderr
, "PTX Linker Error:\n%s\n%s", ErrorLog
, InfoLog
);
1323 Res
= CuLinkCompleteFcnPtr(LState
, &CuOut
, &OutSize
);
1324 if (Res
!= CUDA_SUCCESS
) {
1325 fprintf(stderr
, "Complete ptx linker step failed.\n");
1326 fprintf(stderr
, "\n%s\n", ErrorLog
);
1330 debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime
,
1333 Res
= CuModuleLoadDataFcnPtr(&(((CUDAKernel
*)Function
->Kernel
)->CudaModule
),
1335 if (Res
!= CUDA_SUCCESS
) {
1336 fprintf(stderr
, "Loading ptx assembly text failed.\n");
1340 Res
= CuModuleGetFunctionFcnPtr(&(((CUDAKernel
*)Function
->Kernel
)->Cuda
),
1341 ((CUDAKernel
*)Function
->Kernel
)->CudaModule
,
1343 if (Res
!= CUDA_SUCCESS
) {
1344 fprintf(stderr
, "Loading kernel function failed.\n");
1348 CuLinkDestroyFcnPtr(LState
);
1350 ((CUDAKernel
*)Function
->Kernel
)->BinaryString
= BinaryBuffer
;
1353 if (KernelCache
[NextCacheItem
])
1354 freeKernelCUDA(KernelCache
[NextCacheItem
]);
1356 KernelCache
[NextCacheItem
] = Function
;
1358 NextCacheItem
= (NextCacheItem
+ 1) % KERNEL_CACHE_SIZE
;
1364 static void synchronizeDeviceCUDA() {
1366 if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS
) {
1367 fprintf(stderr
, "Synchronizing device and host memory failed.\n");
1372 static void copyFromHostToDeviceCUDA(void *HostData
, PollyGPUDevicePtr
*DevData
,
1376 CUdeviceptr CuDevData
= ((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
;
1377 CuMemcpyHtoDFcnPtr(CuDevData
, HostData
, MemSize
);
1380 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr
*DevData
, void *HostData
,
1384 if (CuMemcpyDtoHFcnPtr(HostData
, ((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
,
1385 MemSize
) != CUDA_SUCCESS
) {
1386 fprintf(stderr
, "Copying results from device to host memory failed.\n");
1391 static void launchKernelCUDA(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
1392 unsigned int GridDimY
, unsigned int BlockDimX
,
1393 unsigned int BlockDimY
, unsigned int BlockDimZ
,
1394 void **Parameters
) {
1397 unsigned GridDimZ
= 1;
1398 unsigned int SharedMemBytes
= CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE
;
1399 CUstream Stream
= 0;
1404 CuLaunchKernelFcnPtr(((CUDAKernel
*)Kernel
->Kernel
)->Cuda
, GridDimX
,
1405 GridDimY
, GridDimZ
, BlockDimX
, BlockDimY
, BlockDimZ
,
1406 SharedMemBytes
, Stream
, Parameters
, Extra
);
1407 if (Res
!= CUDA_SUCCESS
) {
1408 fprintf(stderr
, "Launching CUDA kernel failed.\n");
1413 // Maximum number of managed memory pointers.
1414 #define MAX_POINTERS 4000
1415 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1416 void *g_managedptrs
[MAX_POINTERS
];
1417 int g_nmanagedptrs
= 0;
1419 // Add a pointer as being allocated by cuMallocManaged
1420 void addManagedPtr(void *mem
) {
1421 assert(g_nmanagedptrs
< MAX_POINTERS
&& "We have hit the maximum number of "
1422 "managed pointers allowed. Increase "
1424 g_managedptrs
[g_nmanagedptrs
++] = mem
;
1427 int isManagedPtr(void *mem
) {
1428 for (int i
= 0; i
< g_nmanagedptrs
; i
++) {
1429 if (g_managedptrs
[i
] == mem
)
1435 void polly_freeManaged(void *mem
) {
1438 // In a real-world program this was used (COSMO), there were more `free`
1439 // calls in the original source than `malloc` calls. Hence, replacing all
1440 // `free`s with `cudaFree` does not work, since we would try to free
1441 // 'illegal' memory.
1442 // As a quick fix, we keep a free list and check if `mem` is a managed memory
1443 // pointer. If it is, we call `cudaFree`.
1444 // If not, we pass it along to the underlying allocator.
1445 // This is a hack, and can be removed if the underlying issue is fixed.
1446 if (isManagedPtr(mem
)) {
1447 if (cudaFree(mem
) != cudaSuccess
) {
1448 fprintf(stderr
, "cudaFree failed.\n");
1457 void *polly_mallocManaged(size_t size
) {
1460 if (cudaMallocManaged(&a
, size
, cudaMemAttachGlobal
) != cudaSuccess
) {
1461 fprintf(stderr
, "cudaMallocManaged failed for size: %zu\n", size
);
1468 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr
*Allocation
) {
1470 CUDADevicePtr
*DevPtr
= (CUDADevicePtr
*)Allocation
->DevicePtr
;
1471 CuMemFreeFcnPtr((CUdeviceptr
)DevPtr
->Cuda
);
1476 static PollyGPUDevicePtr
*allocateMemoryForDeviceCUDA(long MemSize
) {
1479 PollyGPUDevicePtr
*DevData
= malloc(sizeof(PollyGPUDevicePtr
));
1481 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
1484 DevData
->DevicePtr
= (CUDADevicePtr
*)malloc(sizeof(CUDADevicePtr
));
1485 if (DevData
->DevicePtr
== 0) {
1486 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
1491 CuMemAllocFcnPtr(&(((CUDADevicePtr
*)DevData
->DevicePtr
)->Cuda
), MemSize
);
1493 if (Res
!= CUDA_SUCCESS
) {
1494 fprintf(stderr
, "Allocate memory for GPU device memory pointer failed.\n");
1501 static void *getDevicePtrCUDA(PollyGPUDevicePtr
*Allocation
) {
1504 CUDADevicePtr
*DevPtr
= (CUDADevicePtr
*)Allocation
->DevicePtr
;
1505 return (void *)DevPtr
->Cuda
;
1508 static void freeContextCUDA(PollyGPUContext
*Context
) {
1511 CUDAContext
*Ctx
= (CUDAContext
*)Context
->Context
;
1513 CuProfilerStopFcnPtr();
1514 CuCtxDestroyFcnPtr(Ctx
->Cuda
);
1519 dlclose(HandleCuda
);
1520 dlclose(HandleCudaRT
);
1523 #endif /* HAS_LIBCUDART */
1524 /******************************************************************************/
1526 /******************************************************************************/
1528 PollyGPUContext
*polly_initContext() {
1529 DebugMode
= getenv("POLLY_DEBUG") != 0;
1530 CacheMode
= getenv("POLLY_NOCACHE") == 0;
1534 PollyGPUContext
*Context
;
1537 #ifdef HAS_LIBCUDART
1539 Context
= initContextCUDA();
1541 #endif /* HAS_LIBCUDART */
1542 #ifdef HAS_LIBOPENCL
1544 Context
= initContextCL();
1546 #endif /* HAS_LIBOPENCL */
1554 void polly_freeKernel(PollyGPUFunction
*Kernel
) {
1558 #ifdef HAS_LIBCUDART
1560 freeKernelCUDA(Kernel
);
1562 #endif /* HAS_LIBCUDART */
1563 #ifdef HAS_LIBOPENCL
1565 freeKernelCL(Kernel
);
1567 #endif /* HAS_LIBOPENCL */
1573 PollyGPUFunction
*polly_getKernel(const char *BinaryBuffer
,
1574 const char *KernelName
) {
1577 PollyGPUFunction
*Function
;
1580 #ifdef HAS_LIBCUDART
1582 Function
= getKernelCUDA(BinaryBuffer
, KernelName
);
1584 #endif /* HAS_LIBCUDART */
1585 #ifdef HAS_LIBOPENCL
1587 Function
= getKernelCL(BinaryBuffer
, KernelName
);
1589 #endif /* HAS_LIBOPENCL */
1597 void polly_copyFromHostToDevice(void *HostData
, PollyGPUDevicePtr
*DevData
,
1602 #ifdef HAS_LIBCUDART
1604 copyFromHostToDeviceCUDA(HostData
, DevData
, MemSize
);
1606 #endif /* HAS_LIBCUDART */
1607 #ifdef HAS_LIBOPENCL
1609 copyFromHostToDeviceCL(HostData
, DevData
, MemSize
);
1611 #endif /* HAS_LIBOPENCL */
1617 void polly_copyFromDeviceToHost(PollyGPUDevicePtr
*DevData
, void *HostData
,
1622 #ifdef HAS_LIBCUDART
1624 copyFromDeviceToHostCUDA(DevData
, HostData
, MemSize
);
1626 #endif /* HAS_LIBCUDART */
1627 #ifdef HAS_LIBOPENCL
1629 copyFromDeviceToHostCL(DevData
, HostData
, MemSize
);
1631 #endif /* HAS_LIBOPENCL */
1637 void polly_launchKernel(PollyGPUFunction
*Kernel
, unsigned int GridDimX
,
1638 unsigned int GridDimY
, unsigned int BlockDimX
,
1639 unsigned int BlockDimY
, unsigned int BlockDimZ
,
1640 void **Parameters
) {
1644 #ifdef HAS_LIBCUDART
1646 launchKernelCUDA(Kernel
, GridDimX
, GridDimY
, BlockDimX
, BlockDimY
,
1647 BlockDimZ
, Parameters
);
1649 #endif /* HAS_LIBCUDART */
1650 #ifdef HAS_LIBOPENCL
1652 launchKernelCL(Kernel
, GridDimX
, GridDimY
, BlockDimX
, BlockDimY
, BlockDimZ
,
1655 #endif /* HAS_LIBOPENCL */
1661 void polly_freeDeviceMemory(PollyGPUDevicePtr
*Allocation
) {
1665 #ifdef HAS_LIBCUDART
1667 freeDeviceMemoryCUDA(Allocation
);
1669 #endif /* HAS_LIBCUDART */
1670 #ifdef HAS_LIBOPENCL
1672 freeDeviceMemoryCL(Allocation
);
1674 #endif /* HAS_LIBOPENCL */
1680 PollyGPUDevicePtr
*polly_allocateMemoryForDevice(long MemSize
) {
1683 PollyGPUDevicePtr
*DevData
;
1686 #ifdef HAS_LIBCUDART
1688 DevData
= allocateMemoryForDeviceCUDA(MemSize
);
1690 #endif /* HAS_LIBCUDART */
1691 #ifdef HAS_LIBOPENCL
1693 DevData
= allocateMemoryForDeviceCL(MemSize
);
1695 #endif /* HAS_LIBOPENCL */
1703 void *polly_getDevicePtr(PollyGPUDevicePtr
*Allocation
) {
1709 #ifdef HAS_LIBCUDART
1711 DevPtr
= getDevicePtrCUDA(Allocation
);
1713 #endif /* HAS_LIBCUDART */
1714 #ifdef HAS_LIBOPENCL
1716 DevPtr
= getDevicePtrCL(Allocation
);
1718 #endif /* HAS_LIBOPENCL */
1726 void polly_synchronizeDevice() {
1730 #ifdef HAS_LIBCUDART
1732 synchronizeDeviceCUDA();
1734 #endif /* HAS_LIBCUDART */
1735 #ifdef HAS_LIBOPENCL
1737 synchronizeDeviceCL();
1739 #endif /* HAS_LIBOPENCL */
1745 void polly_freeContext(PollyGPUContext
*Context
) {
1752 #ifdef HAS_LIBCUDART
1754 freeContextCUDA(Context
);
1756 #endif /* HAS_LIBCUDART */
1757 #ifdef HAS_LIBOPENCL
1759 freeContextCL(Context
);
1761 #endif /* HAS_LIBOPENCL */
1767 /* Initialize GPUJIT with CUDA as runtime library. */
1768 PollyGPUContext
*polly_initContextCUDA() {
1769 #ifdef HAS_LIBCUDART
1770 Runtime
= RUNTIME_CUDA
;
1771 return polly_initContext();
1773 fprintf(stderr
, "GPU Runtime was built without CUDA support.\n");
1775 #endif /* HAS_LIBCUDART */
1778 /* Initialize GPUJIT with OpenCL as runtime library. */
1779 PollyGPUContext
*polly_initContextCL() {
1780 #ifdef HAS_LIBOPENCL
1781 Runtime
= RUNTIME_CL
;
1782 return polly_initContext();
1784 fprintf(stderr
, "GPU Runtime was built without OpenCL support.\n");
1786 #endif /* HAS_LIBOPENCL */