[test] Fix test case without Polly-ACC.
[polly-mirror.git] / tools / GPURuntime / GPUJIT.c
blob74d183909d34bbb34e6325db2fcc7d3168890790
1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
2 /* */
3 /* The LLVM Compiler Infrastructure */
4 /* */
5 /* This file is dual licensed under the MIT and the University of Illinois */
6 /* Open Source License. See LICENSE.TXT for details. */
7 /* */
8 /******************************************************************************/
9 /* */
10 /* This file implements GPUJIT, a ptx string execution engine for GPU. */
11 /* */
12 /******************************************************************************/
14 #include "GPUJIT.h"
16 #ifdef HAS_LIBCUDART
17 #include <cuda.h>
18 #include <cuda_runtime.h>
19 #endif /* HAS_LIBCUDART */
21 #ifdef HAS_LIBOPENCL
22 #ifdef __APPLE__
23 #include <OpenCL/opencl.h>
24 #else
25 #include <CL/cl.h>
26 #endif /* __APPLE__ */
27 #endif /* HAS_LIBOPENCL */
29 #include <assert.h>
30 #include <dlfcn.h>
31 #include <stdarg.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <unistd.h>
36 static int DebugMode;
37 static int CacheMode;
39 static PollyGPURuntime Runtime = RUNTIME_NONE;
41 static void debug_print(const char *format, ...) {
42 if (!DebugMode)
43 return;
45 va_list args;
46 va_start(args, format);
47 vfprintf(stderr, format, args);
48 va_end(args);
50 #define dump_function() debug_print("-> %s\n", __func__)
52 #define KERNEL_CACHE_SIZE 10
54 static void err_runtime() __attribute__((noreturn));
55 static void err_runtime() {
56 fprintf(stderr, "Runtime not correctly initialized.\n");
57 exit(-1);
60 struct PollyGPUContextT {
61 void *Context;
64 struct PollyGPUFunctionT {
65 void *Kernel;
68 struct PollyGPUDevicePtrT {
69 void *DevicePtr;
72 /******************************************************************************/
73 /* OpenCL */
74 /******************************************************************************/
75 #ifdef HAS_LIBOPENCL
77 struct OpenCLContextT {
78 cl_context Context;
79 cl_command_queue CommandQueue;
82 struct OpenCLKernelT {
83 cl_kernel Kernel;
84 cl_program Program;
85 const char *BinaryString;
88 struct OpenCLDevicePtrT {
89 cl_mem MemObj;
92 /* Dynamic library handles for the OpenCL runtime library. */
93 static void *HandleOpenCL;
94 static void *HandleOpenCLBeignet;
96 /* Type-defines of function pointer to OpenCL Runtime API. */
97 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
98 cl_platform_id *Platforms,
99 cl_uint *NumPlatforms);
100 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
102 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
103 cl_device_type DeviceType,
104 cl_uint NumEntries, cl_device_id *Devices,
105 cl_uint *NumDevices);
106 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
108 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
109 cl_device_info ParamName,
110 size_t ParamValueSize, void *ParamValue,
111 size_t *ParamValueSizeRet);
112 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
114 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
115 size_t ParamValueSize, void *ParamValue,
116 size_t *ParamValueSizeRet);
117 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
119 typedef cl_context clCreateContextFcnTy(
120 const cl_context_properties *Properties, cl_uint NumDevices,
121 const cl_device_id *Devices,
122 void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
123 size_t CB, void *UserData),
124 void *UserData, cl_int *ErrcodeRet);
125 static clCreateContextFcnTy *clCreateContextFcnPtr;
127 typedef cl_command_queue
128 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
129 cl_command_queue_properties Properties,
130 cl_int *ErrcodeRet);
131 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
133 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
134 size_t Size, void *HostPtr,
135 cl_int *ErrcodeRet);
136 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
138 typedef cl_int
139 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
140 cl_bool BlockingWrite, size_t Offset, size_t Size,
141 const void *Ptr, cl_uint NumEventsInWaitList,
142 const cl_event *EventWaitList, cl_event *Event);
143 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
145 typedef cl_program
146 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
147 const cl_device_id *DeviceList,
148 const char *Filename, cl_int *ErrcodeRet);
149 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
151 typedef cl_program clCreateProgramWithBinaryFcnTy(
152 cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
153 const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
154 cl_int *ErrcodeRet);
155 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
157 typedef cl_int clBuildProgramFcnTy(
158 cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
159 const char *Options,
160 void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
161 void *UserData);
162 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
164 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
165 const char *KernelName,
166 cl_int *ErrcodeRet);
167 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
169 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
170 size_t ArgSize, const void *ArgValue);
171 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
173 typedef cl_int clEnqueueNDRangeKernelFcnTy(
174 cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
175 const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
176 const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
177 const cl_event *EventWaitList, cl_event *Event);
178 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
180 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
181 cl_mem Buffer, cl_bool BlockingRead,
182 size_t Offset, size_t Size, void *Ptr,
183 cl_uint NumEventsInWaitList,
184 const cl_event *EventWaitList,
185 cl_event *Event);
186 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
188 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
189 static clFlushFcnTy *clFlushFcnPtr;
191 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
192 static clFinishFcnTy *clFinishFcnPtr;
194 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
195 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
197 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
198 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
200 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
201 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
203 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
204 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
206 typedef cl_int clReleaseContextFcnTy(cl_context Context);
207 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
209 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
210 char *Err;
211 void *FuncPtr;
212 dlerror();
213 FuncPtr = dlsym(Handle, FuncName);
214 if ((Err = dlerror()) != 0) {
215 fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
216 return 0;
218 return FuncPtr;
221 static int initialDeviceAPILibrariesCL() {
222 HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
223 HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
224 if (!HandleOpenCL) {
225 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
226 return 0;
228 return 1;
231 /* Get function pointer to OpenCL Runtime API.
233 * Note that compilers conforming to the ISO C standard are required to
234 * generate a warning if a conversion from a void * pointer to a function
235 * pointer is attempted as in the following statements. The warning
236 * of this kind of cast may not be emitted by clang and new versions of gcc
237 * as it is valid on POSIX 2008. For compilers required to generate a warning,
238 * we temporarily disable -Wpedantic, to avoid bloating the output with
239 * unnecessary warnings.
241 * Reference:
242 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
244 #pragma GCC diagnostic push
245 #pragma GCC diagnostic ignored "-Wpedantic"
246 static int initialDeviceAPIsCL() {
247 if (initialDeviceAPILibrariesCL() == 0)
248 return 0;
250 // FIXME: We are now always selecting the Intel Beignet driver if it is
251 // available on the system, instead of a possible NVIDIA or AMD OpenCL
252 // API. This selection should occurr based on the target architecture
253 // chosen when compiling.
254 void *Handle =
255 (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
257 clGetPlatformIDsFcnPtr =
258 (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
260 clGetDeviceIDsFcnPtr =
261 (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
263 clGetDeviceInfoFcnPtr =
264 (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
266 clGetKernelInfoFcnPtr =
267 (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
269 clCreateContextFcnPtr =
270 (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
272 clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
273 Handle, "clCreateCommandQueue");
275 clCreateBufferFcnPtr =
276 (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
278 clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
279 Handle, "clEnqueueWriteBuffer");
281 if (HandleOpenCLBeignet)
282 clCreateProgramWithLLVMIntelFcnPtr =
283 (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
284 Handle, "clCreateProgramWithLLVMIntel");
286 clCreateProgramWithBinaryFcnPtr =
287 (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
288 Handle, "clCreateProgramWithBinary");
290 clBuildProgramFcnPtr =
291 (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
293 clCreateKernelFcnPtr =
294 (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
296 clSetKernelArgFcnPtr =
297 (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
299 clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
300 Handle, "clEnqueueNDRangeKernel");
302 clEnqueueReadBufferFcnPtr =
303 (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
305 clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
307 clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
309 clReleaseKernelFcnPtr =
310 (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
312 clReleaseProgramFcnPtr =
313 (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
315 clReleaseMemObjectFcnPtr =
316 (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
318 clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
319 Handle, "clReleaseCommandQueue");
321 clReleaseContextFcnPtr =
322 (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
324 return 1;
326 #pragma GCC diagnostic pop
328 /* Context and Device. */
329 static PollyGPUContext *GlobalContext = NULL;
330 static cl_device_id GlobalDeviceID = NULL;
332 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
333 static void printOpenCLError(int Error);
335 static void checkOpenCLError(int Ret, const char *format, ...) {
336 if (Ret == CL_SUCCESS)
337 return;
339 printOpenCLError(Ret);
340 va_list args;
341 va_start(args, format);
342 vfprintf(stderr, format, args);
343 va_end(args);
344 exit(-1);
347 static PollyGPUContext *initContextCL() {
348 dump_function();
350 PollyGPUContext *Context;
352 cl_platform_id PlatformID = NULL;
353 cl_device_id DeviceID = NULL;
354 cl_uint NumDevicesRet;
355 cl_int Ret;
357 char DeviceRevision[256];
358 char DeviceName[256];
359 size_t DeviceRevisionRetSize, DeviceNameRetSize;
361 static __thread PollyGPUContext *CurrentContext = NULL;
363 if (CurrentContext)
364 return CurrentContext;
366 /* Get API handles. */
367 if (initialDeviceAPIsCL() == 0) {
368 fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
369 exit(-1);
372 /* Get number of devices that support OpenCL. */
373 static const int NumberOfPlatforms = 1;
374 Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
375 checkOpenCLError(Ret, "Failed to get platform IDs.\n");
376 // TODO: Extend to CL_DEVICE_TYPE_ALL?
377 static const int NumberOfDevices = 1;
378 Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
379 &DeviceID, &NumDevicesRet);
380 checkOpenCLError(Ret, "Failed to get device IDs.\n");
382 GlobalDeviceID = DeviceID;
383 if (NumDevicesRet == 0) {
384 fprintf(stderr, "There is no device supporting OpenCL.\n");
385 exit(-1);
388 /* Get device revision. */
389 Ret =
390 clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
391 DeviceRevision, &DeviceRevisionRetSize);
392 checkOpenCLError(Ret, "Failed to fetch device revision.\n");
394 /* Get device name. */
395 Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
396 DeviceName, &DeviceNameRetSize);
397 checkOpenCLError(Ret, "Failed to fetch device name.\n");
399 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
401 /* Create context on the device. */
402 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
403 if (Context == 0) {
404 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
405 exit(-1);
407 Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
408 if (Context->Context == 0) {
409 fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
410 exit(-1);
412 ((OpenCLContext *)Context->Context)->Context =
413 clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
414 checkOpenCLError(Ret, "Failed to create context.\n");
416 static const int ExtraProperties = 0;
417 ((OpenCLContext *)Context->Context)->CommandQueue =
418 clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
419 DeviceID, ExtraProperties, &Ret);
420 checkOpenCLError(Ret, "Failed to create command queue.\n");
422 if (CacheMode)
423 CurrentContext = Context;
425 GlobalContext = Context;
426 return Context;
429 static void freeKernelCL(PollyGPUFunction *Kernel) {
430 dump_function();
432 if (CacheMode)
433 return;
435 if (!GlobalContext) {
436 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
437 exit(-1);
440 cl_int Ret;
441 Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
442 checkOpenCLError(Ret, "Failed to flush command queue.\n");
443 Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
444 checkOpenCLError(Ret, "Failed to finish command queue.\n");
446 if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
447 cl_int Ret =
448 clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
449 checkOpenCLError(Ret, "Failed to release kernel.\n");
452 if (((OpenCLKernel *)Kernel->Kernel)->Program) {
453 cl_int Ret =
454 clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
455 checkOpenCLError(Ret, "Failed to release program.\n");
458 if (Kernel->Kernel)
459 free((OpenCLKernel *)Kernel->Kernel);
461 if (Kernel)
462 free(Kernel);
465 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
466 const char *KernelName) {
467 dump_function();
469 if (!GlobalContext) {
470 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
471 exit(-1);
474 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
475 static __thread int NextCacheItem = 0;
477 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
478 // We exploit here the property that all Polly-ACC kernels are allocated
479 // as global constants, hence a pointer comparision is sufficient to
480 // determin equality.
481 if (KernelCache[i] &&
482 ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
483 BinaryBuffer) {
484 debug_print(" -> using cached kernel\n");
485 return KernelCache[i];
489 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
490 if (Function == 0) {
491 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
492 exit(-1);
494 Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
495 if (Function->Kernel == 0) {
496 fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
497 exit(-1);
500 if (!GlobalDeviceID) {
501 fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
502 exit(-1);
505 cl_int Ret;
507 if (HandleOpenCLBeignet) {
508 // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
509 // accepts a filename to a valid llvm-ir file as an argument, instead
510 // of accepting the BinaryBuffer directly.
511 FILE *fp = fopen("kernel.ll", "wb");
512 if (fp != NULL) {
513 fputs(BinaryBuffer, fp);
514 fclose(fp);
517 ((OpenCLKernel *)Function->Kernel)->Program =
518 clCreateProgramWithLLVMIntelFcnPtr(
519 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
520 &GlobalDeviceID, "kernel.ll", &Ret);
521 checkOpenCLError(Ret, "Failed to create program from llvm.\n");
522 unlink("kernel.ll");
523 } else {
524 size_t BinarySize = strlen(BinaryBuffer);
525 ((OpenCLKernel *)Function->Kernel)->Program =
526 clCreateProgramWithBinaryFcnPtr(
527 ((OpenCLContext *)GlobalContext->Context)->Context, 1,
528 &GlobalDeviceID, (const size_t *)&BinarySize,
529 (const unsigned char **)&BinaryBuffer, NULL, &Ret);
530 checkOpenCLError(Ret, "Failed to create program from binary.\n");
533 Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
534 &GlobalDeviceID, NULL, NULL, NULL);
535 checkOpenCLError(Ret, "Failed to build program.\n");
537 ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
538 ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
539 checkOpenCLError(Ret, "Failed to create kernel.\n");
541 ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
543 if (CacheMode) {
544 if (KernelCache[NextCacheItem])
545 freeKernelCL(KernelCache[NextCacheItem]);
547 KernelCache[NextCacheItem] = Function;
549 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
552 return Function;
555 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
556 long MemSize) {
557 dump_function();
559 if (!GlobalContext) {
560 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
561 exit(-1);
564 cl_int Ret;
565 Ret = clEnqueueWriteBufferFcnPtr(
566 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
567 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
568 HostData, 0, NULL, NULL);
569 checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
572 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
573 long MemSize) {
574 dump_function();
576 if (!GlobalContext) {
577 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
578 exit(-1);
581 cl_int Ret;
582 Ret = clEnqueueReadBufferFcnPtr(
583 ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
584 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
585 HostData, 0, NULL, NULL);
586 checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
589 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
590 unsigned int GridDimY, unsigned int BlockDimX,
591 unsigned int BlockDimY, unsigned int BlockDimZ,
592 void **Parameters) {
593 dump_function();
595 cl_int Ret;
596 cl_uint NumArgs;
598 if (!GlobalContext) {
599 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
600 exit(-1);
603 OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
604 Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
605 sizeof(cl_uint), &NumArgs, NULL);
606 checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
608 /* Argument sizes are stored at the end of the Parameters array. */
609 for (cl_uint i = 0; i < NumArgs; i++) {
610 Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
611 *((int *)Parameters[NumArgs + i]),
612 (void *)Parameters[i]);
613 checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
616 unsigned int GridDimZ = 1;
617 size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
618 BlockDimZ * GridDimZ};
619 size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
621 static const int WorkDim = 3;
622 OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
623 Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
624 WorkDim, NULL, GlobalWorkSize,
625 LocalWorkSize, 0, NULL, NULL);
626 checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
629 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
630 dump_function();
632 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
633 cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
634 checkOpenCLError(Ret, "Failed to free device memory.\n");
636 free(DevPtr);
637 free(Allocation);
640 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
641 dump_function();
643 if (!GlobalContext) {
644 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
645 exit(-1);
648 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
649 if (DevData == 0) {
650 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
651 exit(-1);
653 DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
654 if (DevData->DevicePtr == 0) {
655 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
656 exit(-1);
659 cl_int Ret;
660 ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
661 clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
662 CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
663 checkOpenCLError(Ret,
664 "Allocate memory for GPU device memory pointer failed.\n");
666 return DevData;
669 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
670 dump_function();
672 OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
673 return (void *)DevPtr->MemObj;
676 static void synchronizeDeviceCL() {
677 dump_function();
679 if (!GlobalContext) {
680 fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
681 exit(-1);
684 if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
685 CL_SUCCESS) {
686 fprintf(stderr, "Synchronizing device and host memory failed.\n");
687 exit(-1);
691 static void freeContextCL(PollyGPUContext *Context) {
692 dump_function();
694 cl_int Ret;
696 GlobalContext = NULL;
698 OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
699 if (Ctx->CommandQueue) {
700 Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
701 checkOpenCLError(Ret, "Could not release command queue.\n");
704 if (Ctx->Context) {
705 Ret = clReleaseContextFcnPtr(Ctx->Context);
706 checkOpenCLError(Ret, "Could not release context.\n");
709 free(Ctx);
710 free(Context);
713 static void printOpenCLError(int Error) {
715 switch (Error) {
716 case CL_SUCCESS:
717 // Success, don't print an error.
718 break;
720 // JIT/Runtime errors.
721 case CL_DEVICE_NOT_FOUND:
722 fprintf(stderr, "Device not found.\n");
723 break;
724 case CL_DEVICE_NOT_AVAILABLE:
725 fprintf(stderr, "Device not available.\n");
726 break;
727 case CL_COMPILER_NOT_AVAILABLE:
728 fprintf(stderr, "Compiler not available.\n");
729 break;
730 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
731 fprintf(stderr, "Mem object allocation failure.\n");
732 break;
733 case CL_OUT_OF_RESOURCES:
734 fprintf(stderr, "Out of resources.\n");
735 break;
736 case CL_OUT_OF_HOST_MEMORY:
737 fprintf(stderr, "Out of host memory.\n");
738 break;
739 case CL_PROFILING_INFO_NOT_AVAILABLE:
740 fprintf(stderr, "Profiling info not available.\n");
741 break;
742 case CL_MEM_COPY_OVERLAP:
743 fprintf(stderr, "Mem copy overlap.\n");
744 break;
745 case CL_IMAGE_FORMAT_MISMATCH:
746 fprintf(stderr, "Image format mismatch.\n");
747 break;
748 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
749 fprintf(stderr, "Image format not supported.\n");
750 break;
751 case CL_BUILD_PROGRAM_FAILURE:
752 fprintf(stderr, "Build program failure.\n");
753 break;
754 case CL_MAP_FAILURE:
755 fprintf(stderr, "Map failure.\n");
756 break;
757 case CL_MISALIGNED_SUB_BUFFER_OFFSET:
758 fprintf(stderr, "Misaligned sub buffer offset.\n");
759 break;
760 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
761 fprintf(stderr, "Exec status error for events in wait list.\n");
762 break;
763 case CL_COMPILE_PROGRAM_FAILURE:
764 fprintf(stderr, "Compile program failure.\n");
765 break;
766 case CL_LINKER_NOT_AVAILABLE:
767 fprintf(stderr, "Linker not available.\n");
768 break;
769 case CL_LINK_PROGRAM_FAILURE:
770 fprintf(stderr, "Link program failure.\n");
771 break;
772 case CL_DEVICE_PARTITION_FAILED:
773 fprintf(stderr, "Device partition failed.\n");
774 break;
775 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
776 fprintf(stderr, "Kernel arg info not available.\n");
777 break;
779 // Compiler errors.
780 case CL_INVALID_VALUE:
781 fprintf(stderr, "Invalid value.\n");
782 break;
783 case CL_INVALID_DEVICE_TYPE:
784 fprintf(stderr, "Invalid device type.\n");
785 break;
786 case CL_INVALID_PLATFORM:
787 fprintf(stderr, "Invalid platform.\n");
788 break;
789 case CL_INVALID_DEVICE:
790 fprintf(stderr, "Invalid device.\n");
791 break;
792 case CL_INVALID_CONTEXT:
793 fprintf(stderr, "Invalid context.\n");
794 break;
795 case CL_INVALID_QUEUE_PROPERTIES:
796 fprintf(stderr, "Invalid queue properties.\n");
797 break;
798 case CL_INVALID_COMMAND_QUEUE:
799 fprintf(stderr, "Invalid command queue.\n");
800 break;
801 case CL_INVALID_HOST_PTR:
802 fprintf(stderr, "Invalid host pointer.\n");
803 break;
804 case CL_INVALID_MEM_OBJECT:
805 fprintf(stderr, "Invalid memory object.\n");
806 break;
807 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
808 fprintf(stderr, "Invalid image format descriptor.\n");
809 break;
810 case CL_INVALID_IMAGE_SIZE:
811 fprintf(stderr, "Invalid image size.\n");
812 break;
813 case CL_INVALID_SAMPLER:
814 fprintf(stderr, "Invalid sampler.\n");
815 break;
816 case CL_INVALID_BINARY:
817 fprintf(stderr, "Invalid binary.\n");
818 break;
819 case CL_INVALID_BUILD_OPTIONS:
820 fprintf(stderr, "Invalid build options.\n");
821 break;
822 case CL_INVALID_PROGRAM:
823 fprintf(stderr, "Invalid program.\n");
824 break;
825 case CL_INVALID_PROGRAM_EXECUTABLE:
826 fprintf(stderr, "Invalid program executable.\n");
827 break;
828 case CL_INVALID_KERNEL_NAME:
829 fprintf(stderr, "Invalid kernel name.\n");
830 break;
831 case CL_INVALID_KERNEL_DEFINITION:
832 fprintf(stderr, "Invalid kernel definition.\n");
833 break;
834 case CL_INVALID_KERNEL:
835 fprintf(stderr, "Invalid kernel.\n");
836 break;
837 case CL_INVALID_ARG_INDEX:
838 fprintf(stderr, "Invalid arg index.\n");
839 break;
840 case CL_INVALID_ARG_VALUE:
841 fprintf(stderr, "Invalid arg value.\n");
842 break;
843 case CL_INVALID_ARG_SIZE:
844 fprintf(stderr, "Invalid arg size.\n");
845 break;
846 case CL_INVALID_KERNEL_ARGS:
847 fprintf(stderr, "Invalid kernel args.\n");
848 break;
849 case CL_INVALID_WORK_DIMENSION:
850 fprintf(stderr, "Invalid work dimension.\n");
851 break;
852 case CL_INVALID_WORK_GROUP_SIZE:
853 fprintf(stderr, "Invalid work group size.\n");
854 break;
855 case CL_INVALID_WORK_ITEM_SIZE:
856 fprintf(stderr, "Invalid work item size.\n");
857 break;
858 case CL_INVALID_GLOBAL_OFFSET:
859 fprintf(stderr, "Invalid global offset.\n");
860 break;
861 case CL_INVALID_EVENT_WAIT_LIST:
862 fprintf(stderr, "Invalid event wait list.\n");
863 break;
864 case CL_INVALID_EVENT:
865 fprintf(stderr, "Invalid event.\n");
866 break;
867 case CL_INVALID_OPERATION:
868 fprintf(stderr, "Invalid operation.\n");
869 break;
870 case CL_INVALID_GL_OBJECT:
871 fprintf(stderr, "Invalid GL object.\n");
872 break;
873 case CL_INVALID_BUFFER_SIZE:
874 fprintf(stderr, "Invalid buffer size.\n");
875 break;
876 case CL_INVALID_MIP_LEVEL:
877 fprintf(stderr, "Invalid mip level.\n");
878 break;
879 case CL_INVALID_GLOBAL_WORK_SIZE:
880 fprintf(stderr, "Invalid global work size.\n");
881 break;
882 case CL_INVALID_PROPERTY:
883 fprintf(stderr, "Invalid property.\n");
884 break;
885 case CL_INVALID_IMAGE_DESCRIPTOR:
886 fprintf(stderr, "Invalid image descriptor.\n");
887 break;
888 case CL_INVALID_COMPILER_OPTIONS:
889 fprintf(stderr, "Invalid compiler options.\n");
890 break;
891 case CL_INVALID_LINKER_OPTIONS:
892 fprintf(stderr, "Invalid linker options.\n");
893 break;
894 case CL_INVALID_DEVICE_PARTITION_COUNT:
895 fprintf(stderr, "Invalid device partition count.\n");
896 break;
897 case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
898 fprintf(stderr, "Invalid pipe size.\n");
899 break;
900 case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
901 fprintf(stderr, "Invalid device queue.\n");
902 break;
904 // NVIDIA specific error.
905 case -9999:
906 fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
907 break;
909 default:
910 fprintf(stderr, "Unknown error code!\n");
911 break;
915 #endif /* HAS_LIBOPENCL */
916 /******************************************************************************/
917 /* CUDA */
918 /******************************************************************************/
919 #ifdef HAS_LIBCUDART
921 struct CUDAContextT {
922 CUcontext Cuda;
925 struct CUDAKernelT {
926 CUfunction Cuda;
927 CUmodule CudaModule;
928 const char *BinaryString;
931 struct CUDADevicePtrT {
932 CUdeviceptr Cuda;
935 /* Dynamic library handles for the CUDA and CUDA runtime library. */
936 static void *HandleCuda;
937 static void *HandleCudaRT;
939 /* Type-defines of function pointer to CUDA driver APIs. */
940 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
941 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
943 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
944 CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
945 unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
946 unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
947 void **KernelParams, void **Extra);
948 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
950 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
951 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
953 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
954 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
956 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
957 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
959 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
960 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
962 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
963 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
965 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
966 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
968 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
969 static CuInitFcnTy *CuInitFcnPtr;
971 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
972 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
974 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
975 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
977 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
978 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
980 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
981 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
983 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
984 unsigned int, CUjit_option *,
985 void **);
986 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
988 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
989 const void *Image);
990 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
992 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
993 const char *);
994 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
996 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
997 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
999 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1000 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1002 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1003 CUjitInputType Type, void *Data,
1004 size_t Size, const char *Name,
1005 unsigned int NumOptions,
1006 CUjit_option *Options,
1007 void **OptionValues);
1008 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1010 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1011 CUjit_option *Options,
1012 void **OptionValues,
1013 CUlinkState *StateOut);
1014 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1016 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1017 size_t *SizeOut);
1018 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1020 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1021 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1023 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1024 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1026 /* Type-defines of function pointer ot CUDA runtime APIs. */
1027 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1028 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1030 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1031 char *Err;
1032 void *FuncPtr;
1033 dlerror();
1034 FuncPtr = dlsym(Handle, FuncName);
1035 if ((Err = dlerror()) != 0) {
1036 fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1037 return 0;
1039 return FuncPtr;
1042 static int initialDeviceAPILibrariesCUDA() {
1043 HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1044 if (!HandleCuda) {
1045 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1046 return 0;
1049 HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1050 if (!HandleCudaRT) {
1051 fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1052 return 0;
1055 return 1;
1058 /* Get function pointer to CUDA Driver APIs.
1060 * Note that compilers conforming to the ISO C standard are required to
1061 * generate a warning if a conversion from a void * pointer to a function
1062 * pointer is attempted as in the following statements. The warning
1063 * of this kind of cast may not be emitted by clang and new versions of gcc
1064 * as it is valid on POSIX 2008. For compilers required to generate a warning,
1065 * we temporarily disable -Wpedantic, to avoid bloating the output with
1066 * unnecessary warnings.
1068 * Reference:
1069 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1071 #pragma GCC diagnostic push
1072 #pragma GCC diagnostic ignored "-Wpedantic"
1073 static int initialDeviceAPIsCUDA() {
1074 if (initialDeviceAPILibrariesCUDA() == 0)
1075 return 0;
1077 CuLaunchKernelFcnPtr =
1078 (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1080 CuMemAllocFcnPtr =
1081 (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1083 CuMemFreeFcnPtr =
1084 (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1086 CuMemcpyDtoHFcnPtr =
1087 (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1089 CuMemcpyHtoDFcnPtr =
1090 (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1092 CuModuleUnloadFcnPtr =
1093 (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1095 CuProfilerStopFcnPtr =
1096 (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1098 CuCtxDestroyFcnPtr =
1099 (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1101 CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1103 CuDeviceGetCountFcnPtr =
1104 (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1106 CuDeviceGetFcnPtr =
1107 (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1109 CuCtxCreateFcnPtr =
1110 (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1112 CuCtxGetCurrentFcnPtr =
1113 (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1115 CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1116 HandleCuda, "cuModuleLoadDataEx");
1118 CuModuleLoadDataFcnPtr =
1119 (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1121 CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1122 HandleCuda, "cuModuleGetFunction");
1124 CuDeviceComputeCapabilityFcnPtr =
1125 (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1126 HandleCuda, "cuDeviceComputeCapability");
1128 CuDeviceGetNameFcnPtr =
1129 (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1131 CuLinkAddDataFcnPtr =
1132 (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1134 CuLinkCreateFcnPtr =
1135 (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1137 CuLinkCompleteFcnPtr =
1138 (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1140 CuLinkDestroyFcnPtr =
1141 (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1143 CuCtxSynchronizeFcnPtr =
1144 (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1146 /* Get function pointer to CUDA Runtime APIs. */
1147 CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1148 HandleCudaRT, "cudaThreadSynchronize");
1150 return 1;
1152 #pragma GCC diagnostic pop
1154 static PollyGPUContext *initContextCUDA() {
1155 dump_function();
1156 PollyGPUContext *Context;
1157 CUdevice Device;
1159 int Major = 0, Minor = 0, DeviceID = 0;
1160 char DeviceName[256];
1161 int DeviceCount = 0;
1163 static __thread PollyGPUContext *CurrentContext = NULL;
1165 if (CurrentContext)
1166 return CurrentContext;
1168 /* Get API handles. */
1169 if (initialDeviceAPIsCUDA() == 0) {
1170 fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1171 exit(-1);
1174 if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1175 fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1176 exit(-1);
1179 /* Get number of devices that supports CUDA. */
1180 CuDeviceGetCountFcnPtr(&DeviceCount);
1181 if (DeviceCount == 0) {
1182 fprintf(stderr, "There is no device supporting CUDA.\n");
1183 exit(-1);
1186 CuDeviceGetFcnPtr(&Device, 0);
1188 /* Get compute capabilities and the device name. */
1189 CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1190 CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1191 debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1193 /* Create context on the device. */
1194 Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1195 if (Context == 0) {
1196 fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1197 exit(-1);
1199 Context->Context = malloc(sizeof(CUDAContext));
1200 if (Context->Context == 0) {
1201 fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1202 exit(-1);
1205 // In cases where managed memory is used, it is quite likely that
1206 // `cudaMallocManaged` / `polly_mallocManaged` was called before
1207 // `polly_initContext` was called.
1209 // If `polly_initContext` calls `CuCtxCreate` when there already was a
1210 // pre-existing context created by the runtime API, this causes code running
1211 // on P100 to hang. So, we query for a pre-existing context to try and use.
1212 // If there is no pre-existing context, we create a new context
1214 // The possible pre-existing context from previous runtime API calls.
1215 CUcontext MaybeRuntimeAPIContext;
1216 if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1217 fprintf(stderr, "cuCtxGetCurrent failed.\n");
1218 exit(-1);
1221 // There was no previous context, initialise it.
1222 if (MaybeRuntimeAPIContext == NULL) {
1223 if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1224 Device) != CUDA_SUCCESS) {
1225 fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1226 exit(-1);
1228 } else {
1229 ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1232 if (CacheMode)
1233 CurrentContext = Context;
1235 return Context;
1238 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1239 dump_function();
1241 if (CacheMode)
1242 return;
1244 if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1245 CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1247 if (Kernel->Kernel)
1248 free((CUDAKernel *)Kernel->Kernel);
1250 if (Kernel)
1251 free(Kernel);
1254 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1255 const char *KernelName) {
1256 dump_function();
1258 static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1259 static __thread int NextCacheItem = 0;
1261 for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1262 // We exploit here the property that all Polly-ACC kernels are allocated
1263 // as global constants, hence a pointer comparision is sufficient to
1264 // determin equality.
1265 if (KernelCache[i] &&
1266 ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1267 debug_print(" -> using cached kernel\n");
1268 return KernelCache[i];
1272 PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1273 if (Function == 0) {
1274 fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1275 exit(-1);
1277 Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1278 if (Function->Kernel == 0) {
1279 fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1280 exit(-1);
1283 CUresult Res;
1284 CUlinkState LState;
1285 CUjit_option Options[6];
1286 void *OptionVals[6];
1287 float Walltime = 0;
1288 unsigned long LogSize = 8192;
1289 char ErrorLog[8192], InfoLog[8192];
1290 void *CuOut;
1291 size_t OutSize;
1293 // Setup linker options
1294 // Return walltime from JIT compilation
1295 Options[0] = CU_JIT_WALL_TIME;
1296 OptionVals[0] = (void *)&Walltime;
1297 // Pass a buffer for info messages
1298 Options[1] = CU_JIT_INFO_LOG_BUFFER;
1299 OptionVals[1] = (void *)InfoLog;
1300 // Pass the size of the info buffer
1301 Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1302 OptionVals[2] = (void *)LogSize;
1303 // Pass a buffer for error message
1304 Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1305 OptionVals[3] = (void *)ErrorLog;
1306 // Pass the size of the error buffer
1307 Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1308 OptionVals[4] = (void *)LogSize;
1309 // Make the linker verbose
1310 Options[5] = CU_JIT_LOG_VERBOSE;
1311 OptionVals[5] = (void *)1;
1313 memset(ErrorLog, 0, sizeof(ErrorLog));
1315 CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1316 Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1317 strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1318 if (Res != CUDA_SUCCESS) {
1319 fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1320 exit(-1);
1323 Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1324 if (Res != CUDA_SUCCESS) {
1325 fprintf(stderr, "Complete ptx linker step failed.\n");
1326 fprintf(stderr, "\n%s\n", ErrorLog);
1327 exit(-1);
1330 debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1331 InfoLog);
1333 Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1334 CuOut);
1335 if (Res != CUDA_SUCCESS) {
1336 fprintf(stderr, "Loading ptx assembly text failed.\n");
1337 exit(-1);
1340 Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1341 ((CUDAKernel *)Function->Kernel)->CudaModule,
1342 KernelName);
1343 if (Res != CUDA_SUCCESS) {
1344 fprintf(stderr, "Loading kernel function failed.\n");
1345 exit(-1);
1348 CuLinkDestroyFcnPtr(LState);
1350 ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1352 if (CacheMode) {
1353 if (KernelCache[NextCacheItem])
1354 freeKernelCUDA(KernelCache[NextCacheItem]);
1356 KernelCache[NextCacheItem] = Function;
1358 NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1361 return Function;
1364 static void synchronizeDeviceCUDA() {
1365 dump_function();
1366 if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1367 fprintf(stderr, "Synchronizing device and host memory failed.\n");
1368 exit(-1);
1372 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1373 long MemSize) {
1374 dump_function();
1376 CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1377 CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1380 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1381 long MemSize) {
1382 dump_function();
1384 if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1385 MemSize) != CUDA_SUCCESS) {
1386 fprintf(stderr, "Copying results from device to host memory failed.\n");
1387 exit(-1);
1391 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1392 unsigned int GridDimY, unsigned int BlockDimX,
1393 unsigned int BlockDimY, unsigned int BlockDimZ,
1394 void **Parameters) {
1395 dump_function();
1397 unsigned GridDimZ = 1;
1398 unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1399 CUstream Stream = 0;
1400 void **Extra = 0;
1402 CUresult Res;
1403 Res =
1404 CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1405 GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1406 SharedMemBytes, Stream, Parameters, Extra);
1407 if (Res != CUDA_SUCCESS) {
1408 fprintf(stderr, "Launching CUDA kernel failed.\n");
1409 exit(-1);
1413 // Maximum number of managed memory pointers.
1414 #define MAX_POINTERS 4000
1415 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1416 void *g_managedptrs[MAX_POINTERS];
1417 int g_nmanagedptrs = 0;
1419 // Add a pointer as being allocated by cuMallocManaged
1420 void addManagedPtr(void *mem) {
1421 assert(g_nmanagedptrs < MAX_POINTERS && "We have hit the maximum number of "
1422 "managed pointers allowed. Increase "
1423 "MAX_POINTERS");
1424 g_managedptrs[g_nmanagedptrs++] = mem;
1427 int isManagedPtr(void *mem) {
1428 for (int i = 0; i < g_nmanagedptrs; i++) {
1429 if (g_managedptrs[i] == mem)
1430 return 1;
1432 return 0;
1435 void polly_freeManaged(void *mem) {
1436 dump_function();
1438 // In a real-world program this was used (COSMO), there were more `free`
1439 // calls in the original source than `malloc` calls. Hence, replacing all
1440 // `free`s with `cudaFree` does not work, since we would try to free
1441 // 'illegal' memory.
1442 // As a quick fix, we keep a free list and check if `mem` is a managed memory
1443 // pointer. If it is, we call `cudaFree`.
1444 // If not, we pass it along to the underlying allocator.
1445 // This is a hack, and can be removed if the underlying issue is fixed.
1446 if (isManagedPtr(mem)) {
1447 if (cudaFree(mem) != cudaSuccess) {
1448 fprintf(stderr, "cudaFree failed.\n");
1449 exit(-1);
1451 return;
1452 } else {
1453 free(mem);
1457 void *polly_mallocManaged(size_t size) {
1458 dump_function();
1459 void *a;
1460 if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) {
1461 fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1462 exit(-1);
1464 addManagedPtr(a);
1465 return a;
1468 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1469 dump_function();
1470 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1471 CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1472 free(DevPtr);
1473 free(Allocation);
1476 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1477 dump_function();
1479 PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1480 if (DevData == 0) {
1481 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1482 exit(-1);
1484 DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1485 if (DevData->DevicePtr == 0) {
1486 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1487 exit(-1);
1490 CUresult Res =
1491 CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1493 if (Res != CUDA_SUCCESS) {
1494 fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1495 exit(-1);
1498 return DevData;
1501 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1502 dump_function();
1504 CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1505 return (void *)DevPtr->Cuda;
1508 static void freeContextCUDA(PollyGPUContext *Context) {
1509 dump_function();
1511 CUDAContext *Ctx = (CUDAContext *)Context->Context;
1512 if (Ctx->Cuda) {
1513 CuProfilerStopFcnPtr();
1514 CuCtxDestroyFcnPtr(Ctx->Cuda);
1515 free(Ctx);
1516 free(Context);
1519 dlclose(HandleCuda);
1520 dlclose(HandleCudaRT);
1523 #endif /* HAS_LIBCUDART */
1524 /******************************************************************************/
1525 /* API */
1526 /******************************************************************************/
1528 PollyGPUContext *polly_initContext() {
1529 DebugMode = getenv("POLLY_DEBUG") != 0;
1530 CacheMode = getenv("POLLY_NOCACHE") == 0;
1532 dump_function();
1534 PollyGPUContext *Context;
1536 switch (Runtime) {
1537 #ifdef HAS_LIBCUDART
1538 case RUNTIME_CUDA:
1539 Context = initContextCUDA();
1540 break;
1541 #endif /* HAS_LIBCUDART */
1542 #ifdef HAS_LIBOPENCL
1543 case RUNTIME_CL:
1544 Context = initContextCL();
1545 break;
1546 #endif /* HAS_LIBOPENCL */
1547 default:
1548 err_runtime();
1551 return Context;
1554 void polly_freeKernel(PollyGPUFunction *Kernel) {
1555 dump_function();
1557 switch (Runtime) {
1558 #ifdef HAS_LIBCUDART
1559 case RUNTIME_CUDA:
1560 freeKernelCUDA(Kernel);
1561 break;
1562 #endif /* HAS_LIBCUDART */
1563 #ifdef HAS_LIBOPENCL
1564 case RUNTIME_CL:
1565 freeKernelCL(Kernel);
1566 break;
1567 #endif /* HAS_LIBOPENCL */
1568 default:
1569 err_runtime();
1573 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1574 const char *KernelName) {
1575 dump_function();
1577 PollyGPUFunction *Function;
1579 switch (Runtime) {
1580 #ifdef HAS_LIBCUDART
1581 case RUNTIME_CUDA:
1582 Function = getKernelCUDA(BinaryBuffer, KernelName);
1583 break;
1584 #endif /* HAS_LIBCUDART */
1585 #ifdef HAS_LIBOPENCL
1586 case RUNTIME_CL:
1587 Function = getKernelCL(BinaryBuffer, KernelName);
1588 break;
1589 #endif /* HAS_LIBOPENCL */
1590 default:
1591 err_runtime();
1594 return Function;
1597 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1598 long MemSize) {
1599 dump_function();
1601 switch (Runtime) {
1602 #ifdef HAS_LIBCUDART
1603 case RUNTIME_CUDA:
1604 copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1605 break;
1606 #endif /* HAS_LIBCUDART */
1607 #ifdef HAS_LIBOPENCL
1608 case RUNTIME_CL:
1609 copyFromHostToDeviceCL(HostData, DevData, MemSize);
1610 break;
1611 #endif /* HAS_LIBOPENCL */
1612 default:
1613 err_runtime();
1617 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1618 long MemSize) {
1619 dump_function();
1621 switch (Runtime) {
1622 #ifdef HAS_LIBCUDART
1623 case RUNTIME_CUDA:
1624 copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1625 break;
1626 #endif /* HAS_LIBCUDART */
1627 #ifdef HAS_LIBOPENCL
1628 case RUNTIME_CL:
1629 copyFromDeviceToHostCL(DevData, HostData, MemSize);
1630 break;
1631 #endif /* HAS_LIBOPENCL */
1632 default:
1633 err_runtime();
1637 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1638 unsigned int GridDimY, unsigned int BlockDimX,
1639 unsigned int BlockDimY, unsigned int BlockDimZ,
1640 void **Parameters) {
1641 dump_function();
1643 switch (Runtime) {
1644 #ifdef HAS_LIBCUDART
1645 case RUNTIME_CUDA:
1646 launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1647 BlockDimZ, Parameters);
1648 break;
1649 #endif /* HAS_LIBCUDART */
1650 #ifdef HAS_LIBOPENCL
1651 case RUNTIME_CL:
1652 launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1653 Parameters);
1654 break;
1655 #endif /* HAS_LIBOPENCL */
1656 default:
1657 err_runtime();
1661 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1662 dump_function();
1664 switch (Runtime) {
1665 #ifdef HAS_LIBCUDART
1666 case RUNTIME_CUDA:
1667 freeDeviceMemoryCUDA(Allocation);
1668 break;
1669 #endif /* HAS_LIBCUDART */
1670 #ifdef HAS_LIBOPENCL
1671 case RUNTIME_CL:
1672 freeDeviceMemoryCL(Allocation);
1673 break;
1674 #endif /* HAS_LIBOPENCL */
1675 default:
1676 err_runtime();
1680 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1681 dump_function();
1683 PollyGPUDevicePtr *DevData;
1685 switch (Runtime) {
1686 #ifdef HAS_LIBCUDART
1687 case RUNTIME_CUDA:
1688 DevData = allocateMemoryForDeviceCUDA(MemSize);
1689 break;
1690 #endif /* HAS_LIBCUDART */
1691 #ifdef HAS_LIBOPENCL
1692 case RUNTIME_CL:
1693 DevData = allocateMemoryForDeviceCL(MemSize);
1694 break;
1695 #endif /* HAS_LIBOPENCL */
1696 default:
1697 err_runtime();
1700 return DevData;
1703 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1704 dump_function();
1706 void *DevPtr;
1708 switch (Runtime) {
1709 #ifdef HAS_LIBCUDART
1710 case RUNTIME_CUDA:
1711 DevPtr = getDevicePtrCUDA(Allocation);
1712 break;
1713 #endif /* HAS_LIBCUDART */
1714 #ifdef HAS_LIBOPENCL
1715 case RUNTIME_CL:
1716 DevPtr = getDevicePtrCL(Allocation);
1717 break;
1718 #endif /* HAS_LIBOPENCL */
1719 default:
1720 err_runtime();
1723 return DevPtr;
1726 void polly_synchronizeDevice() {
1727 dump_function();
1729 switch (Runtime) {
1730 #ifdef HAS_LIBCUDART
1731 case RUNTIME_CUDA:
1732 synchronizeDeviceCUDA();
1733 break;
1734 #endif /* HAS_LIBCUDART */
1735 #ifdef HAS_LIBOPENCL
1736 case RUNTIME_CL:
1737 synchronizeDeviceCL();
1738 break;
1739 #endif /* HAS_LIBOPENCL */
1740 default:
1741 err_runtime();
1745 void polly_freeContext(PollyGPUContext *Context) {
1746 dump_function();
1748 if (CacheMode)
1749 return;
1751 switch (Runtime) {
1752 #ifdef HAS_LIBCUDART
1753 case RUNTIME_CUDA:
1754 freeContextCUDA(Context);
1755 break;
1756 #endif /* HAS_LIBCUDART */
1757 #ifdef HAS_LIBOPENCL
1758 case RUNTIME_CL:
1759 freeContextCL(Context);
1760 break;
1761 #endif /* HAS_LIBOPENCL */
1762 default:
1763 err_runtime();
1767 /* Initialize GPUJIT with CUDA as runtime library. */
1768 PollyGPUContext *polly_initContextCUDA() {
1769 #ifdef HAS_LIBCUDART
1770 Runtime = RUNTIME_CUDA;
1771 return polly_initContext();
1772 #else
1773 fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1774 exit(-1);
1775 #endif /* HAS_LIBCUDART */
1778 /* Initialize GPUJIT with OpenCL as runtime library. */
1779 PollyGPUContext *polly_initContextCL() {
1780 #ifdef HAS_LIBOPENCL
1781 Runtime = RUNTIME_CL;
1782 return polly_initContext();
1783 #else
1784 fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1785 exit(-1);
1786 #endif /* HAS_LIBOPENCL */