tools/GPURuntime/GPUJIT.c

   1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
   2 /*                                                                            */
   3 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
   4 /* Exceptions.                                                                */
   5 /* See https://llvm.org/LICENSE.txt for license information.                  */
   6 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
   7 /*                                                                            */
   8 /******************************************************************************/
   9 /*                                                                            */
  10 /*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
  11 /*                                                                            */
  12 /******************************************************************************/
  13
  14 #include "GPUJIT.h"
  15
  16 #ifdef HAS_LIBCUDART
  17 #include <cuda.h>
  18 #include <cuda_runtime.h>
  19 #endif /* HAS_LIBCUDART */
  20
  21 #ifdef HAS_LIBOPENCL
  22 #ifdef __APPLE__
  23 #include <OpenCL/opencl.h>
  24 #else
  25 #include <CL/cl.h>
  26 #endif /* __APPLE__ */
  27 #endif /* HAS_LIBOPENCL */
  28
  29 #include <assert.h>
  30 #include <dlfcn.h>
  31 #include <stdarg.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #include <unistd.h>
  36
  37 static int DebugMode;
  38 static int CacheMode;
  39 #define max(x, y) ((x) > (y) ? (x) : (y))
  40
  41 static PollyGPURuntime Runtime = RUNTIME_NONE;
  42
  43 static void debug_print(const char *format, ...) {
  44   if (!DebugMode)
  45     return;
  46
  47   va_list args;
  48   va_start(args, format);
  49   vfprintf(stderr, format, args);
  50   va_end(args);
  51 }
  52 #define dump_function() debug_print("-> %s\n", __func__)
  53
  54 #define KERNEL_CACHE_SIZE 10
  55
  56 static void err_runtime() __attribute__((noreturn));
  57 static void err_runtime() {
  58   fprintf(stderr, "Runtime not correctly initialized.\n");
  59   exit(-1);
  60 }
  61
  62 struct PollyGPUContextT {
  63   void *Context;
  64 };
  65
  66 struct PollyGPUFunctionT {
  67   void *Kernel;
  68 };
  69
  70 struct PollyGPUDevicePtrT {
  71   void *DevicePtr;
  72 };
  73
  74 /******************************************************************************/
  75 /*                                  OpenCL                                    */
  76 /******************************************************************************/
  77 #ifdef HAS_LIBOPENCL
  78
  79 struct OpenCLContextT {
  80   cl_context Context;
  81   cl_command_queue CommandQueue;
  82 };
  83
  84 struct OpenCLKernelT {
  85   cl_kernel Kernel;
  86   cl_program Program;
  87   const char *BinaryString;
  88 };
  89
  90 struct OpenCLDevicePtrT {
  91   cl_mem MemObj;
  92 };
  93
  94 /* Dynamic library handles for the OpenCL runtime library. */
  95 static void *HandleOpenCL;
  96 static void *HandleOpenCLBeignet;
  97
  98 /* Type-defines of function pointer to OpenCL Runtime API. */
  99 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
 100                                      cl_platform_id *Platforms,
 101                                      cl_uint *NumPlatforms);
 102 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
 103
 104 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
 105                                    cl_device_type DeviceType,
 106                                    cl_uint NumEntries, cl_device_id *Devices,
 107                                    cl_uint *NumDevices);
 108 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
 109
 110 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
 111                                     cl_device_info ParamName,
 112                                     size_t ParamValueSize, void *ParamValue,
 113                                     size_t *ParamValueSizeRet);
 114 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
 115
 116 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
 117                                     size_t ParamValueSize, void *ParamValue,
 118                                     size_t *ParamValueSizeRet);
 119 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
 120
 121 typedef cl_context clCreateContextFcnTy(
 122     const cl_context_properties *Properties, cl_uint NumDevices,
 123     const cl_device_id *Devices,
 124     void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
 125                                  size_t CB, void *UserData),
 126     void *UserData, cl_int *ErrcodeRet);
 127 static clCreateContextFcnTy *clCreateContextFcnPtr;
 128
 129 typedef cl_command_queue
 130 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
 131                           cl_command_queue_properties Properties,
 132                           cl_int *ErrcodeRet);
 133 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
 134
 135 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
 136                                    size_t Size, void *HostPtr,
 137                                    cl_int *ErrcodeRet);
 138 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
 139
 140 typedef cl_int
 141 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
 142                           cl_bool BlockingWrite, size_t Offset, size_t Size,
 143                           const void *Ptr, cl_uint NumEventsInWaitList,
 144                           const cl_event *EventWaitList, cl_event *Event);
 145 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 146
 147 typedef cl_program
 148 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
 149                                   const cl_device_id *DeviceList,
 150                                   const char *Filename, cl_int *ErrcodeRet);
 151 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
 152
 153 typedef cl_program clCreateProgramWithBinaryFcnTy(
 154     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
 155     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
 156     cl_int *ErrcodeRet);
 157 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
 158
 159 typedef cl_int clBuildProgramFcnTy(
 160     cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
 161     const char *Options,
 162     void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
 163     void *UserData);
 164 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
 165
 166 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
 167                                       const char *KernelName,
 168                                       cl_int *ErrcodeRet);
 169 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
 170
 171 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
 172                                    size_t ArgSize, const void *ArgValue);
 173 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
 174
 175 typedef cl_int clEnqueueNDRangeKernelFcnTy(
 176     cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
 177     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
 178     const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
 179     const cl_event *EventWaitList, cl_event *Event);
 180 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
 181
 182 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
 183                                         cl_mem Buffer, cl_bool BlockingRead,
 184                                         size_t Offset, size_t Size, void *Ptr,
 185                                         cl_uint NumEventsInWaitList,
 186                                         const cl_event *EventWaitList,
 187                                         cl_event *Event);
 188 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
 189
 190 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
 191 static clFlushFcnTy *clFlushFcnPtr;
 192
 193 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
 194 static clFinishFcnTy *clFinishFcnPtr;
 195
 196 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
 197 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
 198
 199 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
 200 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
 201
 202 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
 203 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
 204
 205 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
 206 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
 207
 208 typedef cl_int clReleaseContextFcnTy(cl_context Context);
 209 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
 210
 211 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
 212   char *Err;
 213   void *FuncPtr;
 214   dlerror();
 215   FuncPtr = dlsym(Handle, FuncName);
 216   if ((Err = dlerror()) != 0) {
 217     fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
 218     return 0;
 219   }
 220   return FuncPtr;
 221 }
 222
 223 static int initialDeviceAPILibrariesCL() {
 224   HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
 225   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
 226   if (!HandleOpenCL) {
 227     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
 228     return 0;
 229   }
 230   return 1;
 231 }
 232
 233 /* Get function pointer to OpenCL Runtime API.
 234  *
 235  * Note that compilers conforming to the ISO C standard are required to
 236  * generate a warning if a conversion from a void * pointer to a function
 237  * pointer is attempted as in the following statements. The warning
 238  * of this kind of cast may not be emitted by clang and new versions of gcc
 239  * as it is valid on POSIX 2008. For compilers required to generate a warning,
 240  * we temporarily disable -Wpedantic, to avoid bloating the output with
 241  * unnecessary warnings.
 242  *
 243  * Reference:
 244  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
 245  */
 246 #pragma GCC diagnostic push
 247 #pragma GCC diagnostic ignored "-Wpedantic"
 248 static int initialDeviceAPIsCL() {
 249   if (initialDeviceAPILibrariesCL() == 0)
 250     return 0;
 251
 252   // FIXME: We are now always selecting the Intel Beignet driver if it is
 253   // available on the system, instead of a possible NVIDIA or AMD OpenCL
 254   // API. This selection should occurr based on the target architecture
 255   // chosen when compiling.
 256   void *Handle =
 257       (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
 258
 259   clGetPlatformIDsFcnPtr =
 260       (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
 261
 262   clGetDeviceIDsFcnPtr =
 263       (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
 264
 265   clGetDeviceInfoFcnPtr =
 266       (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
 267
 268   clGetKernelInfoFcnPtr =
 269       (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
 270
 271   clCreateContextFcnPtr =
 272       (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
 273
 274   clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
 275       Handle, "clCreateCommandQueue");
 276
 277   clCreateBufferFcnPtr =
 278       (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
 279
 280   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
 281       Handle, "clEnqueueWriteBuffer");
 282
 283   if (HandleOpenCLBeignet)
 284     clCreateProgramWithLLVMIntelFcnPtr =
 285         (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
 286             Handle, "clCreateProgramWithLLVMIntel");
 287
 288   clCreateProgramWithBinaryFcnPtr =
 289       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
 290           Handle, "clCreateProgramWithBinary");
 291
 292   clBuildProgramFcnPtr =
 293       (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
 294
 295   clCreateKernelFcnPtr =
 296       (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
 297
 298   clSetKernelArgFcnPtr =
 299       (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
 300
 301   clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
 302       Handle, "clEnqueueNDRangeKernel");
 303
 304   clEnqueueReadBufferFcnPtr =
 305       (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
 306
 307   clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
 308
 309   clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
 310
 311   clReleaseKernelFcnPtr =
 312       (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
 313
 314   clReleaseProgramFcnPtr =
 315       (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
 316
 317   clReleaseMemObjectFcnPtr =
 318       (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
 319
 320   clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
 321       Handle, "clReleaseCommandQueue");
 322
 323   clReleaseContextFcnPtr =
 324       (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
 325
 326   return 1;
 327 }
 328 #pragma GCC diagnostic pop
 329
 330 /* Context and Device. */
 331 static PollyGPUContext *GlobalContext = NULL;
 332 static cl_device_id GlobalDeviceID = NULL;
 333
 334 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
 335 static void printOpenCLError(int Error);
 336
 337 static void checkOpenCLError(int Ret, const char *format, ...) {
 338   if (Ret == CL_SUCCESS)
 339     return;
 340
 341   printOpenCLError(Ret);
 342   va_list args;
 343   va_start(args, format);
 344   vfprintf(stderr, format, args);
 345   va_end(args);
 346   exit(-1);
 347 }
 348
 349 static PollyGPUContext *initContextCL() {
 350   dump_function();
 351
 352   PollyGPUContext *Context;
 353
 354   cl_platform_id PlatformID = NULL;
 355   cl_device_id DeviceID = NULL;
 356   cl_uint NumDevicesRet;
 357   cl_int Ret;
 358
 359   char DeviceRevision[256];
 360   char DeviceName[256];
 361   size_t DeviceRevisionRetSize, DeviceNameRetSize;
 362
 363   static __thread PollyGPUContext *CurrentContext = NULL;
 364
 365   if (CurrentContext)
 366     return CurrentContext;
 367
 368   /* Get API handles. */
 369   if (initialDeviceAPIsCL() == 0) {
 370     fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
 371     exit(-1);
 372   }
 373
 374   /* Get number of devices that support OpenCL. */
 375   static const int NumberOfPlatforms = 1;
 376   Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
 377   checkOpenCLError(Ret, "Failed to get platform IDs.\n");
 378   // TODO: Extend to CL_DEVICE_TYPE_ALL?
 379   static const int NumberOfDevices = 1;
 380   Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
 381                              &DeviceID, &NumDevicesRet);
 382   checkOpenCLError(Ret, "Failed to get device IDs.\n");
 383
 384   GlobalDeviceID = DeviceID;
 385   if (NumDevicesRet == 0) {
 386     fprintf(stderr, "There is no device supporting OpenCL.\n");
 387     exit(-1);
 388   }
 389
 390   /* Get device revision. */
 391   Ret =
 392       clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
 393                             DeviceRevision, &DeviceRevisionRetSize);
 394   checkOpenCLError(Ret, "Failed to fetch device revision.\n");
 395
 396   /* Get device name. */
 397   Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
 398                               DeviceName, &DeviceNameRetSize);
 399   checkOpenCLError(Ret, "Failed to fetch device name.\n");
 400
 401   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
 402
 403   /* Create context on the device. */
 404   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
 405   if (Context == 0) {
 406     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
 407     exit(-1);
 408   }
 409   Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
 410   if (Context->Context == 0) {
 411     fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
 412     exit(-1);
 413   }
 414   ((OpenCLContext *)Context->Context)->Context =
 415       clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
 416   checkOpenCLError(Ret, "Failed to create context.\n");
 417
 418   static const int ExtraProperties = 0;
 419   ((OpenCLContext *)Context->Context)->CommandQueue =
 420       clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
 421                                  DeviceID, ExtraProperties, &Ret);
 422   checkOpenCLError(Ret, "Failed to create command queue.\n");
 423
 424   if (CacheMode)
 425     CurrentContext = Context;
 426
 427   GlobalContext = Context;
 428   return Context;
 429 }
 430
 431 static void freeKernelCL(PollyGPUFunction *Kernel) {
 432   dump_function();
 433
 434   if (CacheMode)
 435     return;
 436
 437   if (!GlobalContext) {
 438     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 439     exit(-1);
 440   }
 441
 442   cl_int Ret;
 443   Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
 444   checkOpenCLError(Ret, "Failed to flush command queue.\n");
 445   Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
 446   checkOpenCLError(Ret, "Failed to finish command queue.\n");
 447
 448   if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
 449     cl_int Ret =
 450         clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
 451     checkOpenCLError(Ret, "Failed to release kernel.\n");
 452   }
 453
 454   if (((OpenCLKernel *)Kernel->Kernel)->Program) {
 455     cl_int Ret =
 456         clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
 457     checkOpenCLError(Ret, "Failed to release program.\n");
 458   }
 459
 460   if (Kernel->Kernel)
 461     free((OpenCLKernel *)Kernel->Kernel);
 462
 463   if (Kernel)
 464     free(Kernel);
 465 }
 466
 467 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
 468                                      const char *KernelName) {
 469   dump_function();
 470
 471   if (!GlobalContext) {
 472     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 473     exit(-1);
 474   }
 475
 476   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
 477   static __thread int NextCacheItem = 0;
 478
 479   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
 480     // We exploit here the property that all Polly-ACC kernels are allocated
 481     // as global constants, hence a pointer comparision is sufficient to
 482     // determin equality.
 483     if (KernelCache[i] &&
 484         ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
 485             BinaryBuffer) {
 486       debug_print("  -> using cached kernel\n");
 487       return KernelCache[i];
 488     }
 489   }
 490
 491   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
 492   if (Function == 0) {
 493     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
 494     exit(-1);
 495   }
 496   Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
 497   if (Function->Kernel == 0) {
 498     fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
 499     exit(-1);
 500   }
 501
 502   if (!GlobalDeviceID) {
 503     fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
 504     exit(-1);
 505   }
 506
 507   cl_int Ret;
 508
 509   if (HandleOpenCLBeignet) {
 510     // This is a workaround, since clCreateProgramWithLLVMIntel only
 511     // accepts a filename to a valid llvm-ir file as an argument, instead
 512     // of accepting the BinaryBuffer directly.
 513     char FileName[] = "/tmp/polly_kernelXXXXXX";
 514     int File = mkstemp(FileName);
 515     write(File, BinaryBuffer, strlen(BinaryBuffer));
 516
 517     ((OpenCLKernel *)Function->Kernel)->Program =
 518         clCreateProgramWithLLVMIntelFcnPtr(
 519             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
 520             &GlobalDeviceID, FileName, &Ret);
 521     checkOpenCLError(Ret, "Failed to create program from llvm.\n");
 522     close(File);
 523     unlink(FileName);
 524   } else {
 525     size_t BinarySize = strlen(BinaryBuffer);
 526     ((OpenCLKernel *)Function->Kernel)->Program =
 527         clCreateProgramWithBinaryFcnPtr(
 528             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
 529             &GlobalDeviceID, (const size_t *)&BinarySize,
 530             (const unsigned char **)&BinaryBuffer, NULL, &Ret);
 531     checkOpenCLError(Ret, "Failed to create program from binary.\n");
 532   }
 533
 534   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
 535                              &GlobalDeviceID, NULL, NULL, NULL);
 536   checkOpenCLError(Ret, "Failed to build program.\n");
 537
 538   ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
 539       ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
 540   checkOpenCLError(Ret, "Failed to create kernel.\n");
 541
 542   ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
 543
 544   if (CacheMode) {
 545     if (KernelCache[NextCacheItem])
 546       freeKernelCL(KernelCache[NextCacheItem]);
 547
 548     KernelCache[NextCacheItem] = Function;
 549
 550     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
 551   }
 552
 553   return Function;
 554 }
 555
 556 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
 557                                    long MemSize) {
 558   dump_function();
 559
 560   if (!GlobalContext) {
 561     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 562     exit(-1);
 563   }
 564
 565   cl_int Ret;
 566   Ret = clEnqueueWriteBufferFcnPtr(
 567       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
 568       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
 569       HostData, 0, NULL, NULL);
 570   checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
 571 }
 572
 573 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
 574                                    long MemSize) {
 575   dump_function();
 576
 577   if (!GlobalContext) {
 578     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 579     exit(-1);
 580   }
 581
 582   cl_int Ret;
 583   Ret = clEnqueueReadBufferFcnPtr(
 584       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
 585       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
 586       HostData, 0, NULL, NULL);
 587   checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
 588 }
 589
 590 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
 591                            unsigned int GridDimY, unsigned int BlockDimX,
 592                            unsigned int BlockDimY, unsigned int BlockDimZ,
 593                            void **Parameters) {
 594   dump_function();
 595
 596   cl_int Ret;
 597   cl_uint NumArgs;
 598
 599   if (!GlobalContext) {
 600     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 601     exit(-1);
 602   }
 603
 604   OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
 605   Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
 606                               sizeof(cl_uint), &NumArgs, NULL);
 607   checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
 608
 609   /* Argument sizes are stored at the end of the Parameters array. */
 610   for (cl_uint i = 0; i < NumArgs; i++) {
 611     Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
 612                                *((int *)Parameters[NumArgs + i]),
 613                                (void *)Parameters[i]);
 614     checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
 615   }
 616
 617   unsigned int GridDimZ = 1;
 618   size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
 619                               BlockDimZ * GridDimZ};
 620   size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
 621
 622   static const int WorkDim = 3;
 623   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
 624   Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
 625                                      WorkDim, NULL, GlobalWorkSize,
 626                                      LocalWorkSize, 0, NULL, NULL);
 627   checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
 628 }
 629
 630 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
 631   dump_function();
 632
 633   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
 634   cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
 635   checkOpenCLError(Ret, "Failed to free device memory.\n");
 636
 637   free(DevPtr);
 638   free(Allocation);
 639 }
 640
 641 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
 642   dump_function();
 643
 644   if (!GlobalContext) {
 645     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 646     exit(-1);
 647   }
 648
 649   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
 650   if (DevData == 0) {
 651     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
 652     exit(-1);
 653   }
 654   DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
 655   if (DevData->DevicePtr == 0) {
 656     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
 657     exit(-1);
 658   }
 659
 660   cl_int Ret;
 661   ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
 662       clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
 663                            CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
 664   checkOpenCLError(Ret,
 665                    "Allocate memory for GPU device memory pointer failed.\n");
 666
 667   return DevData;
 668 }
 669
 670 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
 671   dump_function();
 672
 673   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
 674   return (void *)DevPtr->MemObj;
 675 }
 676
 677 static void synchronizeDeviceCL() {
 678   dump_function();
 679
 680   if (!GlobalContext) {
 681     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 682     exit(-1);
 683   }
 684
 685   if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
 686       CL_SUCCESS) {
 687     fprintf(stderr, "Synchronizing device and host memory failed.\n");
 688     exit(-1);
 689   }
 690 }
 691
 692 static void freeContextCL(PollyGPUContext *Context) {
 693   dump_function();
 694
 695   cl_int Ret;
 696
 697   GlobalContext = NULL;
 698
 699   OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
 700   if (Ctx->CommandQueue) {
 701     Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
 702     checkOpenCLError(Ret, "Could not release command queue.\n");
 703   }
 704
 705   if (Ctx->Context) {
 706     Ret = clReleaseContextFcnPtr(Ctx->Context);
 707     checkOpenCLError(Ret, "Could not release context.\n");
 708   }
 709
 710   free(Ctx);
 711   free(Context);
 712 }
 713
 714 static void printOpenCLError(int Error) {
 715
 716   switch (Error) {
 717   case CL_SUCCESS:
 718     // Success, don't print an error.
 719     break;
 720
 721   // JIT/Runtime errors.
 722   case CL_DEVICE_NOT_FOUND:
 723     fprintf(stderr, "Device not found.\n");
 724     break;
 725   case CL_DEVICE_NOT_AVAILABLE:
 726     fprintf(stderr, "Device not available.\n");
 727     break;
 728   case CL_COMPILER_NOT_AVAILABLE:
 729     fprintf(stderr, "Compiler not available.\n");
 730     break;
 731   case CL_MEM_OBJECT_ALLOCATION_FAILURE:
 732     fprintf(stderr, "Mem object allocation failure.\n");
 733     break;
 734   case CL_OUT_OF_RESOURCES:
 735     fprintf(stderr, "Out of resources.\n");
 736     break;
 737   case CL_OUT_OF_HOST_MEMORY:
 738     fprintf(stderr, "Out of host memory.\n");
 739     break;
 740   case CL_PROFILING_INFO_NOT_AVAILABLE:
 741     fprintf(stderr, "Profiling info not available.\n");
 742     break;
 743   case CL_MEM_COPY_OVERLAP:
 744     fprintf(stderr, "Mem copy overlap.\n");
 745     break;
 746   case CL_IMAGE_FORMAT_MISMATCH:
 747     fprintf(stderr, "Image format mismatch.\n");
 748     break;
 749   case CL_IMAGE_FORMAT_NOT_SUPPORTED:
 750     fprintf(stderr, "Image format not supported.\n");
 751     break;
 752   case CL_BUILD_PROGRAM_FAILURE:
 753     fprintf(stderr, "Build program failure.\n");
 754     break;
 755   case CL_MAP_FAILURE:
 756     fprintf(stderr, "Map failure.\n");
 757     break;
 758   case CL_MISALIGNED_SUB_BUFFER_OFFSET:
 759     fprintf(stderr, "Misaligned sub buffer offset.\n");
 760     break;
 761   case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
 762     fprintf(stderr, "Exec status error for events in wait list.\n");
 763     break;
 764   case CL_COMPILE_PROGRAM_FAILURE:
 765     fprintf(stderr, "Compile program failure.\n");
 766     break;
 767   case CL_LINKER_NOT_AVAILABLE:
 768     fprintf(stderr, "Linker not available.\n");
 769     break;
 770   case CL_LINK_PROGRAM_FAILURE:
 771     fprintf(stderr, "Link program failure.\n");
 772     break;
 773   case CL_DEVICE_PARTITION_FAILED:
 774     fprintf(stderr, "Device partition failed.\n");
 775     break;
 776   case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
 777     fprintf(stderr, "Kernel arg info not available.\n");
 778     break;
 779
 780   // Compiler errors.
 781   case CL_INVALID_VALUE:
 782     fprintf(stderr, "Invalid value.\n");
 783     break;
 784   case CL_INVALID_DEVICE_TYPE:
 785     fprintf(stderr, "Invalid device type.\n");
 786     break;
 787   case CL_INVALID_PLATFORM:
 788     fprintf(stderr, "Invalid platform.\n");
 789     break;
 790   case CL_INVALID_DEVICE:
 791     fprintf(stderr, "Invalid device.\n");
 792     break;
 793   case CL_INVALID_CONTEXT:
 794     fprintf(stderr, "Invalid context.\n");
 795     break;
 796   case CL_INVALID_QUEUE_PROPERTIES:
 797     fprintf(stderr, "Invalid queue properties.\n");
 798     break;
 799   case CL_INVALID_COMMAND_QUEUE:
 800     fprintf(stderr, "Invalid command queue.\n");
 801     break;
 802   case CL_INVALID_HOST_PTR:
 803     fprintf(stderr, "Invalid host pointer.\n");
 804     break;
 805   case CL_INVALID_MEM_OBJECT:
 806     fprintf(stderr, "Invalid memory object.\n");
 807     break;
 808   case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
 809     fprintf(stderr, "Invalid image format descriptor.\n");
 810     break;
 811   case CL_INVALID_IMAGE_SIZE:
 812     fprintf(stderr, "Invalid image size.\n");
 813     break;
 814   case CL_INVALID_SAMPLER:
 815     fprintf(stderr, "Invalid sampler.\n");
 816     break;
 817   case CL_INVALID_BINARY:
 818     fprintf(stderr, "Invalid binary.\n");
 819     break;
 820   case CL_INVALID_BUILD_OPTIONS:
 821     fprintf(stderr, "Invalid build options.\n");
 822     break;
 823   case CL_INVALID_PROGRAM:
 824     fprintf(stderr, "Invalid program.\n");
 825     break;
 826   case CL_INVALID_PROGRAM_EXECUTABLE:
 827     fprintf(stderr, "Invalid program executable.\n");
 828     break;
 829   case CL_INVALID_KERNEL_NAME:
 830     fprintf(stderr, "Invalid kernel name.\n");
 831     break;
 832   case CL_INVALID_KERNEL_DEFINITION:
 833     fprintf(stderr, "Invalid kernel definition.\n");
 834     break;
 835   case CL_INVALID_KERNEL:
 836     fprintf(stderr, "Invalid kernel.\n");
 837     break;
 838   case CL_INVALID_ARG_INDEX:
 839     fprintf(stderr, "Invalid arg index.\n");
 840     break;
 841   case CL_INVALID_ARG_VALUE:
 842     fprintf(stderr, "Invalid arg value.\n");
 843     break;
 844   case CL_INVALID_ARG_SIZE:
 845     fprintf(stderr, "Invalid arg size.\n");
 846     break;
 847   case CL_INVALID_KERNEL_ARGS:
 848     fprintf(stderr, "Invalid kernel args.\n");
 849     break;
 850   case CL_INVALID_WORK_DIMENSION:
 851     fprintf(stderr, "Invalid work dimension.\n");
 852     break;
 853   case CL_INVALID_WORK_GROUP_SIZE:
 854     fprintf(stderr, "Invalid work group size.\n");
 855     break;
 856   case CL_INVALID_WORK_ITEM_SIZE:
 857     fprintf(stderr, "Invalid work item size.\n");
 858     break;
 859   case CL_INVALID_GLOBAL_OFFSET:
 860     fprintf(stderr, "Invalid global offset.\n");
 861     break;
 862   case CL_INVALID_EVENT_WAIT_LIST:
 863     fprintf(stderr, "Invalid event wait list.\n");
 864     break;
 865   case CL_INVALID_EVENT:
 866     fprintf(stderr, "Invalid event.\n");
 867     break;
 868   case CL_INVALID_OPERATION:
 869     fprintf(stderr, "Invalid operation.\n");
 870     break;
 871   case CL_INVALID_GL_OBJECT:
 872     fprintf(stderr, "Invalid GL object.\n");
 873     break;
 874   case CL_INVALID_BUFFER_SIZE:
 875     fprintf(stderr, "Invalid buffer size.\n");
 876     break;
 877   case CL_INVALID_MIP_LEVEL:
 878     fprintf(stderr, "Invalid mip level.\n");
 879     break;
 880   case CL_INVALID_GLOBAL_WORK_SIZE:
 881     fprintf(stderr, "Invalid global work size.\n");
 882     break;
 883   case CL_INVALID_PROPERTY:
 884     fprintf(stderr, "Invalid property.\n");
 885     break;
 886   case CL_INVALID_IMAGE_DESCRIPTOR:
 887     fprintf(stderr, "Invalid image descriptor.\n");
 888     break;
 889   case CL_INVALID_COMPILER_OPTIONS:
 890     fprintf(stderr, "Invalid compiler options.\n");
 891     break;
 892   case CL_INVALID_LINKER_OPTIONS:
 893     fprintf(stderr, "Invalid linker options.\n");
 894     break;
 895   case CL_INVALID_DEVICE_PARTITION_COUNT:
 896     fprintf(stderr, "Invalid device partition count.\n");
 897     break;
 898   case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
 899     fprintf(stderr, "Invalid pipe size.\n");
 900     break;
 901   case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
 902     fprintf(stderr, "Invalid device queue.\n");
 903     break;
 904
 905   // NVIDIA specific error.
 906   case -9999:
 907     fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
 908     break;
 909
 910   default:
 911     fprintf(stderr, "Unknown error code!\n");
 912     break;
 913   }
 914 }
 915
 916 #endif /* HAS_LIBOPENCL */
 917 /******************************************************************************/
 918 /*                                   CUDA                                     */
 919 /******************************************************************************/
 920 #ifdef HAS_LIBCUDART
 921
 922 struct CUDAContextT {
 923   CUcontext Cuda;
 924 };
 925
 926 struct CUDAKernelT {
 927   CUfunction Cuda;
 928   CUmodule CudaModule;
 929   const char *BinaryString;
 930 };
 931
 932 struct CUDADevicePtrT {
 933   CUdeviceptr Cuda;
 934 };
 935
 936 /* Dynamic library handles for the CUDA and CUDA runtime library. */
 937 static void *HandleCuda;
 938 static void *HandleCudaRT;
 939
 940 /* Type-defines of function pointer to CUDA driver APIs. */
 941 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 942 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 943
 944 typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
 945                                                 unsigned int);
 946 static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;
 947
 948 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
 949     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
 950     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
 951     unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
 952     void **KernelParams, void **Extra);
 953 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
 954
 955 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
 956 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
 957
 958 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
 959 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
 960
 961 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
 962 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
 963
 964 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
 965 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
 966
 967 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
 968 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
 969
 970 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
 971 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
 972
 973 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
 974 static CuInitFcnTy *CuInitFcnPtr;
 975
 976 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
 977 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
 978
 979 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
 980 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
 981
 982 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
 983 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
 984
 985 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
 986 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
 987
 988 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
 989                                                  unsigned int, CUjit_option *,
 990                                                  void **);
 991 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
 992
 993 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
 994                                                const void *Image);
 995 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
 996
 997 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
 998                                                   const char *);
 999 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
1000
1001 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
1002 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
1003
1004 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1005 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1006
1007 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1008                                             CUjitInputType Type, void *Data,
1009                                             size_t Size, const char *Name,
1010                                             unsigned int NumOptions,
1011                                             CUjit_option *Options,
1012                                             void **OptionValues);
1013 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1014
1015 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1016                                            CUjit_option *Options,
1017                                            void **OptionValues,
1018                                            CUlinkState *StateOut);
1019 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1020
1021 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1022                                              size_t *SizeOut);
1023 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1024
1025 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1026 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1027
1028 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1029 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1030
1031 /* Type-defines of function pointer ot CUDA runtime APIs. */
1032 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1033 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1034
1035 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1036   char *Err;
1037   void *FuncPtr;
1038   dlerror();
1039   FuncPtr = dlsym(Handle, FuncName);
1040   if ((Err = dlerror()) != 0) {
1041     fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1042     return 0;
1043   }
1044   return FuncPtr;
1045 }
1046
1047 static int initialDeviceAPILibrariesCUDA() {
1048   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1049   if (!HandleCuda) {
1050     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1051     return 0;
1052   }
1053
1054   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1055   if (!HandleCudaRT) {
1056     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1057     return 0;
1058   }
1059
1060   return 1;
1061 }
1062
1063 /* Get function pointer to CUDA Driver APIs.
1064  *
1065  * Note that compilers conforming to the ISO C standard are required to
1066  * generate a warning if a conversion from a void * pointer to a function
1067  * pointer is attempted as in the following statements. The warning
1068  * of this kind of cast may not be emitted by clang and new versions of gcc
1069  * as it is valid on POSIX 2008. For compilers required to generate a warning,
1070  * we temporarily disable -Wpedantic, to avoid bloating the output with
1071  * unnecessary warnings.
1072  *
1073  * Reference:
1074  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1075  */
1076 #pragma GCC diagnostic push
1077 #pragma GCC diagnostic ignored "-Wpedantic"
1078 static int initialDeviceAPIsCUDA() {
1079   if (initialDeviceAPILibrariesCUDA() == 0)
1080     return 0;
1081
1082   CuLaunchKernelFcnPtr =
1083       (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1084
1085   CuMemAllocFcnPtr =
1086       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1087
1088   CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
1089       HandleCuda, "cuMemAllocManaged");
1090
1091   CuMemFreeFcnPtr =
1092       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1093
1094   CuMemcpyDtoHFcnPtr =
1095       (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1096
1097   CuMemcpyHtoDFcnPtr =
1098       (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1099
1100   CuModuleUnloadFcnPtr =
1101       (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1102
1103   CuProfilerStopFcnPtr =
1104       (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1105
1106   CuCtxDestroyFcnPtr =
1107       (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1108
1109   CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1110
1111   CuDeviceGetCountFcnPtr =
1112       (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1113
1114   CuDeviceGetFcnPtr =
1115       (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1116
1117   CuCtxCreateFcnPtr =
1118       (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1119
1120   CuCtxGetCurrentFcnPtr =
1121       (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1122
1123   CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1124       HandleCuda, "cuModuleLoadDataEx");
1125
1126   CuModuleLoadDataFcnPtr =
1127       (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1128
1129   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1130       HandleCuda, "cuModuleGetFunction");
1131
1132   CuDeviceComputeCapabilityFcnPtr =
1133       (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1134           HandleCuda, "cuDeviceComputeCapability");
1135
1136   CuDeviceGetNameFcnPtr =
1137       (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1138
1139   CuLinkAddDataFcnPtr =
1140       (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1141
1142   CuLinkCreateFcnPtr =
1143       (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1144
1145   CuLinkCompleteFcnPtr =
1146       (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1147
1148   CuLinkDestroyFcnPtr =
1149       (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1150
1151   CuCtxSynchronizeFcnPtr =
1152       (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1153
1154   /* Get function pointer to CUDA Runtime APIs. */
1155   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1156       HandleCudaRT, "cudaThreadSynchronize");
1157
1158   return 1;
1159 }
1160 #pragma GCC diagnostic pop
1161
1162 static PollyGPUContext *initContextCUDA() {
1163   dump_function();
1164   PollyGPUContext *Context;
1165   CUdevice Device;
1166
1167   int Major = 0, Minor = 0, DeviceID = 0;
1168   char DeviceName[256];
1169   int DeviceCount = 0;
1170
1171   static __thread PollyGPUContext *CurrentContext = NULL;
1172
1173   if (CurrentContext)
1174     return CurrentContext;
1175
1176   /* Get API handles. */
1177   if (initialDeviceAPIsCUDA() == 0) {
1178     fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1179     exit(-1);
1180   }
1181
1182   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1183     fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1184     exit(-1);
1185   }
1186
1187   /* Get number of devices that supports CUDA. */
1188   CuDeviceGetCountFcnPtr(&DeviceCount);
1189   if (DeviceCount == 0) {
1190     fprintf(stderr, "There is no device supporting CUDA.\n");
1191     exit(-1);
1192   }
1193
1194   CuDeviceGetFcnPtr(&Device, 0);
1195
1196   /* Get compute capabilities and the device name. */
1197   CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1198   CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1199   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1200
1201   /* Create context on the device. */
1202   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1203   if (Context == 0) {
1204     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1205     exit(-1);
1206   }
1207   Context->Context = malloc(sizeof(CUDAContext));
1208   if (Context->Context == 0) {
1209     fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1210     exit(-1);
1211   }
1212
1213   // In cases where managed memory is used, it is quite likely that
1214   // `cudaMallocManaged` / `polly_mallocManaged` was called before
1215   // `polly_initContext` was called.
1216   //
1217   // If `polly_initContext` calls `CuCtxCreate` when there already was a
1218   // pre-existing context created by the runtime API, this causes code running
1219   // on P100 to hang. So, we query for a pre-existing context to try and use.
1220   // If there is no pre-existing context, we create a new context
1221
1222   // The possible pre-existing context from previous runtime API calls.
1223   CUcontext MaybeRuntimeAPIContext;
1224   if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1225     fprintf(stderr, "cuCtxGetCurrent failed.\n");
1226     exit(-1);
1227   }
1228
1229   // There was no previous context, initialise it.
1230   if (MaybeRuntimeAPIContext == NULL) {
1231     if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1232                           Device) != CUDA_SUCCESS) {
1233       fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1234       exit(-1);
1235     }
1236   } else {
1237     ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1238   }
1239
1240   if (CacheMode)
1241     CurrentContext = Context;
1242
1243   return Context;
1244 }
1245
1246 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1247   dump_function();
1248
1249   if (CacheMode)
1250     return;
1251
1252   if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1253     CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1254
1255   if (Kernel->Kernel)
1256     free((CUDAKernel *)Kernel->Kernel);
1257
1258   if (Kernel)
1259     free(Kernel);
1260 }
1261
1262 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1263                                        const char *KernelName) {
1264   dump_function();
1265
1266   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1267   static __thread int NextCacheItem = 0;
1268
1269   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1270     // We exploit here the property that all Polly-ACC kernels are allocated
1271     // as global constants, hence a pointer comparision is sufficient to
1272     // determin equality.
1273     if (KernelCache[i] &&
1274         ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1275       debug_print("  -> using cached kernel\n");
1276       return KernelCache[i];
1277     }
1278   }
1279
1280   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1281   if (Function == 0) {
1282     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1283     exit(-1);
1284   }
1285   Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1286   if (Function->Kernel == 0) {
1287     fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1288     exit(-1);
1289   }
1290
1291   CUresult Res;
1292   CUlinkState LState;
1293   CUjit_option Options[6];
1294   void *OptionVals[6];
1295   float Walltime = 0;
1296   unsigned long LogSize = 8192;
1297   char ErrorLog[8192], InfoLog[8192];
1298   void *CuOut;
1299   size_t OutSize;
1300
1301   // Setup linker options
1302   // Return walltime from JIT compilation
1303   Options[0] = CU_JIT_WALL_TIME;
1304   OptionVals[0] = (void *)&Walltime;
1305   // Pass a buffer for info messages
1306   Options[1] = CU_JIT_INFO_LOG_BUFFER;
1307   OptionVals[1] = (void *)InfoLog;
1308   // Pass the size of the info buffer
1309   Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1310   OptionVals[2] = (void *)LogSize;
1311   // Pass a buffer for error message
1312   Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1313   OptionVals[3] = (void *)ErrorLog;
1314   // Pass the size of the error buffer
1315   Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1316   OptionVals[4] = (void *)LogSize;
1317   // Make the linker verbose
1318   Options[5] = CU_JIT_LOG_VERBOSE;
1319   OptionVals[5] = (void *)1;
1320
1321   memset(ErrorLog, 0, sizeof(ErrorLog));
1322
1323   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1324   Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1325                             strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1326   if (Res != CUDA_SUCCESS) {
1327     fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1328     exit(-1);
1329   }
1330
1331   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1332   if (Res != CUDA_SUCCESS) {
1333     fprintf(stderr, "Complete ptx linker step failed.\n");
1334     fprintf(stderr, "\n%s\n", ErrorLog);
1335     exit(-1);
1336   }
1337
1338   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1339               InfoLog);
1340
1341   Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1342                                CuOut);
1343   if (Res != CUDA_SUCCESS) {
1344     fprintf(stderr, "Loading ptx assembly text failed.\n");
1345     exit(-1);
1346   }
1347
1348   Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1349                                   ((CUDAKernel *)Function->Kernel)->CudaModule,
1350                                   KernelName);
1351   if (Res != CUDA_SUCCESS) {
1352     fprintf(stderr, "Loading kernel function failed.\n");
1353     exit(-1);
1354   }
1355
1356   CuLinkDestroyFcnPtr(LState);
1357
1358   ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1359
1360   if (CacheMode) {
1361     if (KernelCache[NextCacheItem])
1362       freeKernelCUDA(KernelCache[NextCacheItem]);
1363
1364     KernelCache[NextCacheItem] = Function;
1365
1366     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1367   }
1368
1369   return Function;
1370 }
1371
1372 static void synchronizeDeviceCUDA() {
1373   dump_function();
1374   if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1375     fprintf(stderr, "Synchronizing device and host memory failed.\n");
1376     exit(-1);
1377   }
1378 }
1379
1380 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1381                                      long MemSize) {
1382   dump_function();
1383
1384   CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1385   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1386 }
1387
1388 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1389                                      long MemSize) {
1390   dump_function();
1391
1392   if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1393                          MemSize) != CUDA_SUCCESS) {
1394     fprintf(stderr, "Copying results from device to host memory failed.\n");
1395     exit(-1);
1396   }
1397 }
1398
1399 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1400                              unsigned int GridDimY, unsigned int BlockDimX,
1401                              unsigned int BlockDimY, unsigned int BlockDimZ,
1402                              void **Parameters) {
1403   dump_function();
1404
1405   unsigned GridDimZ = 1;
1406   unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1407   CUstream Stream = 0;
1408   void **Extra = 0;
1409
1410   CUresult Res;
1411   Res =
1412       CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1413                            GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1414                            SharedMemBytes, Stream, Parameters, Extra);
1415   if (Res != CUDA_SUCCESS) {
1416     fprintf(stderr, "Launching CUDA kernel failed.\n");
1417     exit(-1);
1418   }
1419 }
1420
1421 // Maximum number of managed memory pointers.
1422 #define DEFAULT_MAX_POINTERS 4000
1423 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1424 void **g_managedptrs;
1425 unsigned long long g_nmanagedptrs = 0;
1426 unsigned long long g_maxmanagedptrs = 0;
1427
1428 __attribute__((constructor)) static void initManagedPtrsBuffer() {
1429   g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
1430   const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
1431   if (maxManagedPointersString)
1432     g_maxmanagedptrs = atoll(maxManagedPointersString);
1433
1434   g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
1435 }
1436
1437 // Add a pointer as being allocated by cuMallocManaged
1438 void addManagedPtr(void *mem) {
1439   assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
1440   assert(g_nmanagedptrs < g_maxmanagedptrs &&
1441          "We have hit the maximum number of "
1442          "managed pointers allowed. Set the "
1443          "POLLY_MAX_MANAGED_POINTERS environment variable. ");
1444   g_managedptrs[g_nmanagedptrs++] = mem;
1445 }
1446
1447 int isManagedPtr(void *mem) {
1448   for (unsigned long long i = 0; i < g_nmanagedptrs; i++) {
1449     if (g_managedptrs[i] == mem)
1450       return 1;
1451   }
1452   return 0;
1453 }
1454
1455 void freeManagedCUDA(void *mem) {
1456   dump_function();
1457
1458   // In a real-world program this was used (COSMO), there were more `free`
1459   // calls in the original source than `malloc` calls. Hence, replacing all
1460   // `free`s with `cudaFree` does not work, since we would try to free
1461   // 'illegal' memory.
1462   // As a quick fix, we keep a free list and check if `mem` is a managed memory
1463   // pointer. If it is, we call `cudaFree`.
1464   // If not, we pass it along to the underlying allocator.
1465   // This is a hack, and can be removed if the underlying issue is fixed.
1466   if (isManagedPtr(mem)) {
1467     if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
1468       fprintf(stderr, "cudaFree failed.\n");
1469       exit(-1);
1470     }
1471     return;
1472   } else {
1473     free(mem);
1474   }
1475 }
1476
1477 void *mallocManagedCUDA(size_t size) {
1478   // Note: [Size 0 allocations]
1479   // Sometimes, some runtime computation of size could create a size of 0
1480   // for an allocation. In these cases, we do not wish to fail.
1481   // The CUDA API fails on size 0 allocations.
1482   // So, we allocate size a minimum of size 1.
1483   if (!size && DebugMode)
1484     fprintf(stderr, "cudaMallocManaged called with size 0. "
1485                     "Promoting to size 1");
1486   size = max(size, 1);
1487   PollyGPUContext *_ = polly_initContextCUDA();
1488   assert(_ && "polly_initContextCUDA failed");
1489
1490   void *newMemPtr;
1491   const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size,
1492                                                CU_MEM_ATTACH_GLOBAL);
1493   if (Res != CUDA_SUCCESS) {
1494     fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1495     exit(-1);
1496   }
1497   addManagedPtr(newMemPtr);
1498   return newMemPtr;
1499 }
1500
1501 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1502   dump_function();
1503   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1504   CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1505   free(DevPtr);
1506   free(Allocation);
1507 }
1508
1509 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1510   if (!MemSize && DebugMode)
1511     fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
1512                     "Promoting to size 1");
1513   // see: [Size 0 allocations]
1514   MemSize = max(MemSize, 1);
1515   dump_function();
1516
1517   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1518   if (DevData == 0) {
1519     fprintf(stderr,
1520             "Allocate memory for GPU device memory pointer failed."
1521             " Line: %d | Size: %ld\n",
1522             __LINE__, MemSize);
1523     exit(-1);
1524   }
1525   DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1526   if (DevData->DevicePtr == 0) {
1527     fprintf(stderr,
1528             "Allocate memory for GPU device memory pointer failed."
1529             " Line: %d | Size: %ld\n",
1530             __LINE__, MemSize);
1531     exit(-1);
1532   }
1533
1534   CUresult Res =
1535       CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1536
1537   if (Res != CUDA_SUCCESS) {
1538     fprintf(stderr,
1539             "Allocate memory for GPU device memory pointer failed."
1540             " Line: %d | Size: %ld\n",
1541             __LINE__, MemSize);
1542     exit(-1);
1543   }
1544
1545   return DevData;
1546 }
1547
1548 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1549   dump_function();
1550
1551   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1552   return (void *)DevPtr->Cuda;
1553 }
1554
1555 static void freeContextCUDA(PollyGPUContext *Context) {
1556   dump_function();
1557
1558   CUDAContext *Ctx = (CUDAContext *)Context->Context;
1559   if (Ctx->Cuda) {
1560     CuProfilerStopFcnPtr();
1561     CuCtxDestroyFcnPtr(Ctx->Cuda);
1562     free(Ctx);
1563     free(Context);
1564   }
1565
1566   dlclose(HandleCuda);
1567   dlclose(HandleCudaRT);
1568 }
1569
1570 #endif /* HAS_LIBCUDART */
1571 /******************************************************************************/
1572 /*                                    API                                     */
1573 /******************************************************************************/
1574
1575 PollyGPUContext *polly_initContext() {
1576   DebugMode = getenv("POLLY_DEBUG") != 0;
1577   CacheMode = getenv("POLLY_NOCACHE") == 0;
1578
1579   dump_function();
1580
1581   PollyGPUContext *Context;
1582
1583   switch (Runtime) {
1584 #ifdef HAS_LIBCUDART
1585   case RUNTIME_CUDA:
1586     Context = initContextCUDA();
1587     break;
1588 #endif /* HAS_LIBCUDART */
1589 #ifdef HAS_LIBOPENCL
1590   case RUNTIME_CL:
1591     Context = initContextCL();
1592     break;
1593 #endif /* HAS_LIBOPENCL */
1594   default:
1595     err_runtime();
1596   }
1597
1598   return Context;
1599 }
1600
1601 void polly_freeKernel(PollyGPUFunction *Kernel) {
1602   dump_function();
1603
1604   switch (Runtime) {
1605 #ifdef HAS_LIBCUDART
1606   case RUNTIME_CUDA:
1607     freeKernelCUDA(Kernel);
1608     break;
1609 #endif /* HAS_LIBCUDART */
1610 #ifdef HAS_LIBOPENCL
1611   case RUNTIME_CL:
1612     freeKernelCL(Kernel);
1613     break;
1614 #endif /* HAS_LIBOPENCL */
1615   default:
1616     err_runtime();
1617   }
1618 }
1619
1620 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1621                                   const char *KernelName) {
1622   dump_function();
1623
1624   PollyGPUFunction *Function;
1625
1626   switch (Runtime) {
1627 #ifdef HAS_LIBCUDART
1628   case RUNTIME_CUDA:
1629     Function = getKernelCUDA(BinaryBuffer, KernelName);
1630     break;
1631 #endif /* HAS_LIBCUDART */
1632 #ifdef HAS_LIBOPENCL
1633   case RUNTIME_CL:
1634     Function = getKernelCL(BinaryBuffer, KernelName);
1635     break;
1636 #endif /* HAS_LIBOPENCL */
1637   default:
1638     err_runtime();
1639   }
1640
1641   return Function;
1642 }
1643
1644 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1645                                 long MemSize) {
1646   dump_function();
1647
1648   switch (Runtime) {
1649 #ifdef HAS_LIBCUDART
1650   case RUNTIME_CUDA:
1651     copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1652     break;
1653 #endif /* HAS_LIBCUDART */
1654 #ifdef HAS_LIBOPENCL
1655   case RUNTIME_CL:
1656     copyFromHostToDeviceCL(HostData, DevData, MemSize);
1657     break;
1658 #endif /* HAS_LIBOPENCL */
1659   default:
1660     err_runtime();
1661   }
1662 }
1663
1664 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1665                                 long MemSize) {
1666   dump_function();
1667
1668   switch (Runtime) {
1669 #ifdef HAS_LIBCUDART
1670   case RUNTIME_CUDA:
1671     copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1672     break;
1673 #endif /* HAS_LIBCUDART */
1674 #ifdef HAS_LIBOPENCL
1675   case RUNTIME_CL:
1676     copyFromDeviceToHostCL(DevData, HostData, MemSize);
1677     break;
1678 #endif /* HAS_LIBOPENCL */
1679   default:
1680     err_runtime();
1681   }
1682 }
1683
1684 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1685                         unsigned int GridDimY, unsigned int BlockDimX,
1686                         unsigned int BlockDimY, unsigned int BlockDimZ,
1687                         void **Parameters) {
1688   dump_function();
1689
1690   switch (Runtime) {
1691 #ifdef HAS_LIBCUDART
1692   case RUNTIME_CUDA:
1693     launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1694                      BlockDimZ, Parameters);
1695     break;
1696 #endif /* HAS_LIBCUDART */
1697 #ifdef HAS_LIBOPENCL
1698   case RUNTIME_CL:
1699     launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1700                    Parameters);
1701     break;
1702 #endif /* HAS_LIBOPENCL */
1703   default:
1704     err_runtime();
1705   }
1706 }
1707
1708 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1709   dump_function();
1710
1711   switch (Runtime) {
1712 #ifdef HAS_LIBCUDART
1713   case RUNTIME_CUDA:
1714     freeDeviceMemoryCUDA(Allocation);
1715     break;
1716 #endif /* HAS_LIBCUDART */
1717 #ifdef HAS_LIBOPENCL
1718   case RUNTIME_CL:
1719     freeDeviceMemoryCL(Allocation);
1720     break;
1721 #endif /* HAS_LIBOPENCL */
1722   default:
1723     err_runtime();
1724   }
1725 }
1726
1727 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1728   dump_function();
1729
1730   PollyGPUDevicePtr *DevData;
1731
1732   switch (Runtime) {
1733 #ifdef HAS_LIBCUDART
1734   case RUNTIME_CUDA:
1735     DevData = allocateMemoryForDeviceCUDA(MemSize);
1736     break;
1737 #endif /* HAS_LIBCUDART */
1738 #ifdef HAS_LIBOPENCL
1739   case RUNTIME_CL:
1740     DevData = allocateMemoryForDeviceCL(MemSize);
1741     break;
1742 #endif /* HAS_LIBOPENCL */
1743   default:
1744     err_runtime();
1745   }
1746
1747   return DevData;
1748 }
1749
1750 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1751   dump_function();
1752
1753   void *DevPtr;
1754
1755   switch (Runtime) {
1756 #ifdef HAS_LIBCUDART
1757   case RUNTIME_CUDA:
1758     DevPtr = getDevicePtrCUDA(Allocation);
1759     break;
1760 #endif /* HAS_LIBCUDART */
1761 #ifdef HAS_LIBOPENCL
1762   case RUNTIME_CL:
1763     DevPtr = getDevicePtrCL(Allocation);
1764     break;
1765 #endif /* HAS_LIBOPENCL */
1766   default:
1767     err_runtime();
1768   }
1769
1770   return DevPtr;
1771 }
1772
1773 void polly_synchronizeDevice() {
1774   dump_function();
1775
1776   switch (Runtime) {
1777 #ifdef HAS_LIBCUDART
1778   case RUNTIME_CUDA:
1779     synchronizeDeviceCUDA();
1780     break;
1781 #endif /* HAS_LIBCUDART */
1782 #ifdef HAS_LIBOPENCL
1783   case RUNTIME_CL:
1784     synchronizeDeviceCL();
1785     break;
1786 #endif /* HAS_LIBOPENCL */
1787   default:
1788     err_runtime();
1789   }
1790 }
1791
1792 void polly_freeContext(PollyGPUContext *Context) {
1793   dump_function();
1794
1795   if (CacheMode)
1796     return;
1797
1798   switch (Runtime) {
1799 #ifdef HAS_LIBCUDART
1800   case RUNTIME_CUDA:
1801     freeContextCUDA(Context);
1802     break;
1803 #endif /* HAS_LIBCUDART */
1804 #ifdef HAS_LIBOPENCL
1805   case RUNTIME_CL:
1806     freeContextCL(Context);
1807     break;
1808 #endif /* HAS_LIBOPENCL */
1809   default:
1810     err_runtime();
1811   }
1812 }
1813
1814 void polly_freeManaged(void *mem) {
1815   dump_function();
1816
1817 #ifdef HAS_LIBCUDART
1818   freeManagedCUDA(mem);
1819 #else
1820   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1821   exit(-1);
1822 #endif
1823 }
1824
1825 void *polly_mallocManaged(size_t size) {
1826   dump_function();
1827
1828 #ifdef HAS_LIBCUDART
1829   return mallocManagedCUDA(size);
1830 #else
1831   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
1832   exit(-1);
1833 #endif
1834 }
1835
1836 /* Initialize GPUJIT with CUDA as runtime library. */
1837 PollyGPUContext *polly_initContextCUDA() {
1838 #ifdef HAS_LIBCUDART
1839   Runtime = RUNTIME_CUDA;
1840   return polly_initContext();
1841 #else
1842   fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1843   exit(-1);
1844 #endif /* HAS_LIBCUDART */
1845 }
1846
1847 /* Initialize GPUJIT with OpenCL as runtime library. */
1848 PollyGPUContext *polly_initContextCL() {
1849 #ifdef HAS_LIBOPENCL
1850   Runtime = RUNTIME_CL;
1851   return polly_initContext();
1852 #else
1853   fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1854   exit(-1);
1855 #endif /* HAS_LIBOPENCL */
1856 }