tools/GPURuntime/GPUJIT.c

   1 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
   2 /*                                                                            */
   3 /*                     The LLVM Compiler Infrastructure                       */
   4 /*                                                                            */
   5 /* This file is dual licensed under the MIT and the University of Illinois    */
   6 /* Open Source License. See LICENSE.TXT for details.                          */
   7 /*                                                                            */
   8 /******************************************************************************/
   9 /*                                                                            */
  10 /*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
  11 /*                                                                            */
  12 /******************************************************************************/
  13
  14 #include "GPUJIT.h"
  15
  16 #ifdef HAS_LIBCUDART
  17 #include <cuda.h>
  18 #include <cuda_runtime.h>
  19 #endif /* HAS_LIBCUDART */
  20
  21 #ifdef HAS_LIBOPENCL
  22 #ifdef __APPLE__
  23 #include <OpenCL/opencl.h>
  24 #else
  25 #include <CL/cl.h>
  26 #endif /* __APPLE__ */
  27 #endif /* HAS_LIBOPENCL */
  28
  29 #include <assert.h>
  30 #include <dlfcn.h>
  31 #include <stdarg.h>
  32 #include <stdio.h>
  33 #include <string.h>
  34 #include <unistd.h>
  35
  36 static int DebugMode;
  37 static int CacheMode;
  38
  39 static PollyGPURuntime Runtime = RUNTIME_NONE;
  40
  41 static void debug_print(const char *format, ...) {
  42   if (!DebugMode)
  43     return;
  44
  45   va_list args;
  46   va_start(args, format);
  47   vfprintf(stderr, format, args);
  48   va_end(args);
  49 }
  50 #define dump_function() debug_print("-> %s\n", __func__)
  51
  52 #define KERNEL_CACHE_SIZE 10
  53
  54 static void err_runtime() __attribute__((noreturn));
  55 static void err_runtime() {
  56   fprintf(stderr, "Runtime not correctly initialized.\n");
  57   exit(-1);
  58 }
  59
  60 struct PollyGPUContextT {
  61   void *Context;
  62 };
  63
  64 struct PollyGPUFunctionT {
  65   void *Kernel;
  66 };
  67
  68 struct PollyGPUDevicePtrT {
  69   void *DevicePtr;
  70 };
  71
  72 /******************************************************************************/
  73 /*                                  OpenCL                                    */
  74 /******************************************************************************/
  75 #ifdef HAS_LIBOPENCL
  76
  77 struct OpenCLContextT {
  78   cl_context Context;
  79   cl_command_queue CommandQueue;
  80 };
  81
  82 struct OpenCLKernelT {
  83   cl_kernel Kernel;
  84   cl_program Program;
  85   const char *BinaryString;
  86 };
  87
  88 struct OpenCLDevicePtrT {
  89   cl_mem MemObj;
  90 };
  91
  92 /* Dynamic library handles for the OpenCL runtime library. */
  93 static void *HandleOpenCL;
  94 static void *HandleOpenCLBeignet;
  95
  96 /* Type-defines of function pointer to OpenCL Runtime API. */
  97 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
  98                                      cl_platform_id *Platforms,
  99                                      cl_uint *NumPlatforms);
 100 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
 101
 102 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
 103                                    cl_device_type DeviceType,
 104                                    cl_uint NumEntries, cl_device_id *Devices,
 105                                    cl_uint *NumDevices);
 106 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
 107
 108 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
 109                                     cl_device_info ParamName,
 110                                     size_t ParamValueSize, void *ParamValue,
 111                                     size_t *ParamValueSizeRet);
 112 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
 113
 114 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
 115                                     size_t ParamValueSize, void *ParamValue,
 116                                     size_t *ParamValueSizeRet);
 117 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
 118
 119 typedef cl_context clCreateContextFcnTy(
 120     const cl_context_properties *Properties, cl_uint NumDevices,
 121     const cl_device_id *Devices,
 122     void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
 123                                  size_t CB, void *UserData),
 124     void *UserData, cl_int *ErrcodeRet);
 125 static clCreateContextFcnTy *clCreateContextFcnPtr;
 126
 127 typedef cl_command_queue
 128 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
 129                           cl_command_queue_properties Properties,
 130                           cl_int *ErrcodeRet);
 131 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
 132
 133 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
 134                                    size_t Size, void *HostPtr,
 135                                    cl_int *ErrcodeRet);
 136 static clCreateBufferFcnTy *clCreateBufferFcnPtr;
 137
 138 typedef cl_int
 139 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
 140                           cl_bool BlockingWrite, size_t Offset, size_t Size,
 141                           const void *Ptr, cl_uint NumEventsInWaitList,
 142                           const cl_event *EventWaitList, cl_event *Event);
 143 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 144
 145 typedef cl_program
 146 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
 147                                   const cl_device_id *DeviceList,
 148                                   const char *Filename, cl_int *ErrcodeRet);
 149 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
 150
 151 typedef cl_program clCreateProgramWithBinaryFcnTy(
 152     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
 153     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
 154     cl_int *ErrcodeRet);
 155 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
 156
 157 typedef cl_int clBuildProgramFcnTy(
 158     cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
 159     const char *Options,
 160     void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
 161     void *UserData);
 162 static clBuildProgramFcnTy *clBuildProgramFcnPtr;
 163
 164 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
 165                                       const char *KernelName,
 166                                       cl_int *ErrcodeRet);
 167 static clCreateKernelFcnTy *clCreateKernelFcnPtr;
 168
 169 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
 170                                    size_t ArgSize, const void *ArgValue);
 171 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
 172
 173 typedef cl_int clEnqueueNDRangeKernelFcnTy(
 174     cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
 175     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
 176     const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
 177     const cl_event *EventWaitList, cl_event *Event);
 178 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
 179
 180 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
 181                                         cl_mem Buffer, cl_bool BlockingRead,
 182                                         size_t Offset, size_t Size, void *Ptr,
 183                                         cl_uint NumEventsInWaitList,
 184                                         const cl_event *EventWaitList,
 185                                         cl_event *Event);
 186 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
 187
 188 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
 189 static clFlushFcnTy *clFlushFcnPtr;
 190
 191 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
 192 static clFinishFcnTy *clFinishFcnPtr;
 193
 194 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
 195 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
 196
 197 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
 198 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
 199
 200 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
 201 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
 202
 203 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
 204 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
 205
 206 typedef cl_int clReleaseContextFcnTy(cl_context Context);
 207 static clReleaseContextFcnTy *clReleaseContextFcnPtr;
 208
 209 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
 210   char *Err;
 211   void *FuncPtr;
 212   dlerror();
 213   FuncPtr = dlsym(Handle, FuncName);
 214   if ((Err = dlerror()) != 0) {
 215     fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
 216     return 0;
 217   }
 218   return FuncPtr;
 219 }
 220
 221 static int initialDeviceAPILibrariesCL() {
 222   HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
 223   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
 224   if (!HandleOpenCL) {
 225     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
 226     return 0;
 227   }
 228   return 1;
 229 }
 230
 231 /* Get function pointer to OpenCL Runtime API.
 232  *
 233  * Note that compilers conforming to the ISO C standard are required to
 234  * generate a warning if a conversion from a void * pointer to a function
 235  * pointer is attempted as in the following statements. The warning
 236  * of this kind of cast may not be emitted by clang and new versions of gcc
 237  * as it is valid on POSIX 2008. For compilers required to generate a warning,
 238  * we temporarily disable -Wpedantic, to avoid bloating the output with
 239  * unnecessary warnings.
 240  *
 241  * Reference:
 242  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
 243  */
 244 #pragma GCC diagnostic push
 245 #pragma GCC diagnostic ignored "-Wpedantic"
 246 static int initialDeviceAPIsCL() {
 247   if (initialDeviceAPILibrariesCL() == 0)
 248     return 0;
 249
 250   // FIXME: We are now always selecting the Intel Beignet driver if it is
 251   // available on the system, instead of a possible NVIDIA or AMD OpenCL
 252   // API. This selection should occurr based on the target architecture
 253   // chosen when compiling.
 254   void *Handle =
 255       (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
 256
 257   clGetPlatformIDsFcnPtr =
 258       (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
 259
 260   clGetDeviceIDsFcnPtr =
 261       (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
 262
 263   clGetDeviceInfoFcnPtr =
 264       (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
 265
 266   clGetKernelInfoFcnPtr =
 267       (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
 268
 269   clCreateContextFcnPtr =
 270       (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
 271
 272   clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
 273       Handle, "clCreateCommandQueue");
 274
 275   clCreateBufferFcnPtr =
 276       (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
 277
 278   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
 279       Handle, "clEnqueueWriteBuffer");
 280
 281   if (HandleOpenCLBeignet)
 282     clCreateProgramWithLLVMIntelFcnPtr =
 283         (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
 284             Handle, "clCreateProgramWithLLVMIntel");
 285
 286   clCreateProgramWithBinaryFcnPtr =
 287       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
 288           Handle, "clCreateProgramWithBinary");
 289
 290   clBuildProgramFcnPtr =
 291       (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
 292
 293   clCreateKernelFcnPtr =
 294       (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
 295
 296   clSetKernelArgFcnPtr =
 297       (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
 298
 299   clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
 300       Handle, "clEnqueueNDRangeKernel");
 301
 302   clEnqueueReadBufferFcnPtr =
 303       (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
 304
 305   clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
 306
 307   clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
 308
 309   clReleaseKernelFcnPtr =
 310       (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
 311
 312   clReleaseProgramFcnPtr =
 313       (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
 314
 315   clReleaseMemObjectFcnPtr =
 316       (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
 317
 318   clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
 319       Handle, "clReleaseCommandQueue");
 320
 321   clReleaseContextFcnPtr =
 322       (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
 323
 324   return 1;
 325 }
 326 #pragma GCC diagnostic pop
 327
 328 /* Context and Device. */
 329 static PollyGPUContext *GlobalContext = NULL;
 330 static cl_device_id GlobalDeviceID = NULL;
 331
 332 /* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
 333 static void printOpenCLError(int Error);
 334
 335 static void checkOpenCLError(int Ret, const char *format, ...) {
 336   if (Ret == CL_SUCCESS)
 337     return;
 338
 339   printOpenCLError(Ret);
 340   va_list args;
 341   va_start(args, format);
 342   vfprintf(stderr, format, args);
 343   va_end(args);
 344   exit(-1);
 345 }
 346
 347 static PollyGPUContext *initContextCL() {
 348   dump_function();
 349
 350   PollyGPUContext *Context;
 351
 352   cl_platform_id PlatformID = NULL;
 353   cl_device_id DeviceID = NULL;
 354   cl_uint NumDevicesRet;
 355   cl_int Ret;
 356
 357   char DeviceRevision[256];
 358   char DeviceName[256];
 359   size_t DeviceRevisionRetSize, DeviceNameRetSize;
 360
 361   static __thread PollyGPUContext *CurrentContext = NULL;
 362
 363   if (CurrentContext)
 364     return CurrentContext;
 365
 366   /* Get API handles. */
 367   if (initialDeviceAPIsCL() == 0) {
 368     fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
 369     exit(-1);
 370   }
 371
 372   /* Get number of devices that support OpenCL. */
 373   static const int NumberOfPlatforms = 1;
 374   Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
 375   checkOpenCLError(Ret, "Failed to get platform IDs.\n");
 376   // TODO: Extend to CL_DEVICE_TYPE_ALL?
 377   static const int NumberOfDevices = 1;
 378   Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
 379                              &DeviceID, &NumDevicesRet);
 380   checkOpenCLError(Ret, "Failed to get device IDs.\n");
 381
 382   GlobalDeviceID = DeviceID;
 383   if (NumDevicesRet == 0) {
 384     fprintf(stderr, "There is no device supporting OpenCL.\n");
 385     exit(-1);
 386   }
 387
 388   /* Get device revision. */
 389   Ret =
 390       clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
 391                             DeviceRevision, &DeviceRevisionRetSize);
 392   checkOpenCLError(Ret, "Failed to fetch device revision.\n");
 393
 394   /* Get device name. */
 395   Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
 396                               DeviceName, &DeviceNameRetSize);
 397   checkOpenCLError(Ret, "Failed to fetch device name.\n");
 398
 399   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
 400
 401   /* Create context on the device. */
 402   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
 403   if (Context == 0) {
 404     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
 405     exit(-1);
 406   }
 407   Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
 408   if (Context->Context == 0) {
 409     fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
 410     exit(-1);
 411   }
 412   ((OpenCLContext *)Context->Context)->Context =
 413       clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
 414   checkOpenCLError(Ret, "Failed to create context.\n");
 415
 416   static const int ExtraProperties = 0;
 417   ((OpenCLContext *)Context->Context)->CommandQueue =
 418       clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
 419                                  DeviceID, ExtraProperties, &Ret);
 420   checkOpenCLError(Ret, "Failed to create command queue.\n");
 421
 422   if (CacheMode)
 423     CurrentContext = Context;
 424
 425   GlobalContext = Context;
 426   return Context;
 427 }
 428
 429 static void freeKernelCL(PollyGPUFunction *Kernel) {
 430   dump_function();
 431
 432   if (CacheMode)
 433     return;
 434
 435   if (!GlobalContext) {
 436     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 437     exit(-1);
 438   }
 439
 440   cl_int Ret;
 441   Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
 442   checkOpenCLError(Ret, "Failed to flush command queue.\n");
 443   Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
 444   checkOpenCLError(Ret, "Failed to finish command queue.\n");
 445
 446   if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
 447     cl_int Ret =
 448         clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
 449     checkOpenCLError(Ret, "Failed to release kernel.\n");
 450   }
 451
 452   if (((OpenCLKernel *)Kernel->Kernel)->Program) {
 453     cl_int Ret =
 454         clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
 455     checkOpenCLError(Ret, "Failed to release program.\n");
 456   }
 457
 458   if (Kernel->Kernel)
 459     free((OpenCLKernel *)Kernel->Kernel);
 460
 461   if (Kernel)
 462     free(Kernel);
 463 }
 464
 465 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
 466                                      const char *KernelName) {
 467   dump_function();
 468
 469   if (!GlobalContext) {
 470     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 471     exit(-1);
 472   }
 473
 474   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
 475   static __thread int NextCacheItem = 0;
 476
 477   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
 478     // We exploit here the property that all Polly-ACC kernels are allocated
 479     // as global constants, hence a pointer comparision is sufficient to
 480     // determin equality.
 481     if (KernelCache[i] &&
 482         ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
 483             BinaryBuffer) {
 484       debug_print("  -> using cached kernel\n");
 485       return KernelCache[i];
 486     }
 487   }
 488
 489   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
 490   if (Function == 0) {
 491     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
 492     exit(-1);
 493   }
 494   Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
 495   if (Function->Kernel == 0) {
 496     fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
 497     exit(-1);
 498   }
 499
 500   if (!GlobalDeviceID) {
 501     fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
 502     exit(-1);
 503   }
 504
 505   cl_int Ret;
 506
 507   if (HandleOpenCLBeignet) {
 508     // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
 509     // accepts a filename to a valid llvm-ir file as an argument, instead
 510     // of accepting the BinaryBuffer directly.
 511     FILE *fp = fopen("kernel.ll", "wb");
 512     if (fp != NULL) {
 513       fputs(BinaryBuffer, fp);
 514       fclose(fp);
 515     }
 516
 517     ((OpenCLKernel *)Function->Kernel)->Program =
 518         clCreateProgramWithLLVMIntelFcnPtr(
 519             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
 520             &GlobalDeviceID, "kernel.ll", &Ret);
 521     checkOpenCLError(Ret, "Failed to create program from llvm.\n");
 522     unlink("kernel.ll");
 523   } else {
 524     size_t BinarySize = strlen(BinaryBuffer);
 525     ((OpenCLKernel *)Function->Kernel)->Program =
 526         clCreateProgramWithBinaryFcnPtr(
 527             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
 528             &GlobalDeviceID, (const size_t *)&BinarySize,
 529             (const unsigned char **)&BinaryBuffer, NULL, &Ret);
 530     checkOpenCLError(Ret, "Failed to create program from binary.\n");
 531   }
 532
 533   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
 534                              &GlobalDeviceID, NULL, NULL, NULL);
 535   checkOpenCLError(Ret, "Failed to build program.\n");
 536
 537   ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
 538       ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
 539   checkOpenCLError(Ret, "Failed to create kernel.\n");
 540
 541   ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
 542
 543   if (CacheMode) {
 544     if (KernelCache[NextCacheItem])
 545       freeKernelCL(KernelCache[NextCacheItem]);
 546
 547     KernelCache[NextCacheItem] = Function;
 548
 549     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
 550   }
 551
 552   return Function;
 553 }
 554
 555 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
 556                                    long MemSize) {
 557   dump_function();
 558
 559   if (!GlobalContext) {
 560     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 561     exit(-1);
 562   }
 563
 564   cl_int Ret;
 565   Ret = clEnqueueWriteBufferFcnPtr(
 566       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
 567       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
 568       HostData, 0, NULL, NULL);
 569   checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
 570 }
 571
 572 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
 573                                    long MemSize) {
 574   dump_function();
 575
 576   if (!GlobalContext) {
 577     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 578     exit(-1);
 579   }
 580
 581   cl_int Ret;
 582   Ret = clEnqueueReadBufferFcnPtr(
 583       ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
 584       ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
 585       HostData, 0, NULL, NULL);
 586   checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
 587 }
 588
 589 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
 590                            unsigned int GridDimY, unsigned int BlockDimX,
 591                            unsigned int BlockDimY, unsigned int BlockDimZ,
 592                            void **Parameters) {
 593   dump_function();
 594
 595   cl_int Ret;
 596   cl_uint NumArgs;
 597
 598   if (!GlobalContext) {
 599     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 600     exit(-1);
 601   }
 602
 603   OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
 604   Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
 605                               sizeof(cl_uint), &NumArgs, NULL);
 606   checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
 607
 608   /* Argument sizes are stored at the end of the Parameters array. */
 609   for (cl_uint i = 0; i < NumArgs; i++) {
 610     Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
 611                                *((int *)Parameters[NumArgs + i]),
 612                                (void *)Parameters[i]);
 613     checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
 614   }
 615
 616   unsigned int GridDimZ = 1;
 617   size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
 618                               BlockDimZ * GridDimZ};
 619   size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
 620
 621   static const int WorkDim = 3;
 622   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
 623   Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
 624                                      WorkDim, NULL, GlobalWorkSize,
 625                                      LocalWorkSize, 0, NULL, NULL);
 626   checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
 627 }
 628
 629 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
 630   dump_function();
 631
 632   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
 633   cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
 634   checkOpenCLError(Ret, "Failed to free device memory.\n");
 635
 636   free(DevPtr);
 637   free(Allocation);
 638 }
 639
 640 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
 641   dump_function();
 642
 643   if (!GlobalContext) {
 644     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 645     exit(-1);
 646   }
 647
 648   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
 649   if (DevData == 0) {
 650     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
 651     exit(-1);
 652   }
 653   DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
 654   if (DevData->DevicePtr == 0) {
 655     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
 656     exit(-1);
 657   }
 658
 659   cl_int Ret;
 660   ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
 661       clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
 662                            CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
 663   checkOpenCLError(Ret,
 664                    "Allocate memory for GPU device memory pointer failed.\n");
 665
 666   return DevData;
 667 }
 668
 669 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
 670   dump_function();
 671
 672   OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
 673   return (void *)DevPtr->MemObj;
 674 }
 675
 676 static void synchronizeDeviceCL() {
 677   dump_function();
 678
 679   if (!GlobalContext) {
 680     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
 681     exit(-1);
 682   }
 683
 684   if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
 685       CL_SUCCESS) {
 686     fprintf(stderr, "Synchronizing device and host memory failed.\n");
 687     exit(-1);
 688   }
 689 }
 690
 691 static void freeContextCL(PollyGPUContext *Context) {
 692   dump_function();
 693
 694   cl_int Ret;
 695
 696   GlobalContext = NULL;
 697
 698   OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
 699   if (Ctx->CommandQueue) {
 700     Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
 701     checkOpenCLError(Ret, "Could not release command queue.\n");
 702   }
 703
 704   if (Ctx->Context) {
 705     Ret = clReleaseContextFcnPtr(Ctx->Context);
 706     checkOpenCLError(Ret, "Could not release context.\n");
 707   }
 708
 709   free(Ctx);
 710   free(Context);
 711 }
 712
 713 static void printOpenCLError(int Error) {
 714
 715   switch (Error) {
 716   case CL_SUCCESS:
 717     // Success, don't print an error.
 718     break;
 719
 720   // JIT/Runtime errors.
 721   case CL_DEVICE_NOT_FOUND:
 722     fprintf(stderr, "Device not found.\n");
 723     break;
 724   case CL_DEVICE_NOT_AVAILABLE:
 725     fprintf(stderr, "Device not available.\n");
 726     break;
 727   case CL_COMPILER_NOT_AVAILABLE:
 728     fprintf(stderr, "Compiler not available.\n");
 729     break;
 730   case CL_MEM_OBJECT_ALLOCATION_FAILURE:
 731     fprintf(stderr, "Mem object allocation failure.\n");
 732     break;
 733   case CL_OUT_OF_RESOURCES:
 734     fprintf(stderr, "Out of resources.\n");
 735     break;
 736   case CL_OUT_OF_HOST_MEMORY:
 737     fprintf(stderr, "Out of host memory.\n");
 738     break;
 739   case CL_PROFILING_INFO_NOT_AVAILABLE:
 740     fprintf(stderr, "Profiling info not available.\n");
 741     break;
 742   case CL_MEM_COPY_OVERLAP:
 743     fprintf(stderr, "Mem copy overlap.\n");
 744     break;
 745   case CL_IMAGE_FORMAT_MISMATCH:
 746     fprintf(stderr, "Image format mismatch.\n");
 747     break;
 748   case CL_IMAGE_FORMAT_NOT_SUPPORTED:
 749     fprintf(stderr, "Image format not supported.\n");
 750     break;
 751   case CL_BUILD_PROGRAM_FAILURE:
 752     fprintf(stderr, "Build program failure.\n");
 753     break;
 754   case CL_MAP_FAILURE:
 755     fprintf(stderr, "Map failure.\n");
 756     break;
 757   case CL_MISALIGNED_SUB_BUFFER_OFFSET:
 758     fprintf(stderr, "Misaligned sub buffer offset.\n");
 759     break;
 760   case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
 761     fprintf(stderr, "Exec status error for events in wait list.\n");
 762     break;
 763   case CL_COMPILE_PROGRAM_FAILURE:
 764     fprintf(stderr, "Compile program failure.\n");
 765     break;
 766   case CL_LINKER_NOT_AVAILABLE:
 767     fprintf(stderr, "Linker not available.\n");
 768     break;
 769   case CL_LINK_PROGRAM_FAILURE:
 770     fprintf(stderr, "Link program failure.\n");
 771     break;
 772   case CL_DEVICE_PARTITION_FAILED:
 773     fprintf(stderr, "Device partition failed.\n");
 774     break;
 775   case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
 776     fprintf(stderr, "Kernel arg info not available.\n");
 777     break;
 778
 779   // Compiler errors.
 780   case CL_INVALID_VALUE:
 781     fprintf(stderr, "Invalid value.\n");
 782     break;
 783   case CL_INVALID_DEVICE_TYPE:
 784     fprintf(stderr, "Invalid device type.\n");
 785     break;
 786   case CL_INVALID_PLATFORM:
 787     fprintf(stderr, "Invalid platform.\n");
 788     break;
 789   case CL_INVALID_DEVICE:
 790     fprintf(stderr, "Invalid device.\n");
 791     break;
 792   case CL_INVALID_CONTEXT:
 793     fprintf(stderr, "Invalid context.\n");
 794     break;
 795   case CL_INVALID_QUEUE_PROPERTIES:
 796     fprintf(stderr, "Invalid queue properties.\n");
 797     break;
 798   case CL_INVALID_COMMAND_QUEUE:
 799     fprintf(stderr, "Invalid command queue.\n");
 800     break;
 801   case CL_INVALID_HOST_PTR:
 802     fprintf(stderr, "Invalid host pointer.\n");
 803     break;
 804   case CL_INVALID_MEM_OBJECT:
 805     fprintf(stderr, "Invalid memory object.\n");
 806     break;
 807   case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
 808     fprintf(stderr, "Invalid image format descriptor.\n");
 809     break;
 810   case CL_INVALID_IMAGE_SIZE:
 811     fprintf(stderr, "Invalid image size.\n");
 812     break;
 813   case CL_INVALID_SAMPLER:
 814     fprintf(stderr, "Invalid sampler.\n");
 815     break;
 816   case CL_INVALID_BINARY:
 817     fprintf(stderr, "Invalid binary.\n");
 818     break;
 819   case CL_INVALID_BUILD_OPTIONS:
 820     fprintf(stderr, "Invalid build options.\n");
 821     break;
 822   case CL_INVALID_PROGRAM:
 823     fprintf(stderr, "Invalid program.\n");
 824     break;
 825   case CL_INVALID_PROGRAM_EXECUTABLE:
 826     fprintf(stderr, "Invalid program executable.\n");
 827     break;
 828   case CL_INVALID_KERNEL_NAME:
 829     fprintf(stderr, "Invalid kernel name.\n");
 830     break;
 831   case CL_INVALID_KERNEL_DEFINITION:
 832     fprintf(stderr, "Invalid kernel definition.\n");
 833     break;
 834   case CL_INVALID_KERNEL:
 835     fprintf(stderr, "Invalid kernel.\n");
 836     break;
 837   case CL_INVALID_ARG_INDEX:
 838     fprintf(stderr, "Invalid arg index.\n");
 839     break;
 840   case CL_INVALID_ARG_VALUE:
 841     fprintf(stderr, "Invalid arg value.\n");
 842     break;
 843   case CL_INVALID_ARG_SIZE:
 844     fprintf(stderr, "Invalid arg size.\n");
 845     break;
 846   case CL_INVALID_KERNEL_ARGS:
 847     fprintf(stderr, "Invalid kernel args.\n");
 848     break;
 849   case CL_INVALID_WORK_DIMENSION:
 850     fprintf(stderr, "Invalid work dimension.\n");
 851     break;
 852   case CL_INVALID_WORK_GROUP_SIZE:
 853     fprintf(stderr, "Invalid work group size.\n");
 854     break;
 855   case CL_INVALID_WORK_ITEM_SIZE:
 856     fprintf(stderr, "Invalid work item size.\n");
 857     break;
 858   case CL_INVALID_GLOBAL_OFFSET:
 859     fprintf(stderr, "Invalid global offset.\n");
 860     break;
 861   case CL_INVALID_EVENT_WAIT_LIST:
 862     fprintf(stderr, "Invalid event wait list.\n");
 863     break;
 864   case CL_INVALID_EVENT:
 865     fprintf(stderr, "Invalid event.\n");
 866     break;
 867   case CL_INVALID_OPERATION:
 868     fprintf(stderr, "Invalid operation.\n");
 869     break;
 870   case CL_INVALID_GL_OBJECT:
 871     fprintf(stderr, "Invalid GL object.\n");
 872     break;
 873   case CL_INVALID_BUFFER_SIZE:
 874     fprintf(stderr, "Invalid buffer size.\n");
 875     break;
 876   case CL_INVALID_MIP_LEVEL:
 877     fprintf(stderr, "Invalid mip level.\n");
 878     break;
 879   case CL_INVALID_GLOBAL_WORK_SIZE:
 880     fprintf(stderr, "Invalid global work size.\n");
 881     break;
 882   case CL_INVALID_PROPERTY:
 883     fprintf(stderr, "Invalid property.\n");
 884     break;
 885   case CL_INVALID_IMAGE_DESCRIPTOR:
 886     fprintf(stderr, "Invalid image descriptor.\n");
 887     break;
 888   case CL_INVALID_COMPILER_OPTIONS:
 889     fprintf(stderr, "Invalid compiler options.\n");
 890     break;
 891   case CL_INVALID_LINKER_OPTIONS:
 892     fprintf(stderr, "Invalid linker options.\n");
 893     break;
 894   case CL_INVALID_DEVICE_PARTITION_COUNT:
 895     fprintf(stderr, "Invalid device partition count.\n");
 896     break;
 897   case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
 898     fprintf(stderr, "Invalid pipe size.\n");
 899     break;
 900   case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
 901     fprintf(stderr, "Invalid device queue.\n");
 902     break;
 903
 904   // NVIDIA specific error.
 905   case -9999:
 906     fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
 907     break;
 908
 909   default:
 910     fprintf(stderr, "Unknown error code!\n");
 911     break;
 912   }
 913 }
 914
 915 #endif /* HAS_LIBOPENCL */
 916 /******************************************************************************/
 917 /*                                   CUDA                                     */
 918 /******************************************************************************/
 919 #ifdef HAS_LIBCUDART
 920
 921 struct CUDAContextT {
 922   CUcontext Cuda;
 923 };
 924
 925 struct CUDAKernelT {
 926   CUfunction Cuda;
 927   CUmodule CudaModule;
 928   const char *BinaryString;
 929 };
 930
 931 struct CUDADevicePtrT {
 932   CUdeviceptr Cuda;
 933 };
 934
 935 /* Dynamic library handles for the CUDA and CUDA runtime library. */
 936 static void *HandleCuda;
 937 static void *HandleCudaRT;
 938
 939 /* Type-defines of function pointer to CUDA driver APIs. */
 940 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 941 static CuMemAllocFcnTy *CuMemAllocFcnPtr;
 942
 943 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
 944     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
 945     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
 946     unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
 947     void **KernelParams, void **Extra);
 948 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
 949
 950 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
 951 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;
 952
 953 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
 954 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;
 955
 956 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
 957 static CuMemFreeFcnTy *CuMemFreeFcnPtr;
 958
 959 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
 960 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;
 961
 962 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
 963 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;
 964
 965 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
 966 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;
 967
 968 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
 969 static CuInitFcnTy *CuInitFcnPtr;
 970
 971 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
 972 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;
 973
 974 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
 975 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;
 976
 977 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
 978 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;
 979
 980 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
 981 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;
 982
 983 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
 984                                                  unsigned int, CUjit_option *,
 985                                                  void **);
 986 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
 987
 988 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
 989                                                const void *Image);
 990 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
 991
 992 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
 993                                                   const char *);
 994 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
 995
 996 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
 997 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;
 998
 999 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
1000 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
1001
1002 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
1003                                             CUjitInputType Type, void *Data,
1004                                             size_t Size, const char *Name,
1005                                             unsigned int NumOptions,
1006                                             CUjit_option *Options,
1007                                             void **OptionValues);
1008 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
1009
1010 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
1011                                            CUjit_option *Options,
1012                                            void **OptionValues,
1013                                            CUlinkState *StateOut);
1014 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
1015
1016 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
1017                                              size_t *SizeOut);
1018 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
1019
1020 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
1021 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
1022
1023 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
1024 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
1025
1026 /* Type-defines of function pointer ot CUDA runtime APIs. */
1027 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
1028 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
1029
1030 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
1031   char *Err;
1032   void *FuncPtr;
1033   dlerror();
1034   FuncPtr = dlsym(Handle, FuncName);
1035   if ((Err = dlerror()) != 0) {
1036     fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
1037     return 0;
1038   }
1039   return FuncPtr;
1040 }
1041
1042 static int initialDeviceAPILibrariesCUDA() {
1043   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
1044   if (!HandleCuda) {
1045     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1046     return 0;
1047   }
1048
1049   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
1050   if (!HandleCudaRT) {
1051     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
1052     return 0;
1053   }
1054
1055   return 1;
1056 }
1057
1058 /* Get function pointer to CUDA Driver APIs.
1059  *
1060  * Note that compilers conforming to the ISO C standard are required to
1061  * generate a warning if a conversion from a void * pointer to a function
1062  * pointer is attempted as in the following statements. The warning
1063  * of this kind of cast may not be emitted by clang and new versions of gcc
1064  * as it is valid on POSIX 2008. For compilers required to generate a warning,
1065  * we temporarily disable -Wpedantic, to avoid bloating the output with
1066  * unnecessary warnings.
1067  *
1068  * Reference:
1069  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
1070  */
1071 #pragma GCC diagnostic push
1072 #pragma GCC diagnostic ignored "-Wpedantic"
1073 static int initialDeviceAPIsCUDA() {
1074   if (initialDeviceAPILibrariesCUDA() == 0)
1075     return 0;
1076
1077   CuLaunchKernelFcnPtr =
1078       (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
1079
1080   CuMemAllocFcnPtr =
1081       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
1082
1083   CuMemFreeFcnPtr =
1084       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
1085
1086   CuMemcpyDtoHFcnPtr =
1087       (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
1088
1089   CuMemcpyHtoDFcnPtr =
1090       (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
1091
1092   CuModuleUnloadFcnPtr =
1093       (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
1094
1095   CuProfilerStopFcnPtr =
1096       (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");
1097
1098   CuCtxDestroyFcnPtr =
1099       (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
1100
1101   CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
1102
1103   CuDeviceGetCountFcnPtr =
1104       (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
1105
1106   CuDeviceGetFcnPtr =
1107       (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
1108
1109   CuCtxCreateFcnPtr =
1110       (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
1111
1112   CuCtxGetCurrentFcnPtr =
1113       (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");
1114
1115   CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
1116       HandleCuda, "cuModuleLoadDataEx");
1117
1118   CuModuleLoadDataFcnPtr =
1119       (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
1120
1121   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
1122       HandleCuda, "cuModuleGetFunction");
1123
1124   CuDeviceComputeCapabilityFcnPtr =
1125       (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
1126           HandleCuda, "cuDeviceComputeCapability");
1127
1128   CuDeviceGetNameFcnPtr =
1129       (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
1130
1131   CuLinkAddDataFcnPtr =
1132       (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
1133
1134   CuLinkCreateFcnPtr =
1135       (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
1136
1137   CuLinkCompleteFcnPtr =
1138       (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
1139
1140   CuLinkDestroyFcnPtr =
1141       (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
1142
1143   CuCtxSynchronizeFcnPtr =
1144       (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
1145
1146   /* Get function pointer to CUDA Runtime APIs. */
1147   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
1148       HandleCudaRT, "cudaThreadSynchronize");
1149
1150   return 1;
1151 }
1152 #pragma GCC diagnostic pop
1153
1154 static PollyGPUContext *initContextCUDA() {
1155   dump_function();
1156   PollyGPUContext *Context;
1157   CUdevice Device;
1158
1159   int Major = 0, Minor = 0, DeviceID = 0;
1160   char DeviceName[256];
1161   int DeviceCount = 0;
1162
1163   static __thread PollyGPUContext *CurrentContext = NULL;
1164
1165   if (CurrentContext)
1166     return CurrentContext;
1167
1168   /* Get API handles. */
1169   if (initialDeviceAPIsCUDA() == 0) {
1170     fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
1171     exit(-1);
1172   }
1173
1174   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
1175     fprintf(stderr, "Initializing the CUDA driver API failed.\n");
1176     exit(-1);
1177   }
1178
1179   /* Get number of devices that supports CUDA. */
1180   CuDeviceGetCountFcnPtr(&DeviceCount);
1181   if (DeviceCount == 0) {
1182     fprintf(stderr, "There is no device supporting CUDA.\n");
1183     exit(-1);
1184   }
1185
1186   CuDeviceGetFcnPtr(&Device, 0);
1187
1188   /* Get compute capabilities and the device name. */
1189   CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
1190   CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
1191   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
1192
1193   /* Create context on the device. */
1194   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
1195   if (Context == 0) {
1196     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
1197     exit(-1);
1198   }
1199   Context->Context = malloc(sizeof(CUDAContext));
1200   if (Context->Context == 0) {
1201     fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
1202     exit(-1);
1203   }
1204
1205   // In cases where managed memory is used, it is quite likely that
1206   // `cudaMallocManaged` / `polly_mallocManaged` was called before
1207   // `polly_initContext` was called.
1208   //
1209   // If `polly_initContext` calls `CuCtxCreate` when there already was a
1210   // pre-existing context created by the runtime API, this causes code running
1211   // on P100 to hang. So, we query for a pre-existing context to try and use.
1212   // If there is no pre-existing context, we create a new context
1213
1214   // The possible pre-existing context from previous runtime API calls.
1215   CUcontext MaybeRuntimeAPIContext;
1216   if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
1217     fprintf(stderr, "cuCtxGetCurrent failed.\n");
1218     exit(-1);
1219   }
1220
1221   // There was no previous context, initialise it.
1222   if (MaybeRuntimeAPIContext == NULL) {
1223     if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
1224                           Device) != CUDA_SUCCESS) {
1225       fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
1226       exit(-1);
1227     }
1228   } else {
1229     ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
1230   }
1231
1232   if (CacheMode)
1233     CurrentContext = Context;
1234
1235   return Context;
1236 }
1237
1238 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
1239   dump_function();
1240
1241   if (CacheMode)
1242     return;
1243
1244   if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
1245     CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
1246
1247   if (Kernel->Kernel)
1248     free((CUDAKernel *)Kernel->Kernel);
1249
1250   if (Kernel)
1251     free(Kernel);
1252 }
1253
1254 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
1255                                        const char *KernelName) {
1256   dump_function();
1257
1258   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
1259   static __thread int NextCacheItem = 0;
1260
1261   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
1262     // We exploit here the property that all Polly-ACC kernels are allocated
1263     // as global constants, hence a pointer comparision is sufficient to
1264     // determin equality.
1265     if (KernelCache[i] &&
1266         ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
1267       debug_print("  -> using cached kernel\n");
1268       return KernelCache[i];
1269     }
1270   }
1271
1272   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
1273   if (Function == 0) {
1274     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
1275     exit(-1);
1276   }
1277   Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
1278   if (Function->Kernel == 0) {
1279     fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
1280     exit(-1);
1281   }
1282
1283   CUresult Res;
1284   CUlinkState LState;
1285   CUjit_option Options[6];
1286   void *OptionVals[6];
1287   float Walltime = 0;
1288   unsigned long LogSize = 8192;
1289   char ErrorLog[8192], InfoLog[8192];
1290   void *CuOut;
1291   size_t OutSize;
1292
1293   // Setup linker options
1294   // Return walltime from JIT compilation
1295   Options[0] = CU_JIT_WALL_TIME;
1296   OptionVals[0] = (void *)&Walltime;
1297   // Pass a buffer for info messages
1298   Options[1] = CU_JIT_INFO_LOG_BUFFER;
1299   OptionVals[1] = (void *)InfoLog;
1300   // Pass the size of the info buffer
1301   Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
1302   OptionVals[2] = (void *)LogSize;
1303   // Pass a buffer for error message
1304   Options[3] = CU_JIT_ERROR_LOG_BUFFER;
1305   OptionVals[3] = (void *)ErrorLog;
1306   // Pass the size of the error buffer
1307   Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
1308   OptionVals[4] = (void *)LogSize;
1309   // Make the linker verbose
1310   Options[5] = CU_JIT_LOG_VERBOSE;
1311   OptionVals[5] = (void *)1;
1312
1313   memset(ErrorLog, 0, sizeof(ErrorLog));
1314
1315   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
1316   Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
1317                             strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
1318   if (Res != CUDA_SUCCESS) {
1319     fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
1320     exit(-1);
1321   }
1322
1323   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
1324   if (Res != CUDA_SUCCESS) {
1325     fprintf(stderr, "Complete ptx linker step failed.\n");
1326     fprintf(stderr, "\n%s\n", ErrorLog);
1327     exit(-1);
1328   }
1329
1330   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
1331               InfoLog);
1332
1333   Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
1334                                CuOut);
1335   if (Res != CUDA_SUCCESS) {
1336     fprintf(stderr, "Loading ptx assembly text failed.\n");
1337     exit(-1);
1338   }
1339
1340   Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
1341                                   ((CUDAKernel *)Function->Kernel)->CudaModule,
1342                                   KernelName);
1343   if (Res != CUDA_SUCCESS) {
1344     fprintf(stderr, "Loading kernel function failed.\n");
1345     exit(-1);
1346   }
1347
1348   CuLinkDestroyFcnPtr(LState);
1349
1350   ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
1351
1352   if (CacheMode) {
1353     if (KernelCache[NextCacheItem])
1354       freeKernelCUDA(KernelCache[NextCacheItem]);
1355
1356     KernelCache[NextCacheItem] = Function;
1357
1358     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
1359   }
1360
1361   return Function;
1362 }
1363
1364 static void synchronizeDeviceCUDA() {
1365   dump_function();
1366   if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
1367     fprintf(stderr, "Synchronizing device and host memory failed.\n");
1368     exit(-1);
1369   }
1370 }
1371
1372 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
1373                                      long MemSize) {
1374   dump_function();
1375
1376   CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
1377   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
1378 }
1379
1380 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
1381                                      long MemSize) {
1382   dump_function();
1383
1384   if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
1385                          MemSize) != CUDA_SUCCESS) {
1386     fprintf(stderr, "Copying results from device to host memory failed.\n");
1387     exit(-1);
1388   }
1389 }
1390
1391 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
1392                              unsigned int GridDimY, unsigned int BlockDimX,
1393                              unsigned int BlockDimY, unsigned int BlockDimZ,
1394                              void **Parameters) {
1395   dump_function();
1396
1397   unsigned GridDimZ = 1;
1398   unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
1399   CUstream Stream = 0;
1400   void **Extra = 0;
1401
1402   CUresult Res;
1403   Res =
1404       CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
1405                            GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
1406                            SharedMemBytes, Stream, Parameters, Extra);
1407   if (Res != CUDA_SUCCESS) {
1408     fprintf(stderr, "Launching CUDA kernel failed.\n");
1409     exit(-1);
1410   }
1411 }
1412
1413 // Maximum number of managed memory pointers.
1414 #define MAX_POINTERS 4000
1415 // For the rationale behing a list of free pointers, see `polly_freeManaged`.
1416 void *g_managedptrs[MAX_POINTERS];
1417 int g_nmanagedptrs = 0;
1418
1419 // Add a pointer as being allocated by cuMallocManaged
1420 void addManagedPtr(void *mem) {
1421   assert(g_nmanagedptrs < MAX_POINTERS && "We have hit the maximum number of "
1422                                           "managed pointers allowed. Increase "
1423                                           "MAX_POINTERS");
1424   g_managedptrs[g_nmanagedptrs++] = mem;
1425 }
1426
1427 int isManagedPtr(void *mem) {
1428   for (int i = 0; i < g_nmanagedptrs; i++) {
1429     if (g_managedptrs[i] == mem)
1430       return 1;
1431   }
1432   return 0;
1433 }
1434
1435 void polly_freeManaged(void *mem) {
1436   dump_function();
1437
1438   // In a real-world program this was used (COSMO), there were more `free`
1439   // calls in the original source than `malloc` calls. Hence, replacing all
1440   // `free`s with `cudaFree` does not work, since we would try to free
1441   // 'illegal' memory.
1442   // As a quick fix, we keep a free list and check if `mem` is a managed memory
1443   // pointer. If it is, we call `cudaFree`.
1444   // If not, we pass it along to the underlying allocator.
1445   // This is a hack, and can be removed if the underlying issue is fixed.
1446   if (isManagedPtr(mem)) {
1447     if (cudaFree(mem) != cudaSuccess) {
1448       fprintf(stderr, "cudaFree failed.\n");
1449       exit(-1);
1450     }
1451     return;
1452   } else {
1453     free(mem);
1454   }
1455 }
1456
1457 void *polly_mallocManaged(size_t size) {
1458   dump_function();
1459   void *a;
1460   if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) {
1461     fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
1462     exit(-1);
1463   }
1464   addManagedPtr(a);
1465   return a;
1466 }
1467
1468 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
1469   dump_function();
1470   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1471   CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
1472   free(DevPtr);
1473   free(Allocation);
1474 }
1475
1476 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
1477   dump_function();
1478
1479   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
1480   if (DevData == 0) {
1481     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1482     exit(-1);
1483   }
1484   DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
1485   if (DevData->DevicePtr == 0) {
1486     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1487     exit(-1);
1488   }
1489
1490   CUresult Res =
1491       CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
1492
1493   if (Res != CUDA_SUCCESS) {
1494     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
1495     exit(-1);
1496   }
1497
1498   return DevData;
1499 }
1500
1501 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
1502   dump_function();
1503
1504   CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
1505   return (void *)DevPtr->Cuda;
1506 }
1507
1508 static void freeContextCUDA(PollyGPUContext *Context) {
1509   dump_function();
1510
1511   CUDAContext *Ctx = (CUDAContext *)Context->Context;
1512   if (Ctx->Cuda) {
1513     CuProfilerStopFcnPtr();
1514     CuCtxDestroyFcnPtr(Ctx->Cuda);
1515     free(Ctx);
1516     free(Context);
1517   }
1518
1519   dlclose(HandleCuda);
1520   dlclose(HandleCudaRT);
1521 }
1522
1523 #endif /* HAS_LIBCUDART */
1524 /******************************************************************************/
1525 /*                                    API                                     */
1526 /******************************************************************************/
1527
1528 PollyGPUContext *polly_initContext() {
1529   DebugMode = getenv("POLLY_DEBUG") != 0;
1530   CacheMode = getenv("POLLY_NOCACHE") == 0;
1531
1532   dump_function();
1533
1534   PollyGPUContext *Context;
1535
1536   switch (Runtime) {
1537 #ifdef HAS_LIBCUDART
1538   case RUNTIME_CUDA:
1539     Context = initContextCUDA();
1540     break;
1541 #endif /* HAS_LIBCUDART */
1542 #ifdef HAS_LIBOPENCL
1543   case RUNTIME_CL:
1544     Context = initContextCL();
1545     break;
1546 #endif /* HAS_LIBOPENCL */
1547   default:
1548     err_runtime();
1549   }
1550
1551   return Context;
1552 }
1553
1554 void polly_freeKernel(PollyGPUFunction *Kernel) {
1555   dump_function();
1556
1557   switch (Runtime) {
1558 #ifdef HAS_LIBCUDART
1559   case RUNTIME_CUDA:
1560     freeKernelCUDA(Kernel);
1561     break;
1562 #endif /* HAS_LIBCUDART */
1563 #ifdef HAS_LIBOPENCL
1564   case RUNTIME_CL:
1565     freeKernelCL(Kernel);
1566     break;
1567 #endif /* HAS_LIBOPENCL */
1568   default:
1569     err_runtime();
1570   }
1571 }
1572
1573 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
1574                                   const char *KernelName) {
1575   dump_function();
1576
1577   PollyGPUFunction *Function;
1578
1579   switch (Runtime) {
1580 #ifdef HAS_LIBCUDART
1581   case RUNTIME_CUDA:
1582     Function = getKernelCUDA(BinaryBuffer, KernelName);
1583     break;
1584 #endif /* HAS_LIBCUDART */
1585 #ifdef HAS_LIBOPENCL
1586   case RUNTIME_CL:
1587     Function = getKernelCL(BinaryBuffer, KernelName);
1588     break;
1589 #endif /* HAS_LIBOPENCL */
1590   default:
1591     err_runtime();
1592   }
1593
1594   return Function;
1595 }
1596
1597 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
1598                                 long MemSize) {
1599   dump_function();
1600
1601   switch (Runtime) {
1602 #ifdef HAS_LIBCUDART
1603   case RUNTIME_CUDA:
1604     copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
1605     break;
1606 #endif /* HAS_LIBCUDART */
1607 #ifdef HAS_LIBOPENCL
1608   case RUNTIME_CL:
1609     copyFromHostToDeviceCL(HostData, DevData, MemSize);
1610     break;
1611 #endif /* HAS_LIBOPENCL */
1612   default:
1613     err_runtime();
1614   }
1615 }
1616
1617 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
1618                                 long MemSize) {
1619   dump_function();
1620
1621   switch (Runtime) {
1622 #ifdef HAS_LIBCUDART
1623   case RUNTIME_CUDA:
1624     copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
1625     break;
1626 #endif /* HAS_LIBCUDART */
1627 #ifdef HAS_LIBOPENCL
1628   case RUNTIME_CL:
1629     copyFromDeviceToHostCL(DevData, HostData, MemSize);
1630     break;
1631 #endif /* HAS_LIBOPENCL */
1632   default:
1633     err_runtime();
1634   }
1635 }
1636
1637 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
1638                         unsigned int GridDimY, unsigned int BlockDimX,
1639                         unsigned int BlockDimY, unsigned int BlockDimZ,
1640                         void **Parameters) {
1641   dump_function();
1642
1643   switch (Runtime) {
1644 #ifdef HAS_LIBCUDART
1645   case RUNTIME_CUDA:
1646     launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1647                      BlockDimZ, Parameters);
1648     break;
1649 #endif /* HAS_LIBCUDART */
1650 #ifdef HAS_LIBOPENCL
1651   case RUNTIME_CL:
1652     launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
1653                    Parameters);
1654     break;
1655 #endif /* HAS_LIBOPENCL */
1656   default:
1657     err_runtime();
1658   }
1659 }
1660
1661 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
1662   dump_function();
1663
1664   switch (Runtime) {
1665 #ifdef HAS_LIBCUDART
1666   case RUNTIME_CUDA:
1667     freeDeviceMemoryCUDA(Allocation);
1668     break;
1669 #endif /* HAS_LIBCUDART */
1670 #ifdef HAS_LIBOPENCL
1671   case RUNTIME_CL:
1672     freeDeviceMemoryCL(Allocation);
1673     break;
1674 #endif /* HAS_LIBOPENCL */
1675   default:
1676     err_runtime();
1677   }
1678 }
1679
1680 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
1681   dump_function();
1682
1683   PollyGPUDevicePtr *DevData;
1684
1685   switch (Runtime) {
1686 #ifdef HAS_LIBCUDART
1687   case RUNTIME_CUDA:
1688     DevData = allocateMemoryForDeviceCUDA(MemSize);
1689     break;
1690 #endif /* HAS_LIBCUDART */
1691 #ifdef HAS_LIBOPENCL
1692   case RUNTIME_CL:
1693     DevData = allocateMemoryForDeviceCL(MemSize);
1694     break;
1695 #endif /* HAS_LIBOPENCL */
1696   default:
1697     err_runtime();
1698   }
1699
1700   return DevData;
1701 }
1702
1703 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
1704   dump_function();
1705
1706   void *DevPtr;
1707
1708   switch (Runtime) {
1709 #ifdef HAS_LIBCUDART
1710   case RUNTIME_CUDA:
1711     DevPtr = getDevicePtrCUDA(Allocation);
1712     break;
1713 #endif /* HAS_LIBCUDART */
1714 #ifdef HAS_LIBOPENCL
1715   case RUNTIME_CL:
1716     DevPtr = getDevicePtrCL(Allocation);
1717     break;
1718 #endif /* HAS_LIBOPENCL */
1719   default:
1720     err_runtime();
1721   }
1722
1723   return DevPtr;
1724 }
1725
1726 void polly_synchronizeDevice() {
1727   dump_function();
1728
1729   switch (Runtime) {
1730 #ifdef HAS_LIBCUDART
1731   case RUNTIME_CUDA:
1732     synchronizeDeviceCUDA();
1733     break;
1734 #endif /* HAS_LIBCUDART */
1735 #ifdef HAS_LIBOPENCL
1736   case RUNTIME_CL:
1737     synchronizeDeviceCL();
1738     break;
1739 #endif /* HAS_LIBOPENCL */
1740   default:
1741     err_runtime();
1742   }
1743 }
1744
1745 void polly_freeContext(PollyGPUContext *Context) {
1746   dump_function();
1747
1748   if (CacheMode)
1749     return;
1750
1751   switch (Runtime) {
1752 #ifdef HAS_LIBCUDART
1753   case RUNTIME_CUDA:
1754     freeContextCUDA(Context);
1755     break;
1756 #endif /* HAS_LIBCUDART */
1757 #ifdef HAS_LIBOPENCL
1758   case RUNTIME_CL:
1759     freeContextCL(Context);
1760     break;
1761 #endif /* HAS_LIBOPENCL */
1762   default:
1763     err_runtime();
1764   }
1765 }
1766
1767 /* Initialize GPUJIT with CUDA as runtime library. */
1768 PollyGPUContext *polly_initContextCUDA() {
1769 #ifdef HAS_LIBCUDART
1770   Runtime = RUNTIME_CUDA;
1771   return polly_initContext();
1772 #else
1773   fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
1774   exit(-1);
1775 #endif /* HAS_LIBCUDART */
1776 }
1777
1778 /* Initialize GPUJIT with OpenCL as runtime library. */
1779 PollyGPUContext *polly_initContextCL() {
1780 #ifdef HAS_LIBOPENCL
1781   Runtime = RUNTIME_CL;
1782   return polly_initContext();
1783 #else
1784   fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
1785   exit(-1);
1786 #endif /* HAS_LIBOPENCL */
1787 }