opencl.c

   1 /*
   2  * Copyright 2013      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege and Riyadh Baghdadi,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <ctype.h>
  11 #include <limits.h>
  12 #include <string.h>
  13
  14 #include <isl/aff.h>
  15 #include <isl/ast.h>
  16
  17 #include "opencl.h"
  18 #include "gpu_print.h"
  19 #include "gpu.h"
  20 #include "ppcg.h"
  21 #include "print.h"
  22 #include "schedule.h"
  23
  24 #define min(a, b)  (((a) < (b)) ? (a) : (b))
  25 #define max(a, b)  (((a) > (b)) ? (a) : (b))
  26
  27 /* options are the global options passed to generate_opencl.
  28  * input is the name of the input file.
  29  * output is the user-specified output file name and may be NULL
  30  *      if not specified by the user.
  31  * kernel_c_name is the name of the kernel_c file.
  32  * host_c is the generated source file for the host code.  kernel_c is
  33  * the generated source file for the kernel.  kernel_h is the generated
  34  * header file for the kernel.
  35  */
  36 struct opencl_info {
  37         struct ppcg_options *options;
  38         const char *input;
  39         const char *output;
  40         char kernel_c_name[PATH_MAX];
  41
  42         FILE *host_c;
  43         FILE *kernel_c;
  44         FILE *kernel_h;
  45 };
  46
  47 /* Open the file called "name" for writing or print an error message.
  48  */
  49 static FILE *open_or_croak(const char *name)
  50 {
  51         FILE *file;
  52
  53         file = fopen(name, "w");
  54         if (!file)
  55                 fprintf(stderr, "Failed to open \"%s\" for writing\n", name);
  56         return file;
  57 }
  58
  59 /* Open the host .c file and the kernel .h and .cl files for writing.
  60  * Their names are derived from info->output (or info->input if
  61  * the user did not specify an output file name).
  62  * Add the necessary includes to these files.
  63  *
  64  * Return 0 on success and -1 on failure.
  65  */
  66 static int opencl_open_files(struct opencl_info *info)
  67 {
  68         char name[PATH_MAX];
  69         int len;
  70
  71         if (info->output) {
  72                 const char *ext;
  73
  74                 ext = strrchr(info->output, '.');
  75                 len = ext ? ext - info->output : strlen(info->output);
  76                 memcpy(name, info->output, len);
  77
  78                 info->host_c = open_or_croak(info->output);
  79         } else {
  80                 len = ppcg_extract_base_name(name, info->input);
  81
  82                 strcpy(name + len, "_host.c");
  83                 info->host_c = open_or_croak(name);
  84         }
  85
  86         memcpy(info->kernel_c_name, name, len);
  87         strcpy(info->kernel_c_name + len, "_kernel.cl");
  88         info->kernel_c = open_or_croak(info->kernel_c_name);
  89
  90         strcpy(name + len, "_kernel.h");
  91         info->kernel_h = open_or_croak(name);
  92
  93         if (!info->host_c || !info->kernel_c || !info->host_c)
  94                 return -1;
  95
  96         fprintf(info->host_c, "#include <assert.h>\n");
  97         fprintf(info->host_c, "#include <stdio.h>\n");
  98         fprintf(info->host_c, "#include \"%s\"\n\n", ppcg_base_name(name));
  99         fprintf(info->kernel_h, "#if defined(__APPLE__)\n");
 100         fprintf(info->kernel_h, "#include <OpenCL/opencl.h>\n");
 101         fprintf(info->kernel_h, "#else\n");
 102         fprintf(info->kernel_h, "#include <CL/opencl.h>\n");
 103         fprintf(info->kernel_h, "#endif\n\n");
 104         fprintf(info->kernel_h, "cl_device_id opencl_create_device("
 105                                 "int use_gpu);\n");
 106         fprintf(info->kernel_h, "cl_program opencl_build_program("
 107                                 "cl_context ctx, "
 108                                 "cl_device_id dev, const char *filename, "
 109                                 "const char *opencl_options);\n");
 110         fprintf(info->kernel_h,
 111                 "const char *opencl_error_string(cl_int error);\n");
 112
 113         return 0;
 114 }
 115
 116 /* Close all output files.
 117  */
 118 static void opencl_close_files(struct opencl_info *info)
 119 {
 120         if (info->kernel_c)
 121                 fclose(info->kernel_c);
 122         if (info->kernel_h)
 123                 fclose(info->kernel_h);
 124         if (info->host_c)
 125                 fclose(info->host_c);
 126 }
 127
 128 static __isl_give isl_printer *print_opencl_macros(__isl_take isl_printer *p)
 129 {
 130         const char *macros =
 131                 "#define openclCheckReturn(ret) \\\n"
 132                 "  if (ret != CL_SUCCESS) {\\\n"
 133                 "    fprintf(stderr, \"OpenCL error: %s\\n\", "
 134                 " opencl_error_string(ret)); \\\n"
 135                 "    fflush(stderr); \\\n"
 136                 "    assert(ret == CL_SUCCESS);\\\n  }\n";
 137
 138         p = isl_printer_start_line(p);
 139         p = isl_printer_print_str(p, macros);
 140         p = isl_printer_end_line(p);
 141
 142         return p;
 143 }
 144
 145 static __isl_give isl_printer *opencl_declare_device_arrays(
 146         __isl_take isl_printer *p, struct gpu_prog *prog)
 147 {
 148         int i;
 149
 150         for (i = 0; i < prog->n_array; ++i) {
 151                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 152                         continue;
 153                 p = isl_printer_start_line(p);
 154                 p = isl_printer_print_str(p, "cl_mem dev_");
 155                 p = isl_printer_print_str(p, prog->array[i].name);
 156                 p = isl_printer_print_str(p, ";");
 157                 p = isl_printer_end_line(p);
 158         }
 159         p = isl_printer_start_line(p);
 160         p = isl_printer_end_line(p);
 161         return p;
 162 }
 163
 164 /* Internal data structure for allocate_device_array.
 165  *
 166  * array is the array that needs to be allocated.
 167  * copy is set if the contents of this array need to be copied to the device.
 168  */
 169 struct opencl_allocate_device_array_data {
 170         struct gpu_array_info *array;
 171         int copy;
 172 };
 173
 174 /* Allocate a device array for data->array and copy the contents to the device
 175  * if data->copy is set.
 176  */
 177 static __isl_give isl_printer *allocate_device_array(__isl_take isl_printer *p,
 178         void *user)
 179 {
 180         struct opencl_allocate_device_array_data *data = user;
 181         struct gpu_array_info *array = data->array;
 182
 183         p = ppcg_start_block(p);
 184
 185         p = isl_printer_start_line(p);
 186         p = isl_printer_print_str(p, "dev_");
 187         p = isl_printer_print_str(p, array->name);
 188         p = isl_printer_print_str(p, " = clCreateBuffer(context, ");
 189         p = isl_printer_print_str(p, "CL_MEM_READ_WRITE");
 190
 191         if (!data->copy)
 192                 p = isl_printer_print_str(p, ", ");
 193         else
 194                 p = isl_printer_print_str(p, " | CL_MEM_COPY_HOST_PTR, ");
 195
 196         p = gpu_array_info_print_size(p, array);
 197
 198         if (!data->copy)
 199                 p = isl_printer_print_str(p, ", NULL");
 200         else if (gpu_array_is_scalar(array)) {
 201                 p = isl_printer_print_str(p, ", &");
 202                 p = isl_printer_print_str(p, array->name);
 203         } else {
 204                 p = isl_printer_print_str(p, ", ");
 205                 p = isl_printer_print_str(p, array->name);
 206         }
 207
 208         p = isl_printer_print_str(p, ", &err);");
 209         p = isl_printer_end_line(p);
 210         p = isl_printer_start_line(p);
 211         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 212         p = isl_printer_end_line(p);
 213
 214         p = ppcg_end_block(p);
 215
 216         return p;
 217 }
 218
 219 /* Allocate device arrays and copy the contents of copy_in arrays into device.
 220  *
 221  * Only perform the allocation for arrays with strictly positive size.
 222  */
 223 static __isl_give isl_printer *opencl_allocate_device_arrays(
 224         __isl_take isl_printer *p, struct gpu_prog *prog)
 225 {
 226         int i, j;
 227
 228         for (i = 0; i < prog->n_array; ++i) {
 229                 struct opencl_allocate_device_array_data data;
 230                 struct gpu_array_info *array = &prog->array[i];
 231                 isl_space *space;
 232                 isl_set *read_i;
 233                 isl_set *guard;
 234                 int empty;
 235
 236                 if (gpu_array_is_read_only_scalar(array))
 237                         continue;
 238
 239                 space = isl_space_copy(array->space);
 240                 read_i = isl_union_set_extract_set(prog->copy_in, space);
 241                 empty = isl_set_fast_is_empty(read_i);
 242                 isl_set_free(read_i);
 243
 244                 guard = gpu_array_positive_size_guard(array);
 245                 data.array = array;
 246                 data.copy = !empty;
 247                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
 248                                             &allocate_device_array, &data);
 249         }
 250         p = isl_printer_start_line(p);
 251         p = isl_printer_end_line(p);
 252         return p;
 253 }
 254
 255 /* Print a call to the OpenCL clSetKernelArg() function which sets
 256  * the arguments of the kernel.  arg_name and arg_index are the name and the
 257  * index of the kernel argument.  The index of the leftmost argument of
 258  * the kernel is 0 whereas the index of the rightmost argument of the kernel
 259  * is n - 1, where n is the total number of the kernel arguments.
 260  * read_only_scalar is a boolean that indicates whether the argument is a read
 261  * only scalar.
 262  */
 263 static __isl_give isl_printer *opencl_set_kernel_argument(
 264         __isl_take isl_printer *p, int kernel_id,
 265         const char *arg_name, int arg_index, int read_only_scalar)
 266 {
 267         p = isl_printer_start_line(p);
 268         p = isl_printer_print_str(p,
 269                 "openclCheckReturn(clSetKernelArg(kernel");
 270         p = isl_printer_print_int(p, kernel_id);
 271         p = isl_printer_print_str(p, ", ");
 272         p = isl_printer_print_int(p, arg_index);
 273         p = isl_printer_print_str(p, ", sizeof(");
 274
 275         if (read_only_scalar) {
 276                 p = isl_printer_print_str(p, arg_name);
 277                 p = isl_printer_print_str(p, "), &");
 278         } else
 279                 p = isl_printer_print_str(p, "cl_mem), (void *) &dev_");
 280
 281         p = isl_printer_print_str(p, arg_name);
 282         p = isl_printer_print_str(p, "));");
 283         p = isl_printer_end_line(p);
 284
 285         return p;
 286 }
 287
 288 /* Print the block sizes as a list of the sizes in each
 289  * dimension.
 290  */
 291 static __isl_give isl_printer *opencl_print_block_sizes(
 292         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 293 {
 294         int i;
 295
 296         if (kernel->n_block > 0)
 297                 for (i = 0; i < kernel->n_block; ++i) {
 298                         if (i)
 299                                 p = isl_printer_print_str(p, ", ");
 300                         p = isl_printer_print_int(p, kernel->block_dim[i]);
 301                 }
 302         else
 303                 p = isl_printer_print_str(p, "1");
 304
 305         return p;
 306 }
 307
 308 /* Set the arguments of the OpenCL kernel by printing a call to the OpenCL
 309  * clSetKernelArg() function for each kernel argument.
 310  */
 311 static __isl_give isl_printer *opencl_set_kernel_arguments(
 312         __isl_take isl_printer *p, struct gpu_prog *prog,
 313         struct ppcg_kernel *kernel)
 314 {
 315         int i, n, ro;
 316         unsigned nparam;
 317         isl_space *space;
 318         const char *type;
 319         int arg_index = 0;
 320
 321         for (i = 0; i < prog->n_array; ++i) {
 322                 isl_set *arr;
 323                 int empty;
 324
 325                 space = isl_space_copy(prog->array[i].space);
 326                 arr = isl_union_set_extract_set(kernel->arrays, space);
 327                 empty = isl_set_fast_is_empty(arr);
 328                 isl_set_free(arr);
 329                 if (empty)
 330                         continue;
 331                 ro = gpu_array_is_read_only_scalar(&prog->array[i]);
 332                 opencl_set_kernel_argument(p, kernel->id, prog->array[i].name,
 333                         arg_index, ro);
 334                 arg_index++;
 335         }
 336
 337         space = isl_union_set_get_space(kernel->arrays);
 338         nparam = isl_space_dim(space, isl_dim_param);
 339         for (i = 0; i < nparam; ++i) {
 340                 const char *name;
 341
 342                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 343                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 344                 arg_index++;
 345         }
 346         isl_space_free(space);
 347
 348         n = isl_space_dim(kernel->space, isl_dim_set);
 349         for (i = 0; i < n; ++i) {
 350                 const char *name;
 351                 isl_id *id;
 352
 353                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 354                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 355                 arg_index++;
 356         }
 357
 358         return p;
 359 }
 360
 361 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 362  * then print a declaration (including the types of the arguments).
 363  *
 364  * The arguments are printed in the following order
 365  * - the arrays accessed by the kernel
 366  * - the parameters
 367  * - the host loop iterators
 368  */
 369 static __isl_give isl_printer *opencl_print_kernel_arguments(
 370         __isl_take isl_printer *p, struct gpu_prog *prog,
 371         struct ppcg_kernel *kernel, int types)
 372 {
 373         int i, n;
 374         int first = 1;
 375         unsigned nparam;
 376         isl_space *space;
 377         const char *type;
 378
 379         for (i = 0; i < prog->n_array; ++i) {
 380                 isl_set *arr;
 381                 int empty;
 382
 383                 space = isl_space_copy(prog->array[i].space);
 384                 arr = isl_union_set_extract_set(kernel->arrays, space);
 385                 empty = isl_set_fast_is_empty(arr);
 386                 isl_set_free(arr);
 387                 if (empty)
 388                         continue;
 389
 390                 if (!first)
 391                         p = isl_printer_print_str(p, ", ");
 392
 393                 if (types)
 394                         p = gpu_array_info_print_declaration_argument(p,
 395                                 &prog->array[i], "__global");
 396                 else
 397                         p = gpu_array_info_print_call_argument(p,
 398                                 &prog->array[i]);
 399
 400                 first = 0;
 401         }
 402
 403         space = isl_union_set_get_space(kernel->arrays);
 404         nparam = isl_space_dim(space, isl_dim_param);
 405         for (i = 0; i < nparam; ++i) {
 406                 const char *name;
 407
 408                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 409
 410                 if (!first)
 411                         p = isl_printer_print_str(p, ", ");
 412                 if (types)
 413                         p = isl_printer_print_str(p, "int ");
 414                 p = isl_printer_print_str(p, name);
 415
 416                 first = 0;
 417         }
 418         isl_space_free(space);
 419
 420         n = isl_space_dim(kernel->space, isl_dim_set);
 421         type = isl_options_get_ast_iterator_type(prog->ctx);
 422         for (i = 0; i < n; ++i) {
 423                 const char *name;
 424                 isl_id *id;
 425
 426                 if (!first)
 427                         p = isl_printer_print_str(p, ", ");
 428                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 429                 if (types) {
 430                         p = isl_printer_print_str(p, type);
 431                         p = isl_printer_print_str(p, " ");
 432                 }
 433                 p = isl_printer_print_str(p, name);
 434
 435                 first = 0;
 436         }
 437
 438         return p;
 439 }
 440
 441 /* Print the header of the given kernel.
 442  */
 443 static __isl_give isl_printer *opencl_print_kernel_header(
 444         __isl_take isl_printer *p, struct gpu_prog *prog,
 445         struct ppcg_kernel *kernel)
 446 {
 447         p = isl_printer_start_line(p);
 448         p = isl_printer_print_str(p, "__kernel void kernel");
 449         p = isl_printer_print_int(p, kernel->id);
 450         p = isl_printer_print_str(p, "(");
 451         p = opencl_print_kernel_arguments(p, prog, kernel, 1);
 452         p = isl_printer_print_str(p, ")");
 453         p = isl_printer_end_line(p);
 454
 455         return p;
 456 }
 457
 458 /* Unlike the equivalent function in the CUDA backend which prints iterators
 459  * in reverse order to promote coalescing, this function does not print
 460  * iterators in reverse order.  The OpenCL backend currently does not take
 461  * into account any coalescing considerations.
 462  */
 463 static __isl_give isl_printer *opencl_print_kernel_iterators(
 464         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 465 {
 466         int i, n_grid;
 467         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 468         const char *type;
 469
 470         type = isl_options_get_ast_iterator_type(ctx);
 471
 472         n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 473         if (n_grid > 0) {
 474                 p = isl_printer_start_line(p);
 475                 p = isl_printer_print_str(p, type);
 476                 p = isl_printer_print_str(p, " ");
 477                 for (i = 0; i < n_grid; ++i) {
 478                         if (i)
 479                                 p = isl_printer_print_str(p, ", ");
 480                         p = isl_printer_print_str(p, "b");
 481                         p = isl_printer_print_int(p, i);
 482                         p = isl_printer_print_str(p, " = get_group_id(");
 483                         p = isl_printer_print_int(p, i);
 484                         p = isl_printer_print_str(p, ")");
 485                 }
 486                 p = isl_printer_print_str(p, ";");
 487                 p = isl_printer_end_line(p);
 488         }
 489
 490         if (kernel->n_block > 0) {
 491                 p = isl_printer_start_line(p);
 492                 p = isl_printer_print_str(p, type);
 493                 p = isl_printer_print_str(p, " ");
 494                 for (i = 0; i < kernel->n_block; ++i) {
 495                         if (i)
 496                                 p = isl_printer_print_str(p, ", ");
 497                         p = isl_printer_print_str(p, "t");
 498                         p = isl_printer_print_int(p, i);
 499                         p = isl_printer_print_str(p, " = get_local_id(");
 500                         p = isl_printer_print_int(p, i);
 501                         p = isl_printer_print_str(p, ")");
 502                 }
 503                 p = isl_printer_print_str(p, ";");
 504                 p = isl_printer_end_line(p);
 505         }
 506
 507         return p;
 508 }
 509
 510 static __isl_give isl_printer *opencl_print_kernel_var(
 511         __isl_take isl_printer *p, struct ppcg_kernel_var *var)
 512 {
 513         int j;
 514         isl_val *v;
 515
 516         p = isl_printer_start_line(p);
 517         if (var->type == ppcg_access_shared)
 518                 p = isl_printer_print_str(p, "__local ");
 519         p = isl_printer_print_str(p, var->array->type);
 520         p = isl_printer_print_str(p, " ");
 521         p = isl_printer_print_str(p, var->name);
 522         for (j = 0; j < var->array->n_index; ++j) {
 523                 p = isl_printer_print_str(p, "[");
 524                 v = isl_vec_get_element_val(var->size, j);
 525                 p = isl_printer_print_val(p, v);
 526                 p = isl_printer_print_str(p, "]");
 527                 isl_val_free(v);
 528         }
 529         p = isl_printer_print_str(p, ";");
 530         p = isl_printer_end_line(p);
 531
 532         return p;
 533 }
 534
 535 static __isl_give isl_printer *opencl_print_kernel_vars(
 536                 __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 537 {
 538         int i;
 539
 540         for (i = 0; i < kernel->n_var; ++i)
 541                 p = opencl_print_kernel_var(p, &kernel->var[i]);
 542
 543         return p;
 544 }
 545
 546 /* Print a call to barrier() which is a sync statement.
 547  * All work-items in a work-group executing the kernel on a processor must
 548  * execute the barrier() function before any are allowed to continue execution
 549  * beyond the barrier.
 550  * The flag CLK_LOCAL_MEM_FENCE makes the barrier function either flush any
 551  * variables stored in local memory or queue a memory fence to ensure correct
 552  * ordering of memory operations to local memory.
 553  * The flag CLK_GLOBAL_MEM_FENCE makes the barrier function queue a memory
 554  * fence to ensure correct ordering of memory operations to global memory.
 555  */
 556 static __isl_give isl_printer *opencl_print_sync(__isl_take isl_printer *p,
 557         struct ppcg_kernel_stmt *stmt)
 558 {
 559         p = isl_printer_start_line(p);
 560         p = isl_printer_print_str(p,
 561                 "barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
 562         p = isl_printer_end_line(p);
 563
 564         return p;
 565 }
 566
 567 /* This function is called for each user statement in the AST,
 568  * i.e., for each kernel body statement, copy statement or sync statement.
 569  */
 570 static __isl_give isl_printer *opencl_print_kernel_stmt(
 571         __isl_take isl_printer *p,
 572         __isl_take isl_ast_print_options *print_options,
 573         __isl_keep isl_ast_node *node, void *user)
 574 {
 575         isl_id *id;
 576         struct ppcg_kernel_stmt *stmt;
 577
 578         id = isl_ast_node_get_annotation(node);
 579         stmt = isl_id_get_user(id);
 580         isl_id_free(id);
 581
 582         isl_ast_print_options_free(print_options);
 583
 584         switch (stmt->type) {
 585         case ppcg_kernel_copy:
 586                 return ppcg_kernel_print_copy(p, stmt);
 587         case ppcg_kernel_sync:
 588                 return opencl_print_sync(p, stmt);
 589         case ppcg_kernel_domain:
 590                 return ppcg_kernel_print_domain(p, stmt);
 591         }
 592
 593         return p;
 594 }
 595
 596 /* Return true if there is a double array in prog->array or
 597  * if any of the types in prog->scop involve any doubles.
 598  * To check the latter condition, we simply search for the string "double"
 599  * in the type definitions, which may result in false positives.
 600  */
 601 static __isl_give int any_double_elements(struct gpu_prog *prog)
 602 {
 603         int i;
 604
 605         for (i = 0; i < prog->n_array; ++i)
 606                 if (strcmp(prog->array[i].type, "double") == 0)
 607                         return 1;
 608
 609         for (i = 0; i < prog->scop->n_type; ++i) {
 610                 struct pet_type *type = prog->scop->types[i];
 611
 612                 if (strstr(type->definition, "double"))
 613                         return 1;
 614         }
 615
 616         return 0;
 617 }
 618
 619 /* Prints a #pragma to enable support for double floating-point
 620  * precision.  OpenCL 1.0 adds support for double precision floating-point as
 621  * an optional extension. An application that wants to use double will need to
 622  * include the #pragma OPENCL EXTENSION cl_khr_fp64 : enable directive before
 623  * any double precision data type is declared in the kernel code.
 624  */
 625 static __isl_give isl_printer *opencl_enable_double_support(
 626         __isl_take isl_printer *p)
 627 {
 628         int i;
 629
 630         p = isl_printer_start_line(p);
 631         p = isl_printer_print_str(p, "#pragma OPENCL EXTENSION cl_khr_fp64 :"
 632                 " enable");
 633         p = isl_printer_end_line(p);
 634         p = isl_printer_start_line(p);
 635         p = isl_printer_end_line(p);
 636
 637         return p;
 638 }
 639
 640 static void opencl_print_kernel(struct gpu_prog *prog,
 641         struct ppcg_kernel *kernel, struct opencl_info *opencl)
 642 {
 643         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 644         isl_ast_print_options *print_options;
 645         isl_printer *p;
 646
 647         p = isl_printer_to_file(ctx, opencl->kernel_c);
 648         print_options = isl_ast_print_options_alloc(ctx);
 649         print_options = isl_ast_print_options_set_print_user(print_options,
 650                                 &opencl_print_kernel_stmt, NULL);
 651
 652         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 653         p = opencl_print_kernel_header(p, prog, kernel);
 654         p = isl_printer_print_str(p, "{");
 655         p = isl_printer_end_line(p);
 656         p = isl_printer_indent(p, 4);
 657         p = opencl_print_kernel_iterators(p, kernel);
 658         p = opencl_print_kernel_vars(p, kernel);
 659         p = isl_printer_end_line(p);
 660         p = gpu_print_macros(p, kernel->tree);
 661         p = isl_ast_node_print(kernel->tree, p, print_options);
 662         p = isl_printer_print_str(p, "}");
 663         p = isl_printer_end_line(p);
 664         isl_printer_free(p);
 665 }
 666
 667 struct print_host_user_data_opencl {
 668         struct opencl_info *opencl;
 669         struct gpu_prog *prog;
 670 };
 671
 672 /* This function prints the i'th block size multiplied by the i'th grid size,
 673  * where i (a parameter to this function) is one of the possible dimensions of
 674  * grid sizes and block sizes.
 675  * If the dimension of block sizes is not equal to the dimension of grid sizes
 676  * the output is calculated as follows:
 677  *
 678  * Suppose that:
 679  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 680  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 681  *
 682  * The output is:
 683  * If (i > dim2) then the output is block_sizes[i]
 684  * If (i > dim1) then the output is grid_sizes[i]
 685  */
 686 static __isl_give isl_printer *opencl_print_total_number_of_work_items_for_dim(
 687         __isl_take isl_printer *p, struct ppcg_kernel *kernel, int i)
 688 {
 689         int grid_dim, block_dim;
 690
 691         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 692         block_dim = kernel->n_block;
 693
 694         isl_pw_aff *bound_grid;
 695
 696         if (i < min(grid_dim, block_dim)) {
 697                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 698                 p = isl_printer_print_str(p, "(");
 699                 p = isl_printer_print_pw_aff(p, bound_grid);
 700                 p = isl_printer_print_str(p, ") * ");
 701                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 702                 isl_pw_aff_free(bound_grid);
 703         } else if (i >= grid_dim)
 704                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 705         else {
 706                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 707                 p = isl_printer_print_pw_aff(p, bound_grid);
 708                 isl_pw_aff_free(bound_grid);
 709         }
 710
 711         return p;
 712 }
 713
 714 /* Print a list that represents the total number of work items.  The list is
 715  * constructed by performing an element-wise multiplication of the block sizes
 716  * and the grid sizes.  To explain how the list is constructed, suppose that:
 717  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 718  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 719  *
 720  * The output of this function is constructed as follows:
 721  * If (dim1 > dim2) then the output is the following list:
 722  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim2-1]*block_sizes[dim2-1],
 723  * block_sizes[dim2], ..., block_sizes[dim1-2], block_sizes[dim1-1].
 724  *
 725  * If (dim2 > dim1) then the output is the following list:
 726  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim1-1] * block_sizes[dim1-1],
 727  * grid_sizes[dim1], grid_sizes[dim2-2], grid_sizes[dim2-1].
 728  *
 729  * To calculate the total number of work items out of the list constructed by
 730  * this function, the user should multiply the elements of the list.
 731  */
 732 static __isl_give isl_printer *opencl_print_total_number_of_work_items_as_list(
 733         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 734 {
 735         int i;
 736         int grid_dim, block_dim;
 737
 738         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 739         block_dim = kernel->n_block;
 740
 741         if ((grid_dim <= 0) || (block_dim <= 0)) {
 742                 p = isl_printer_print_str(p, "1");
 743                 return p;
 744         }
 745
 746         for (i = 0; i <= max(grid_dim, block_dim) - 1; i++) {
 747                 if (i > 0)
 748                         p = isl_printer_print_str(p, ", ");
 749
 750                 p = opencl_print_total_number_of_work_items_for_dim(p,
 751                         kernel, i);
 752         }
 753
 754         return p;
 755 }
 756
 757 /* Print the user statement of the host code to "p".
 758  *
 759  * In particular, print a block of statements that defines the grid
 760  * and the work group and then launches the kernel.
 761  *
 762  * A grid is composed of many work groups (blocks), each work group holds
 763  * many work-items (threads).
 764  *
 765  * global_work_size[kernel->n_block] represents the total number of work
 766  * items.  It points to an array of kernel->n_block unsigned
 767  * values that describe the total number of work-items that will execute
 768  * the kernel.  The total number of work-items is computed as:
 769  * global_work_size[0] *...* global_work_size[kernel->n_block - 1].
 770  *
 771  * The size of each work group (i.e. the number of work-items in each work
 772  * group) is described using block_size[kernel->n_block].  The total
 773  * number of work-items in a block (work-group) is computed as:
 774  * block_size[0] *... * block_size[kernel->n_block - 1].
 775  *
 776  * For more information check:
 777  * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 778  */
 779 static __isl_give isl_printer *opencl_print_host_user(
 780         __isl_take isl_printer *p,
 781         __isl_take isl_ast_print_options *print_options,
 782         __isl_keep isl_ast_node *node, void *user)
 783 {
 784         isl_id *id;
 785         struct ppcg_kernel *kernel;
 786         struct print_host_user_data_opencl *data;
 787         int i;
 788
 789         id = isl_ast_node_get_annotation(node);
 790         kernel = isl_id_get_user(id);
 791         isl_id_free(id);
 792
 793         data = (struct print_host_user_data_opencl *) user;
 794
 795         p = isl_printer_start_line(p);
 796         p = isl_printer_print_str(p, "{");
 797         p = isl_printer_end_line(p);
 798         p = isl_printer_indent(p, 2);
 799
 800         p = isl_printer_start_line(p);
 801         p = isl_printer_print_str(p, "size_t global_work_size[");
 802
 803         if (kernel->n_block > 0)
 804                 p = isl_printer_print_int(p, kernel->n_block);
 805         else
 806                 p = isl_printer_print_int(p, 1);
 807
 808         p = isl_printer_print_str(p, "] = {");
 809         p = opencl_print_total_number_of_work_items_as_list(p, kernel);
 810         p = isl_printer_print_str(p, "};");
 811         p = isl_printer_end_line(p);
 812
 813         p = isl_printer_start_line(p);
 814         p = isl_printer_print_str(p, "size_t block_size[");
 815
 816         if (kernel->n_block > 0)
 817                 p = isl_printer_print_int(p, kernel->n_block);
 818         else
 819                 p = isl_printer_print_int(p, 1);
 820
 821         p = isl_printer_print_str(p, "] = {");
 822         p = opencl_print_block_sizes(p, kernel);
 823         p = isl_printer_print_str(p, "};");
 824         p = isl_printer_end_line(p);
 825
 826         p = isl_printer_start_line(p);
 827         p = isl_printer_print_str(p, "cl_kernel kernel");
 828         p = isl_printer_print_int(p, kernel->id);
 829         p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel");
 830         p = isl_printer_print_int(p, kernel->id);
 831         p = isl_printer_print_str(p, "\", &err);");
 832         p = isl_printer_end_line(p);
 833         p = isl_printer_start_line(p);
 834         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 835         p = isl_printer_end_line(p);
 836
 837         opencl_set_kernel_arguments(p, data->prog, kernel);
 838
 839         p = isl_printer_start_line(p);
 840         p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel"
 841                 "(queue, kernel");
 842         p = isl_printer_print_int(p, kernel->id);
 843         p = isl_printer_print_str(p, ", ");
 844         if (kernel->n_block > 0)
 845                 p = isl_printer_print_int(p, kernel->n_block);
 846         else
 847                 p = isl_printer_print_int(p, 1);
 848
 849         p = isl_printer_print_str(p, ", NULL, global_work_size,"
 850                                         "block_size,"
 851                                         "0, NULL, NULL));");
 852         p = isl_printer_end_line(p);
 853         p = isl_printer_start_line(p);
 854         p = isl_printer_print_str(p, "openclCheckReturn("
 855                                         "clReleaseKernel(kernel");
 856         p = isl_printer_print_int(p, kernel->id);
 857         p = isl_printer_print_str(p, "));");
 858         p = isl_printer_end_line(p);
 859         p = isl_printer_start_line(p);
 860         p = isl_printer_print_str(p, "clFinish(queue);");
 861         p = isl_printer_end_line(p);
 862         p = isl_printer_indent(p, -2);
 863         p = isl_printer_start_line(p);
 864         p = isl_printer_print_str(p, "}");
 865         p = isl_printer_end_line(p);
 866
 867         p = isl_printer_start_line(p);
 868         p = isl_printer_end_line(p);
 869
 870         opencl_print_kernel(data->prog, kernel, data->opencl);
 871
 872         isl_ast_print_options_free(print_options);
 873
 874         return p;
 875 }
 876
 877 static __isl_give isl_printer *opencl_print_host_code(
 878         __isl_take isl_printer *p, struct gpu_prog *prog,
 879         __isl_keep isl_ast_node *tree, struct opencl_info *opencl)
 880 {
 881         isl_ast_print_options *print_options;
 882         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 883         struct print_host_user_data_opencl data = { opencl, prog };
 884
 885         print_options = isl_ast_print_options_alloc(ctx);
 886         print_options = isl_ast_print_options_set_print_user(print_options,
 887                                 &opencl_print_host_user, &data);
 888
 889         p = gpu_print_macros(p, tree);
 890         p = isl_ast_node_print(tree, p, print_options);
 891
 892         return p;
 893 }
 894
 895 /* Copy "array" back from the GPU to the host.
 896  */
 897 static __isl_give isl_printer *copy_array_from_device(__isl_take isl_printer *p,
 898         void *user)
 899 {
 900         struct gpu_array_info *array = user;
 901
 902         p = isl_printer_start_line(p);
 903         p = isl_printer_print_str(p, "openclCheckReturn("
 904                                         "clEnqueueReadBuffer(queue,"
 905                                         " dev_");
 906         p = isl_printer_print_str(p, array->name);
 907         p = isl_printer_print_str(p, ", CL_TRUE, 0, ");
 908         p = gpu_array_info_print_size(p, array);
 909
 910         if (gpu_array_is_scalar(array))
 911                 p = isl_printer_print_str(p, ", &");
 912         else
 913                 p = isl_printer_print_str(p, ", ");
 914         p = isl_printer_print_str(p, array->name);
 915         p = isl_printer_print_str(p, ", 0, NULL, NULL));");
 916         p = isl_printer_end_line(p);
 917
 918         return p;
 919 }
 920
 921 /* Copy copy_out arrays back from the GPU to the host.
 922  *
 923  * Only perform the copying for arrays with strictly positive size.
 924  */
 925 static __isl_give isl_printer *opencl_copy_arrays_from_device(
 926         __isl_take isl_printer *p, struct gpu_prog *prog)
 927 {
 928         int i;
 929         isl_union_set *copy_out;
 930         copy_out = isl_union_set_copy(prog->copy_out);
 931
 932         for (i = 0; i < prog->n_array; ++i) {
 933                 struct gpu_array_info *array = &prog->array[i];
 934                 isl_space *space;
 935                 isl_set *copy_out_i;
 936                 isl_set *guard;
 937                 int empty;
 938
 939                 space = isl_space_copy(array->space);
 940                 copy_out_i = isl_union_set_extract_set(copy_out, space);
 941                 empty = isl_set_fast_is_empty(copy_out_i);
 942                 isl_set_free(copy_out_i);
 943                 if (empty)
 944                         continue;
 945
 946                 guard = gpu_array_positive_size_guard(array);
 947                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
 948                                             &copy_array_from_device, array);
 949         }
 950
 951         isl_union_set_free(copy_out);
 952         p = isl_printer_start_line(p);
 953         p = isl_printer_end_line(p);
 954         return p;
 955 }
 956
 957 /* Create an OpenCL device, context, command queue and build the kernel.
 958  * input is the name of the input file provided to ppcg.
 959  */
 960 static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
 961         const char *input, struct opencl_info *info)
 962 {
 963         int len;
 964
 965         p = isl_printer_start_line(p);
 966         p = isl_printer_print_str(p, "cl_device_id device;");
 967         p = isl_printer_end_line(p);
 968         p = isl_printer_start_line(p);
 969         p = isl_printer_print_str(p, "cl_context context;");
 970         p = isl_printer_end_line(p);
 971         p = isl_printer_start_line(p);
 972         p = isl_printer_print_str(p, "cl_program program;");
 973         p = isl_printer_end_line(p);
 974         p = isl_printer_start_line(p);
 975         p = isl_printer_print_str(p, "cl_command_queue queue;");
 976         p = isl_printer_end_line(p);
 977         p = isl_printer_start_line(p);
 978         p = isl_printer_print_str(p, "cl_int err;");
 979         p = isl_printer_end_line(p);
 980         p = isl_printer_start_line(p);
 981         p = isl_printer_print_str(p, "device = opencl_create_device(");
 982         p = isl_printer_print_int(p, info->options->opencl_use_gpu);
 983         p = isl_printer_print_str(p, ");");
 984         p = isl_printer_end_line(p);
 985         p = isl_printer_start_line(p);
 986         p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1,"
 987                 "&device, NULL, NULL, &err);");
 988         p = isl_printer_end_line(p);
 989         p = isl_printer_start_line(p);
 990         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 991         p = isl_printer_end_line(p);
 992         p = isl_printer_start_line(p);
 993         p = isl_printer_print_str(p, "queue = clCreateCommandQueue"
 994                                         "(context, device, 0, &err);");
 995         p = isl_printer_end_line(p);
 996         p = isl_printer_start_line(p);
 997         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 998         p = isl_printer_end_line(p);
 999
1000         p = isl_printer_start_line(p);
1001         p = isl_printer_print_str(p, "program = opencl_build_program("
1002                                         "context, device, \"");
1003         p = isl_printer_print_str(p, info->kernel_c_name);
1004         p = isl_printer_print_str(p, "\", \"");
1005
1006         if (info->options->opencl_compiler_options)
1007                 p = isl_printer_print_str(p,
1008                                         info->options->opencl_compiler_options);
1009
1010         p = isl_printer_print_str(p, "\");");
1011         p = isl_printer_end_line(p);
1012         p = isl_printer_start_line(p);
1013         p = isl_printer_end_line(p);
1014
1015         return p;
1016 }
1017
1018 static __isl_give isl_printer *opencl_release_cl_objects(
1019         __isl_take isl_printer *p, struct opencl_info *info)
1020 {
1021         p = isl_printer_start_line(p);
1022         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue"
1023                                         "(queue));");
1024         p = isl_printer_end_line(p);
1025         p = isl_printer_start_line(p);
1026         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram"
1027                                         "(program));");
1028         p = isl_printer_end_line(p);
1029         p = isl_printer_start_line(p);
1030         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext"
1031                                         "(context));");
1032         p = isl_printer_end_line(p);
1033
1034         return p;
1035 }
1036
1037 /* Free the device array corresponding to "array"
1038  */
1039 static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
1040         void *user)
1041 {
1042         struct gpu_array_info *array = user;
1043
1044         p = isl_printer_start_line(p);
1045         p = isl_printer_print_str(p, "openclCheckReturn("
1046                                         "clReleaseMemObject(dev_");
1047         p = isl_printer_print_str(p, array->name);
1048         p = isl_printer_print_str(p, "));");
1049         p = isl_printer_end_line(p);
1050
1051         return p;
1052 }
1053
1054 /* Free the device arrays.
1055  *
1056  * Only free arrays with strictly positive size as those are the only ones
1057  * that have been allocated.
1058  */
1059 static __isl_give isl_printer *opencl_release_device_arrays(
1060         __isl_take isl_printer *p, struct gpu_prog *prog)
1061 {
1062         int i, j;
1063
1064         for (i = 0; i < prog->n_array; ++i) {
1065                 struct gpu_array_info *array = &prog->array[i];
1066                 isl_set *guard;
1067
1068                 if (gpu_array_is_read_only_scalar(array))
1069                         continue;
1070
1071                 guard = gpu_array_positive_size_guard(array);
1072                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
1073                                             &release_device_array, array);
1074         }
1075         return p;
1076 }
1077
1078 /* Given a gpu_prog "prog" and the corresponding transformed AST
1079  * "tree", print the entire OpenCL code to "p".
1080  */
1081 static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
1082         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
1083         struct gpu_types *types, void *user)
1084 {
1085         struct opencl_info *opencl = user;
1086         isl_printer *kernel;
1087
1088         kernel = isl_printer_to_file(isl_printer_get_ctx(p), opencl->kernel_c);
1089         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
1090         if (any_double_elements(prog))
1091                 kernel = opencl_enable_double_support(kernel);
1092         kernel = gpu_print_types(kernel, types, prog);
1093         isl_printer_free(kernel);
1094
1095         if (!kernel)
1096                 return isl_printer_free(p);
1097
1098         p = ppcg_start_block(p);
1099
1100         p = print_opencl_macros(p);
1101
1102         p = opencl_declare_device_arrays(p, prog);
1103         p = opencl_setup(p, opencl->input, opencl);
1104         p = opencl_allocate_device_arrays(p, prog);
1105
1106         p = opencl_print_host_code(p, prog, tree, opencl);
1107
1108         p = opencl_copy_arrays_from_device(p, prog);
1109         p = opencl_release_device_arrays(p, prog);
1110         p = opencl_release_cl_objects(p, opencl);
1111
1112         p = ppcg_end_block(p);
1113
1114         return p;
1115 }
1116
1117 /* Transform the code in the file called "input" by replacing
1118  * all scops by corresponding OpenCL code.
1119  * The host code is written to "output" or a name derived from
1120  * "input" if "output" is NULL.
1121  * The kernel code is placed in separate files with names
1122  * derived from "output" or "input".
1123  *
1124  * We let generate_gpu do all the hard work and then let it call
1125  * us back for printing the AST in print_cuda.
1126  *
1127  * To prepare for this printing, we first open the output files
1128  * and we close them after generate_gpu has finished.
1129  */
1130 int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
1131         const char *input, const char *output)
1132 {
1133         struct opencl_info opencl = { options, input, output };
1134         int r;
1135
1136         r = opencl_open_files(&opencl);
1137
1138         if (r >= 0)
1139                 r = generate_gpu(ctx, input, opencl.host_c, options,
1140                                 &print_opencl, &opencl);
1141
1142         opencl_close_files(&opencl);
1143
1144         return r;
1145 }