opencl.c

   1 /*
   2  * Copyright 2013      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege and Riyadh Baghdadi,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <ctype.h>
  11 #include <limits.h>
  12 #include <string.h>
  13
  14 #include <isl/aff.h>
  15 #include <isl/ast.h>
  16
  17 #include "opencl.h"
  18 #include "gpu_print.h"
  19 #include "gpu.h"
  20 #include "ppcg.h"
  21 #include "print.h"
  22 #include "schedule.h"
  23
  24 #define min(a, b)  (((a) < (b)) ? (a) : (b))
  25 #define max(a, b)  (((a) > (b)) ? (a) : (b))
  26
  27 /* options are the global options passed to generate_opencl.
  28  * input is the name of the input file.
  29  * output is the user-specified output file name and may be NULL
  30  *      if not specified by the user.
  31  * kernel_c_name is the name of the kernel_c file.
  32  * host_c is the generated source file for the host code.  kernel_c is
  33  * the generated source file for the kernel.  kernel_h is the generated
  34  * header file for the kernel.
  35  */
  36 struct opencl_info {
  37         struct ppcg_options *options;
  38         const char *input;
  39         const char *output;
  40         char kernel_c_name[PATH_MAX];
  41
  42         FILE *host_c;
  43         FILE *kernel_c;
  44         FILE *kernel_h;
  45 };
  46
  47 /* Open the host .c file and the kernel .h and .cl files for writing.
  48  * Their names are derived from info->output (or info->input if
  49  * the user did not specify an output file name).
  50  * Add the necessary includes to these files.
  51  */
  52 static void opencl_open_files(struct opencl_info *info)
  53 {
  54         char name[PATH_MAX];
  55         int len;
  56
  57         if (info->output) {
  58                 const char *ext;
  59
  60                 ext = strrchr(info->output, '.');
  61                 len = ext ? ext - info->output : strlen(info->output);
  62                 memcpy(name, info->output, len);
  63
  64                 info->host_c = fopen(info->output, "w");
  65         } else {
  66                 len = ppcg_extract_base_name(name, info->input);
  67
  68                 strcpy(name + len, "_host.c");
  69                 info->host_c = fopen(name, "w");
  70         }
  71
  72         memcpy(info->kernel_c_name, name, len);
  73         strcpy(info->kernel_c_name + len, "_kernel.cl");
  74         info->kernel_c = fopen(info->kernel_c_name, "w");
  75
  76         strcpy(name + len, "_kernel.h");
  77         info->kernel_h = fopen(name, "w");
  78         fprintf(info->host_c, "#include <assert.h>\n");
  79         fprintf(info->host_c, "#include <stdio.h>\n");
  80         fprintf(info->host_c, "#include \"%s\"\n\n", name);
  81         fprintf(info->kernel_h, "#if defined(__APPLE__)\n");
  82         fprintf(info->kernel_h, "#include <OpenCL/opencl.h>\n");
  83         fprintf(info->kernel_h, "#else\n");
  84         fprintf(info->kernel_h, "#include <CL/opencl.h>\n");
  85         fprintf(info->kernel_h, "#endif\n\n");
  86         fprintf(info->kernel_h, "cl_device_id opencl_create_device("
  87                                 "int use_gpu);\n");
  88         fprintf(info->kernel_h, "cl_program opencl_build_program("
  89                                 "cl_context ctx, "
  90                                 "cl_device_id dev, const char *filename, "
  91                                 "const char *opencl_options);\n");
  92         fprintf(info->kernel_h,
  93                 "const char *opencl_error_string(cl_int error);\n");
  94 }
  95
  96 /* Close all output files.
  97  */
  98 static void opencl_close_files(struct opencl_info *info)
  99 {
 100         fclose(info->kernel_c);
 101         fclose(info->kernel_h);
 102         fclose(info->host_c);
 103 }
 104
 105 static __isl_give isl_printer *print_opencl_macros(__isl_take isl_printer *p)
 106 {
 107         const char *macros =
 108                 "#define openclCheckReturn(ret) \\\n"
 109                 "  if (ret != CL_SUCCESS) {\\\n"
 110                 "    fprintf(stderr, \"OpenCL error: %s\\n\", "
 111                 " opencl_error_string(ret)); \\\n"
 112                 "    fflush(stderr); \\\n"
 113                 "    assert(ret == CL_SUCCESS);\\\n  }\n";
 114
 115         p = isl_printer_start_line(p);
 116         p = isl_printer_print_str(p, macros);
 117         p = isl_printer_end_line(p);
 118
 119         return p;
 120 }
 121
 122 static __isl_give isl_printer *opencl_declare_device_arrays(
 123         __isl_take isl_printer *p, struct gpu_prog *prog)
 124 {
 125         int i;
 126
 127         for (i = 0; i < prog->n_array; ++i) {
 128                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 129                         continue;
 130                 p = isl_printer_start_line(p);
 131                 p = isl_printer_print_str(p, "cl_mem dev_");
 132                 p = isl_printer_print_str(p, prog->array[i].name);
 133                 p = isl_printer_print_str(p, ";");
 134                 p = isl_printer_end_line(p);
 135         }
 136         p = isl_printer_start_line(p);
 137         p = isl_printer_end_line(p);
 138         return p;
 139 }
 140
 141 /* Internal data structure for allocate_device_array.
 142  *
 143  * array is the array that needs to be allocated.
 144  * copy is set if the contents of this array need to be copied to the device.
 145  */
 146 struct opencl_allocate_device_array_data {
 147         struct gpu_array_info *array;
 148         int copy;
 149 };
 150
 151 /* Allocate a device array for data->array and copy the contents to the device
 152  * if data->copy is set.
 153  */
 154 static __isl_give isl_printer *allocate_device_array(__isl_take isl_printer *p,
 155         void *user)
 156 {
 157         struct opencl_allocate_device_array_data *data = user;
 158         struct gpu_array_info *array = data->array;
 159
 160         p = ppcg_start_block(p);
 161
 162         p = isl_printer_start_line(p);
 163         p = isl_printer_print_str(p, "dev_");
 164         p = isl_printer_print_str(p, array->name);
 165         p = isl_printer_print_str(p, " = clCreateBuffer(context, ");
 166         p = isl_printer_print_str(p, "CL_MEM_READ_WRITE");
 167
 168         if (!data->copy)
 169                 p = isl_printer_print_str(p, ", ");
 170         else
 171                 p = isl_printer_print_str(p, " | CL_MEM_COPY_HOST_PTR, ");
 172
 173         p = gpu_array_info_print_size(p, array);
 174
 175         if (!data->copy)
 176                 p = isl_printer_print_str(p, ", NULL");
 177         else if (gpu_array_is_scalar(array)) {
 178                 p = isl_printer_print_str(p, ", &");
 179                 p = isl_printer_print_str(p, array->name);
 180         } else {
 181                 p = isl_printer_print_str(p, ", ");
 182                 p = isl_printer_print_str(p, array->name);
 183         }
 184
 185         p = isl_printer_print_str(p, ", &err);");
 186         p = isl_printer_end_line(p);
 187         p = isl_printer_start_line(p);
 188         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 189         p = isl_printer_end_line(p);
 190
 191         p = ppcg_end_block(p);
 192
 193         return p;
 194 }
 195
 196 /* Allocate device arrays and copy the contents of copy_in arrays into device.
 197  *
 198  * Only perform the allocation for arrays with strictly positive size.
 199  */
 200 static __isl_give isl_printer *opencl_allocate_device_arrays(
 201         __isl_take isl_printer *p, struct gpu_prog *prog)
 202 {
 203         int i, j;
 204
 205         for (i = 0; i < prog->n_array; ++i) {
 206                 struct opencl_allocate_device_array_data data;
 207                 struct gpu_array_info *array = &prog->array[i];
 208                 isl_space *space;
 209                 isl_set *read_i;
 210                 isl_set *guard;
 211                 int empty;
 212
 213                 if (gpu_array_is_read_only_scalar(array))
 214                         continue;
 215
 216                 space = isl_space_copy(array->space);
 217                 read_i = isl_union_set_extract_set(prog->copy_in, space);
 218                 empty = isl_set_fast_is_empty(read_i);
 219                 isl_set_free(read_i);
 220
 221                 guard = gpu_array_positive_size_guard(array);
 222                 data.array = array;
 223                 data.copy = !empty;
 224                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
 225                                             &allocate_device_array, &data);
 226         }
 227         p = isl_printer_start_line(p);
 228         p = isl_printer_end_line(p);
 229         return p;
 230 }
 231
 232 /* Print a call to the OpenCL clSetKernelArg() function which sets
 233  * the arguments of the kernel.  arg_name and arg_index are the name and the
 234  * index of the kernel argument.  The index of the leftmost argument of
 235  * the kernel is 0 whereas the index of the rightmost argument of the kernel
 236  * is n - 1, where n is the total number of the kernel arguments.
 237  * read_only_scalar is a boolean that indicates whether the argument is a read
 238  * only scalar.
 239  */
 240 static __isl_give isl_printer *opencl_set_kernel_argument(
 241         __isl_take isl_printer *p, int kernel_id,
 242         const char *arg_name, int arg_index, int read_only_scalar)
 243 {
 244         p = isl_printer_start_line(p);
 245         p = isl_printer_print_str(p,
 246                 "openclCheckReturn(clSetKernelArg(kernel");
 247         p = isl_printer_print_int(p, kernel_id);
 248         p = isl_printer_print_str(p, ", ");
 249         p = isl_printer_print_int(p, arg_index);
 250         p = isl_printer_print_str(p, ", sizeof(");
 251
 252         if (read_only_scalar) {
 253                 p = isl_printer_print_str(p, arg_name);
 254                 p = isl_printer_print_str(p, "), &");
 255         } else
 256                 p = isl_printer_print_str(p, "cl_mem), (void *) &dev_");
 257
 258         p = isl_printer_print_str(p, arg_name);
 259         p = isl_printer_print_str(p, "));");
 260         p = isl_printer_end_line(p);
 261
 262         return p;
 263 }
 264
 265 /* Print the block sizes as a list of the sizes in each
 266  * dimension.
 267  */
 268 static __isl_give isl_printer *opencl_print_block_sizes(
 269         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 270 {
 271         int i;
 272
 273         if (kernel->n_block > 0)
 274                 for (i = 0; i < kernel->n_block; ++i) {
 275                         if (i)
 276                                 p = isl_printer_print_str(p, ", ");
 277                         p = isl_printer_print_int(p, kernel->block_dim[i]);
 278                 }
 279         else
 280                 p = isl_printer_print_str(p, "1");
 281
 282         return p;
 283 }
 284
 285 /* Set the arguments of the OpenCL kernel by printing a call to the OpenCL
 286  * clSetKernelArg() function for each kernel argument.
 287  */
 288 static __isl_give isl_printer *opencl_set_kernel_arguments(
 289         __isl_take isl_printer *p, struct gpu_prog *prog,
 290         struct ppcg_kernel *kernel)
 291 {
 292         int i, n, ro;
 293         unsigned nparam;
 294         isl_space *space;
 295         const char *type;
 296         int arg_index = 0;
 297
 298         for (i = 0; i < prog->n_array; ++i) {
 299                 isl_set *arr;
 300                 int empty;
 301
 302                 space = isl_space_copy(prog->array[i].space);
 303                 arr = isl_union_set_extract_set(kernel->arrays, space);
 304                 empty = isl_set_fast_is_empty(arr);
 305                 isl_set_free(arr);
 306                 if (empty)
 307                         continue;
 308                 ro = gpu_array_is_read_only_scalar(&prog->array[i]);
 309                 opencl_set_kernel_argument(p, kernel->id, prog->array[i].name,
 310                         arg_index, ro);
 311                 arg_index++;
 312         }
 313
 314         space = isl_union_set_get_space(kernel->arrays);
 315         nparam = isl_space_dim(space, isl_dim_param);
 316         for (i = 0; i < nparam; ++i) {
 317                 const char *name;
 318
 319                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 320                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 321                 arg_index++;
 322         }
 323         isl_space_free(space);
 324
 325         n = isl_space_dim(kernel->space, isl_dim_set);
 326         for (i = 0; i < n; ++i) {
 327                 const char *name;
 328                 isl_id *id;
 329
 330                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 331                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 332                 arg_index++;
 333         }
 334
 335         return p;
 336 }
 337
 338 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 339  * then print a declaration (including the types of the arguments).
 340  *
 341  * The arguments are printed in the following order
 342  * - the arrays accessed by the kernel
 343  * - the parameters
 344  * - the host loop iterators
 345  */
 346 static __isl_give isl_printer *opencl_print_kernel_arguments(
 347         __isl_take isl_printer *p, struct gpu_prog *prog,
 348         struct ppcg_kernel *kernel, int types)
 349 {
 350         int i, n;
 351         int first = 1;
 352         unsigned nparam;
 353         isl_space *space;
 354         const char *type;
 355
 356         for (i = 0; i < prog->n_array; ++i) {
 357                 isl_set *arr;
 358                 int empty;
 359
 360                 space = isl_space_copy(prog->array[i].space);
 361                 arr = isl_union_set_extract_set(kernel->arrays, space);
 362                 empty = isl_set_fast_is_empty(arr);
 363                 isl_set_free(arr);
 364                 if (empty)
 365                         continue;
 366
 367                 if (!first)
 368                         p = isl_printer_print_str(p, ", ");
 369
 370                 if (types)
 371                         p = gpu_array_info_print_declaration_argument(p,
 372                                 &prog->array[i], "__global");
 373                 else
 374                         p = gpu_array_info_print_call_argument(p,
 375                                 &prog->array[i]);
 376
 377                 first = 0;
 378         }
 379
 380         space = isl_union_set_get_space(kernel->arrays);
 381         nparam = isl_space_dim(space, isl_dim_param);
 382         for (i = 0; i < nparam; ++i) {
 383                 const char *name;
 384
 385                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 386
 387                 if (!first)
 388                         p = isl_printer_print_str(p, ", ");
 389                 if (types)
 390                         p = isl_printer_print_str(p, "int ");
 391                 p = isl_printer_print_str(p, name);
 392
 393                 first = 0;
 394         }
 395         isl_space_free(space);
 396
 397         n = isl_space_dim(kernel->space, isl_dim_set);
 398         type = isl_options_get_ast_iterator_type(prog->ctx);
 399         for (i = 0; i < n; ++i) {
 400                 const char *name;
 401                 isl_id *id;
 402
 403                 if (!first)
 404                         p = isl_printer_print_str(p, ", ");
 405                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 406                 if (types) {
 407                         p = isl_printer_print_str(p, type);
 408                         p = isl_printer_print_str(p, " ");
 409                 }
 410                 p = isl_printer_print_str(p, name);
 411
 412                 first = 0;
 413         }
 414
 415         return p;
 416 }
 417
 418 /* Print the header of the given kernel.
 419  */
 420 static __isl_give isl_printer *opencl_print_kernel_header(
 421         __isl_take isl_printer *p, struct gpu_prog *prog,
 422         struct ppcg_kernel *kernel)
 423 {
 424         p = isl_printer_start_line(p);
 425         p = isl_printer_print_str(p, "__kernel void kernel");
 426         p = isl_printer_print_int(p, kernel->id);
 427         p = isl_printer_print_str(p, "(");
 428         p = opencl_print_kernel_arguments(p, prog, kernel, 1);
 429         p = isl_printer_print_str(p, ")");
 430         p = isl_printer_end_line(p);
 431
 432         return p;
 433 }
 434
 435 /* Unlike the equivalent function in the CUDA backend which prints iterators
 436  * in reverse order to promote coalescing, this function does not print
 437  * iterators in reverse order.  The OpenCL backend currently does not take
 438  * into account any coalescing considerations.
 439  */
 440 static __isl_give isl_printer *opencl_print_kernel_iterators(
 441         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 442 {
 443         int i, n_grid;
 444         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 445         const char *type;
 446
 447         type = isl_options_get_ast_iterator_type(ctx);
 448
 449         n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 450         if (n_grid > 0) {
 451                 p = isl_printer_start_line(p);
 452                 p = isl_printer_print_str(p, type);
 453                 p = isl_printer_print_str(p, " ");
 454                 for (i = 0; i < n_grid; ++i) {
 455                         if (i)
 456                                 p = isl_printer_print_str(p, ", ");
 457                         p = isl_printer_print_str(p, "b");
 458                         p = isl_printer_print_int(p, i);
 459                         p = isl_printer_print_str(p, " = get_group_id(");
 460                         p = isl_printer_print_int(p, i);
 461                         p = isl_printer_print_str(p, ")");
 462                 }
 463                 p = isl_printer_print_str(p, ";");
 464                 p = isl_printer_end_line(p);
 465         }
 466
 467         if (kernel->n_block > 0) {
 468                 p = isl_printer_start_line(p);
 469                 p = isl_printer_print_str(p, type);
 470                 p = isl_printer_print_str(p, " ");
 471                 for (i = 0; i < kernel->n_block; ++i) {
 472                         if (i)
 473                                 p = isl_printer_print_str(p, ", ");
 474                         p = isl_printer_print_str(p, "t");
 475                         p = isl_printer_print_int(p, i);
 476                         p = isl_printer_print_str(p, " = get_local_id(");
 477                         p = isl_printer_print_int(p, i);
 478                         p = isl_printer_print_str(p, ")");
 479                 }
 480                 p = isl_printer_print_str(p, ";");
 481                 p = isl_printer_end_line(p);
 482         }
 483
 484         return p;
 485 }
 486
 487 static __isl_give isl_printer *opencl_print_kernel_var(
 488         __isl_take isl_printer *p, struct ppcg_kernel_var *var)
 489 {
 490         int j;
 491         isl_val *v;
 492
 493         p = isl_printer_start_line(p);
 494         if (var->type == ppcg_access_shared)
 495                 p = isl_printer_print_str(p, "__local ");
 496         p = isl_printer_print_str(p, var->array->type);
 497         p = isl_printer_print_str(p, " ");
 498         p = isl_printer_print_str(p, var->name);
 499         for (j = 0; j < var->array->n_index; ++j) {
 500                 p = isl_printer_print_str(p, "[");
 501                 v = isl_vec_get_element_val(var->size, j);
 502                 p = isl_printer_print_val(p, v);
 503                 p = isl_printer_print_str(p, "]");
 504                 isl_val_free(v);
 505         }
 506         p = isl_printer_print_str(p, ";");
 507         p = isl_printer_end_line(p);
 508
 509         return p;
 510 }
 511
 512 static __isl_give isl_printer *opencl_print_kernel_vars(
 513                 __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 514 {
 515         int i;
 516
 517         for (i = 0; i < kernel->n_var; ++i)
 518                 p = opencl_print_kernel_var(p, &kernel->var[i]);
 519
 520         return p;
 521 }
 522
 523 /* Print a call to barrier() which is a sync statement.
 524  * All work-items in a work-group executing the kernel on a processor must
 525  * execute the barrier() function before any are allowed to continue execution
 526  * beyond the barrier.
 527  * The flag CLK_LOCAL_MEM_FENCE makes the barrier function either flush any
 528  * variables stored in local memory or queue a memory fence to ensure correct
 529  * ordering of memory operations to local memory.
 530  * The flag CLK_GLOBAL_MEM_FENCE makes the barrier function queue a memory
 531  * fence to ensure correct ordering of memory operations to global memory.
 532  */
 533 static __isl_give isl_printer *opencl_print_sync(__isl_take isl_printer *p,
 534         struct ppcg_kernel_stmt *stmt)
 535 {
 536         p = isl_printer_start_line(p);
 537         p = isl_printer_print_str(p,
 538                 "barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
 539         p = isl_printer_end_line(p);
 540
 541         return p;
 542 }
 543
 544 /* This function is called for each user statement in the AST,
 545  * i.e., for each kernel body statement, copy statement or sync statement.
 546  */
 547 static __isl_give isl_printer *opencl_print_kernel_stmt(
 548         __isl_take isl_printer *p,
 549         __isl_take isl_ast_print_options *print_options,
 550         __isl_keep isl_ast_node *node, void *user)
 551 {
 552         isl_id *id;
 553         struct ppcg_kernel_stmt *stmt;
 554
 555         id = isl_ast_node_get_annotation(node);
 556         stmt = isl_id_get_user(id);
 557         isl_id_free(id);
 558
 559         isl_ast_print_options_free(print_options);
 560
 561         switch (stmt->type) {
 562         case ppcg_kernel_copy:
 563                 return ppcg_kernel_print_copy(p, stmt);
 564         case ppcg_kernel_sync:
 565                 return opencl_print_sync(p, stmt);
 566         case ppcg_kernel_domain:
 567                 return ppcg_kernel_print_domain(p, stmt);
 568         }
 569
 570         return p;
 571 }
 572
 573 /* Return true if there is a double array in prog->array or
 574  * if any of the types in prog->scop involve any doubles.
 575  * To check the latter condition, we simply search for the string "double"
 576  * in the type definitions, which may result in false positives.
 577  */
 578 static __isl_give int any_double_elements(struct gpu_prog *prog)
 579 {
 580         int i;
 581
 582         for (i = 0; i < prog->n_array; ++i)
 583                 if (strcmp(prog->array[i].type, "double") == 0)
 584                         return 1;
 585
 586         for (i = 0; i < prog->scop->n_type; ++i) {
 587                 struct pet_type *type = prog->scop->types[i];
 588
 589                 if (strstr(type->definition, "double"))
 590                         return 1;
 591         }
 592
 593         return 0;
 594 }
 595
 596 /* Prints a #pragma to enable support for double floating-point
 597  * precision.  OpenCL 1.0 adds support for double precision floating-point as
 598  * an optional extension. An application that wants to use double will need to
 599  * include the #pragma OPENCL EXTENSION cl_khr_fp64 : enable directive before
 600  * any double precision data type is declared in the kernel code.
 601  */
 602 static __isl_give isl_printer *opencl_enable_double_support(
 603         __isl_take isl_printer *p)
 604 {
 605         int i;
 606
 607         p = isl_printer_start_line(p);
 608         p = isl_printer_print_str(p, "#pragma OPENCL EXTENSION cl_khr_fp64 :"
 609                 " enable");
 610         p = isl_printer_end_line(p);
 611         p = isl_printer_start_line(p);
 612         p = isl_printer_end_line(p);
 613
 614         return p;
 615 }
 616
 617 static void opencl_print_kernel(struct gpu_prog *prog,
 618         struct ppcg_kernel *kernel, struct opencl_info *opencl)
 619 {
 620         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 621         isl_ast_print_options *print_options;
 622         isl_printer *p;
 623
 624         p = isl_printer_to_file(ctx, opencl->kernel_c);
 625         print_options = isl_ast_print_options_alloc(ctx);
 626         print_options = isl_ast_print_options_set_print_user(print_options,
 627                                 &opencl_print_kernel_stmt, NULL);
 628
 629         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 630         p = opencl_print_kernel_header(p, prog, kernel);
 631         p = isl_printer_print_str(p, "{");
 632         p = isl_printer_end_line(p);
 633         p = isl_printer_indent(p, 4);
 634         p = opencl_print_kernel_iterators(p, kernel);
 635         p = opencl_print_kernel_vars(p, kernel);
 636         p = isl_printer_end_line(p);
 637         p = gpu_print_macros(p, kernel->tree);
 638         p = isl_ast_node_print(kernel->tree, p, print_options);
 639         p = isl_printer_print_str(p, "}");
 640         p = isl_printer_end_line(p);
 641         isl_printer_free(p);
 642 }
 643
 644 struct print_host_user_data_opencl {
 645         struct opencl_info *opencl;
 646         struct gpu_prog *prog;
 647 };
 648
 649 /* This function prints the i'th block size multiplied by the i'th grid size,
 650  * where i (a parameter to this function) is one of the possible dimensions of
 651  * grid sizes and block sizes.
 652  * If the dimension of block sizes is not equal to the dimension of grid sizes
 653  * the output is calculated as follows:
 654  *
 655  * Suppose that:
 656  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 657  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 658  *
 659  * The output is:
 660  * If (i > dim2) then the output is block_sizes[i]
 661  * If (i > dim1) then the output is grid_sizes[i]
 662  */
 663 static __isl_give isl_printer *opencl_print_total_number_of_work_items_for_dim(
 664         __isl_take isl_printer *p, struct ppcg_kernel *kernel, int i)
 665 {
 666         int grid_dim, block_dim;
 667
 668         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 669         block_dim = kernel->n_block;
 670
 671         isl_pw_aff *bound_grid;
 672
 673         if (i < min(grid_dim, block_dim)) {
 674                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 675                 p = isl_printer_print_str(p, "(");
 676                 p = isl_printer_print_pw_aff(p, bound_grid);
 677                 p = isl_printer_print_str(p, ") * ");
 678                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 679                 isl_pw_aff_free(bound_grid);
 680         } else if (i >= grid_dim)
 681                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 682         else {
 683                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 684                 p = isl_printer_print_pw_aff(p, bound_grid);
 685                 isl_pw_aff_free(bound_grid);
 686         }
 687
 688         return p;
 689 }
 690
 691 /* Print a list that represents the total number of work items.  The list is
 692  * constructed by performing an element-wise multiplication of the block sizes
 693  * and the grid sizes.  To explain how the list is constructed, suppose that:
 694  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 695  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 696  *
 697  * The output of this function is constructed as follows:
 698  * If (dim1 > dim2) then the output is the following list:
 699  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim2-1]*block_sizes[dim2-1],
 700  * block_sizes[dim2], ..., block_sizes[dim1-2], block_sizes[dim1-1].
 701  *
 702  * If (dim2 > dim1) then the output is the following list:
 703  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim1-1] * block_sizes[dim1-1],
 704  * grid_sizes[dim1], grid_sizes[dim2-2], grid_sizes[dim2-1].
 705  *
 706  * To calculate the total number of work items out of the list constructed by
 707  * this function, the user should multiply the elements of the list.
 708  */
 709 static __isl_give isl_printer *opencl_print_total_number_of_work_items_as_list(
 710         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 711 {
 712         int i;
 713         int grid_dim, block_dim;
 714
 715         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 716         block_dim = kernel->n_block;
 717
 718         if ((grid_dim <= 0) || (block_dim <= 0)) {
 719                 p = isl_printer_print_str(p, "1");
 720                 return p;
 721         }
 722
 723         for (i = 0; i <= max(grid_dim, block_dim) - 1; i++) {
 724                 if (i > 0)
 725                         p = isl_printer_print_str(p, ", ");
 726
 727                 p = opencl_print_total_number_of_work_items_for_dim(p,
 728                         kernel, i);
 729         }
 730
 731         return p;
 732 }
 733
 734 /* Print the user statement of the host code to "p".
 735  *
 736  * In particular, print a block of statements that defines the grid
 737  * and the work group and then launches the kernel.
 738  *
 739  * A grid is composed of many work groups (blocks), each work group holds
 740  * many work-items (threads).
 741  *
 742  * global_work_size[kernel->n_block] represents the total number of work
 743  * items.  It points to an array of kernel->n_block unsigned
 744  * values that describe the total number of work-items that will execute
 745  * the kernel.  The total number of work-items is computed as:
 746  * global_work_size[0] *...* global_work_size[kernel->n_block - 1].
 747  *
 748  * The size of each work group (i.e. the number of work-items in each work
 749  * group) is described using block_size[kernel->n_block].  The total
 750  * number of work-items in a block (work-group) is computed as:
 751  * block_size[0] *... * block_size[kernel->n_block - 1].
 752  *
 753  * For more information check:
 754  * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 755  */
 756 static __isl_give isl_printer *opencl_print_host_user(
 757         __isl_take isl_printer *p,
 758         __isl_take isl_ast_print_options *print_options,
 759         __isl_keep isl_ast_node *node, void *user)
 760 {
 761         isl_id *id;
 762         struct ppcg_kernel *kernel;
 763         struct print_host_user_data_opencl *data;
 764         int i;
 765
 766         id = isl_ast_node_get_annotation(node);
 767         kernel = isl_id_get_user(id);
 768         isl_id_free(id);
 769
 770         data = (struct print_host_user_data_opencl *) user;
 771
 772         p = isl_printer_start_line(p);
 773         p = isl_printer_print_str(p, "{");
 774         p = isl_printer_end_line(p);
 775         p = isl_printer_indent(p, 2);
 776
 777         p = isl_printer_start_line(p);
 778         p = isl_printer_print_str(p, "size_t global_work_size[");
 779
 780         if (kernel->n_block > 0)
 781                 p = isl_printer_print_int(p, kernel->n_block);
 782         else
 783                 p = isl_printer_print_int(p, 1);
 784
 785         p = isl_printer_print_str(p, "] = {");
 786         p = opencl_print_total_number_of_work_items_as_list(p, kernel);
 787         p = isl_printer_print_str(p, "};");
 788         p = isl_printer_end_line(p);
 789
 790         p = isl_printer_start_line(p);
 791         p = isl_printer_print_str(p, "size_t block_size[");
 792
 793         if (kernel->n_block > 0)
 794                 p = isl_printer_print_int(p, kernel->n_block);
 795         else
 796                 p = isl_printer_print_int(p, 1);
 797
 798         p = isl_printer_print_str(p, "] = {");
 799         p = opencl_print_block_sizes(p, kernel);
 800         p = isl_printer_print_str(p, "};");
 801         p = isl_printer_end_line(p);
 802
 803         p = isl_printer_start_line(p);
 804         p = isl_printer_print_str(p, "cl_kernel kernel");
 805         p = isl_printer_print_int(p, kernel->id);
 806         p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel");
 807         p = isl_printer_print_int(p, kernel->id);
 808         p = isl_printer_print_str(p, "\", &err);");
 809         p = isl_printer_end_line(p);
 810         p = isl_printer_start_line(p);
 811         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 812         p = isl_printer_end_line(p);
 813
 814         opencl_set_kernel_arguments(p, data->prog, kernel);
 815
 816         p = isl_printer_start_line(p);
 817         p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel"
 818                 "(queue, kernel");
 819         p = isl_printer_print_int(p, kernel->id);
 820         p = isl_printer_print_str(p, ", ");
 821         if (kernel->n_block > 0)
 822                 p = isl_printer_print_int(p, kernel->n_block);
 823         else
 824                 p = isl_printer_print_int(p, 1);
 825
 826         p = isl_printer_print_str(p, ", NULL, global_work_size,"
 827                                         "block_size,"
 828                                         "0, NULL, NULL));");
 829         p = isl_printer_end_line(p);
 830         p = isl_printer_start_line(p);
 831         p = isl_printer_print_str(p, "openclCheckReturn("
 832                                         "clReleaseKernel(kernel");
 833         p = isl_printer_print_int(p, kernel->id);
 834         p = isl_printer_print_str(p, "));");
 835         p = isl_printer_end_line(p);
 836         p = isl_printer_start_line(p);
 837         p = isl_printer_print_str(p, "clFinish(queue);");
 838         p = isl_printer_end_line(p);
 839         p = isl_printer_indent(p, -2);
 840         p = isl_printer_start_line(p);
 841         p = isl_printer_print_str(p, "}");
 842         p = isl_printer_end_line(p);
 843
 844         p = isl_printer_start_line(p);
 845         p = isl_printer_end_line(p);
 846
 847         opencl_print_kernel(data->prog, kernel, data->opencl);
 848
 849         isl_ast_print_options_free(print_options);
 850
 851         return p;
 852 }
 853
 854 static __isl_give isl_printer *opencl_print_host_code(
 855         __isl_take isl_printer *p, struct gpu_prog *prog,
 856         __isl_keep isl_ast_node *tree, struct opencl_info *opencl)
 857 {
 858         isl_ast_print_options *print_options;
 859         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 860         struct print_host_user_data_opencl data = { opencl, prog };
 861
 862         print_options = isl_ast_print_options_alloc(ctx);
 863         print_options = isl_ast_print_options_set_print_user(print_options,
 864                                 &opencl_print_host_user, &data);
 865
 866         p = gpu_print_macros(p, tree);
 867         p = isl_ast_node_print(tree, p, print_options);
 868
 869         return p;
 870 }
 871
 872 /* Copy "array" back from the GPU to the host.
 873  */
 874 static __isl_give isl_printer *copy_array_from_device(__isl_take isl_printer *p,
 875         void *user)
 876 {
 877         struct gpu_array_info *array = user;
 878
 879         p = isl_printer_start_line(p);
 880         p = isl_printer_print_str(p, "openclCheckReturn("
 881                                         "clEnqueueReadBuffer(queue,"
 882                                         " dev_");
 883         p = isl_printer_print_str(p, array->name);
 884         p = isl_printer_print_str(p, ", CL_TRUE, 0, ");
 885         p = gpu_array_info_print_size(p, array);
 886
 887         if (gpu_array_is_scalar(array))
 888                 p = isl_printer_print_str(p, ", &");
 889         else
 890                 p = isl_printer_print_str(p, ", ");
 891         p = isl_printer_print_str(p, array->name);
 892         p = isl_printer_print_str(p, ", 0, NULL, NULL));");
 893         p = isl_printer_end_line(p);
 894
 895         return p;
 896 }
 897
 898 /* Copy copy_out arrays back from the GPU to the host.
 899  *
 900  * Only perform the copying for arrays with strictly positive size.
 901  */
 902 static __isl_give isl_printer *opencl_copy_arrays_from_device(
 903         __isl_take isl_printer *p, struct gpu_prog *prog)
 904 {
 905         int i;
 906         isl_union_set *copy_out;
 907         copy_out = isl_union_set_copy(prog->copy_out);
 908
 909         for (i = 0; i < prog->n_array; ++i) {
 910                 struct gpu_array_info *array = &prog->array[i];
 911                 isl_space *space;
 912                 isl_set *copy_out_i;
 913                 isl_set *guard;
 914                 int empty;
 915
 916                 space = isl_space_copy(array->space);
 917                 copy_out_i = isl_union_set_extract_set(copy_out, space);
 918                 empty = isl_set_fast_is_empty(copy_out_i);
 919                 isl_set_free(copy_out_i);
 920                 if (empty)
 921                         continue;
 922
 923                 guard = gpu_array_positive_size_guard(array);
 924                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
 925                                             &copy_array_from_device, array);
 926         }
 927
 928         isl_union_set_free(copy_out);
 929         p = isl_printer_start_line(p);
 930         p = isl_printer_end_line(p);
 931         return p;
 932 }
 933
 934 /* Create an OpenCL device, context, command queue and build the kernel.
 935  * input is the name of the input file provided to ppcg.
 936  */
 937 static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
 938         const char *input, struct opencl_info *info)
 939 {
 940         int len;
 941
 942         p = isl_printer_start_line(p);
 943         p = isl_printer_print_str(p, "cl_device_id device;");
 944         p = isl_printer_end_line(p);
 945         p = isl_printer_start_line(p);
 946         p = isl_printer_print_str(p, "cl_context context;");
 947         p = isl_printer_end_line(p);
 948         p = isl_printer_start_line(p);
 949         p = isl_printer_print_str(p, "cl_program program;");
 950         p = isl_printer_end_line(p);
 951         p = isl_printer_start_line(p);
 952         p = isl_printer_print_str(p, "cl_command_queue queue;");
 953         p = isl_printer_end_line(p);
 954         p = isl_printer_start_line(p);
 955         p = isl_printer_print_str(p, "cl_int err;");
 956         p = isl_printer_end_line(p);
 957         p = isl_printer_start_line(p);
 958         p = isl_printer_print_str(p, "device = opencl_create_device(");
 959         p = isl_printer_print_int(p, info->options->opencl_use_gpu);
 960         p = isl_printer_print_str(p, ");");
 961         p = isl_printer_end_line(p);
 962         p = isl_printer_start_line(p);
 963         p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1,"
 964                 "&device, NULL, NULL, &err);");
 965         p = isl_printer_end_line(p);
 966         p = isl_printer_start_line(p);
 967         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 968         p = isl_printer_end_line(p);
 969         p = isl_printer_start_line(p);
 970         p = isl_printer_print_str(p, "queue = clCreateCommandQueue"
 971                                         "(context, device, 0, &err);");
 972         p = isl_printer_end_line(p);
 973         p = isl_printer_start_line(p);
 974         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 975         p = isl_printer_end_line(p);
 976
 977         p = isl_printer_start_line(p);
 978         p = isl_printer_print_str(p, "program = opencl_build_program("
 979                                         "context, device, \"");
 980         p = isl_printer_print_str(p, info->kernel_c_name);
 981         p = isl_printer_print_str(p, "\", \"");
 982
 983         if (info->options->opencl_compiler_options)
 984                 p = isl_printer_print_str(p,
 985                                         info->options->opencl_compiler_options);
 986
 987         p = isl_printer_print_str(p, "\");");
 988         p = isl_printer_end_line(p);
 989         p = isl_printer_start_line(p);
 990         p = isl_printer_end_line(p);
 991
 992         return p;
 993 }
 994
 995 static __isl_give isl_printer *opencl_release_cl_objects(
 996         __isl_take isl_printer *p, struct opencl_info *info)
 997 {
 998         p = isl_printer_start_line(p);
 999         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue"
1000                                         "(queue));");
1001         p = isl_printer_end_line(p);
1002         p = isl_printer_start_line(p);
1003         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram"
1004                                         "(program));");
1005         p = isl_printer_end_line(p);
1006         p = isl_printer_start_line(p);
1007         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext"
1008                                         "(context));");
1009         p = isl_printer_end_line(p);
1010
1011         return p;
1012 }
1013
1014 /* Free the device array corresponding to "array"
1015  */
1016 static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
1017         void *user)
1018 {
1019         struct gpu_array_info *array = user;
1020
1021         p = isl_printer_start_line(p);
1022         p = isl_printer_print_str(p, "openclCheckReturn("
1023                                         "clReleaseMemObject(dev_");
1024         p = isl_printer_print_str(p, array->name);
1025         p = isl_printer_print_str(p, "));");
1026         p = isl_printer_end_line(p);
1027
1028         return p;
1029 }
1030
1031 /* Free the device arrays.
1032  *
1033  * Only free arrays with strictly positive size as those are the only ones
1034  * that have been allocated.
1035  */
1036 static __isl_give isl_printer *opencl_release_device_arrays(
1037         __isl_take isl_printer *p, struct gpu_prog *prog)
1038 {
1039         int i, j;
1040
1041         for (i = 0; i < prog->n_array; ++i) {
1042                 struct gpu_array_info *array = &prog->array[i];
1043                 isl_set *guard;
1044
1045                 if (gpu_array_is_read_only_scalar(array))
1046                         continue;
1047
1048                 guard = gpu_array_positive_size_guard(array);
1049                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
1050                                             &release_device_array, array);
1051         }
1052         return p;
1053 }
1054
1055 /* Given a gpu_prog "prog" and the corresponding transformed AST
1056  * "tree", print the entire OpenCL code to "p".
1057  */
1058 static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
1059         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
1060         struct gpu_types *types, void *user)
1061 {
1062         struct opencl_info *opencl = user;
1063         isl_printer *kernel;
1064
1065         kernel = isl_printer_to_file(isl_printer_get_ctx(p), opencl->kernel_c);
1066         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
1067         if (any_double_elements(prog))
1068                 kernel = opencl_enable_double_support(kernel);
1069         kernel = gpu_print_types(kernel, types, prog);
1070         isl_printer_free(kernel);
1071
1072         if (!kernel)
1073                 return isl_printer_free(p);
1074
1075         p = ppcg_start_block(p);
1076
1077         p = print_opencl_macros(p);
1078
1079         p = opencl_declare_device_arrays(p, prog);
1080         p = opencl_setup(p, opencl->input, opencl);
1081         p = opencl_allocate_device_arrays(p, prog);
1082
1083         p = opencl_print_host_code(p, prog, tree, opencl);
1084
1085         p = opencl_copy_arrays_from_device(p, prog);
1086         p = opencl_release_device_arrays(p, prog);
1087         p = opencl_release_cl_objects(p, opencl);
1088
1089         p = ppcg_end_block(p);
1090
1091         return p;
1092 }
1093
1094 /* Transform the code in the file called "input" by replacing
1095  * all scops by corresponding OpenCL code.
1096  * The host code is written to "output" or a name derived from
1097  * "input" if "output" is NULL.
1098  * The kernel code is placed in separate files with names
1099  * derived from "output" or "input".
1100  *
1101  * We let generate_gpu do all the hard work and then let it call
1102  * us back for printing the AST in print_cuda.
1103  *
1104  * To prepare for this printing, we first open the output files
1105  * and we close them after generate_gpu has finished.
1106  */
1107 int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
1108         const char *input, const char *output)
1109 {
1110         struct opencl_info opencl = { options, input, output };
1111         int r;
1112
1113         opencl_open_files(&opencl);
1114
1115         r = generate_gpu(ctx, input, opencl.host_c, options,
1116                         &print_opencl, &opencl);
1117
1118         opencl_close_files(&opencl);
1119
1120         return r;
1121 }