opencl.c

   1 /*
   2  * Copyright 2013      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege and Riyadh Baghdadi,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <ctype.h>
  11 #include <limits.h>
  12 #include <string.h>
  13
  14 #include <isl/aff.h>
  15 #include <isl/ast.h>
  16
  17 #include "opencl.h"
  18 #include "gpu_print.h"
  19 #include "gpu.h"
  20 #include "ppcg.h"
  21 #include "print.h"
  22 #include "schedule.h"
  23
  24 #define min(a, b)  (((a) < (b)) ? (a) : (b))
  25 #define max(a, b)  (((a) > (b)) ? (a) : (b))
  26
  27 /* options are the global options passed to generate_opencl.
  28  * input is the name of the input file.
  29  * output is the user-specified output file name and may be NULL
  30  *      if not specified by the user.
  31  * kernel_c_name is the name of the kernel_c file.
  32  * host_c is the generated source file for the host code.  kernel_c is
  33  * the generated source file for the kernel.  kernel_h is the generated
  34  * header file for the kernel.
  35  */
  36 struct opencl_info {
  37         struct ppcg_options *options;
  38         const char *input;
  39         const char *output;
  40         char kernel_c_name[PATH_MAX];
  41
  42         FILE *host_c;
  43         FILE *kernel_c;
  44         FILE *kernel_h;
  45 };
  46
  47 /* Open the file called "name" for writing or print an error message.
  48  */
  49 static FILE *open_or_croak(const char *name)
  50 {
  51         FILE *file;
  52
  53         file = fopen(name, "w");
  54         if (!file)
  55                 fprintf(stderr, "Failed to open \"%s\" for writing\n", name);
  56         return file;
  57 }
  58
  59 /* Open the host .c file and the kernel .h and .cl files for writing.
  60  * Their names are derived from info->output (or info->input if
  61  * the user did not specify an output file name).
  62  * Add the necessary includes to these files.
  63  *
  64  * Return 0 on success and -1 on failure.
  65  */
  66 static int opencl_open_files(struct opencl_info *info)
  67 {
  68         char name[PATH_MAX];
  69         int len;
  70
  71         if (info->output) {
  72                 const char *ext;
  73
  74                 ext = strrchr(info->output, '.');
  75                 len = ext ? ext - info->output : strlen(info->output);
  76                 memcpy(name, info->output, len);
  77
  78                 info->host_c = open_or_croak(info->output);
  79         } else {
  80                 len = ppcg_extract_base_name(name, info->input);
  81
  82                 strcpy(name + len, "_host.c");
  83                 info->host_c = open_or_croak(name);
  84         }
  85
  86         memcpy(info->kernel_c_name, name, len);
  87         strcpy(info->kernel_c_name + len, "_kernel.cl");
  88         info->kernel_c = open_or_croak(info->kernel_c_name);
  89
  90         strcpy(name + len, "_kernel.h");
  91         info->kernel_h = open_or_croak(name);
  92
  93         if (!info->host_c || !info->kernel_c || !info->host_c)
  94                 return -1;
  95
  96         fprintf(info->host_c, "#include <assert.h>\n");
  97         fprintf(info->host_c, "#include <stdio.h>\n");
  98         fprintf(info->host_c, "#include \"%s\"\n\n", ppcg_base_name(name));
  99         fprintf(info->kernel_h, "#if defined(__APPLE__)\n");
 100         fprintf(info->kernel_h, "#include <OpenCL/opencl.h>\n");
 101         fprintf(info->kernel_h, "#else\n");
 102         fprintf(info->kernel_h, "#include <CL/opencl.h>\n");
 103         fprintf(info->kernel_h, "#endif\n\n");
 104         fprintf(info->kernel_h, "cl_device_id opencl_create_device("
 105                                 "int use_gpu);\n");
 106         fprintf(info->kernel_h, "cl_program opencl_build_program("
 107                                 "cl_context ctx, "
 108                                 "cl_device_id dev, const char *filename, "
 109                                 "const char *opencl_options);\n");
 110         fprintf(info->kernel_h,
 111                 "const char *opencl_error_string(cl_int error);\n");
 112
 113         return 0;
 114 }
 115
 116 /* Close all output files.
 117  */
 118 static void opencl_close_files(struct opencl_info *info)
 119 {
 120         if (info->kernel_c)
 121                 fclose(info->kernel_c);
 122         if (info->kernel_h)
 123                 fclose(info->kernel_h);
 124         if (info->host_c)
 125                 fclose(info->host_c);
 126 }
 127
 128 static __isl_give isl_printer *opencl_print_host_macros(__isl_take isl_printer *p)
 129 {
 130         const char *macros =
 131                 "#define openclCheckReturn(ret) \\\n"
 132                 "  if (ret != CL_SUCCESS) {\\\n"
 133                 "    fprintf(stderr, \"OpenCL error: %s\\n\", "
 134                 "opencl_error_string(ret)); \\\n"
 135                 "    fflush(stderr); \\\n"
 136                 "    assert(ret == CL_SUCCESS);\\\n  }\n";
 137
 138         p = isl_printer_start_line(p);
 139         p = isl_printer_print_str(p, macros);
 140         p = isl_printer_end_line(p);
 141
 142         p = isl_ast_op_type_print_macro(isl_ast_op_max, p);
 143
 144         return p;
 145 }
 146
 147 static __isl_give isl_printer *opencl_declare_device_arrays(
 148         __isl_take isl_printer *p, struct gpu_prog *prog)
 149 {
 150         int i;
 151
 152         for (i = 0; i < prog->n_array; ++i) {
 153                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 154                         continue;
 155                 p = isl_printer_start_line(p);
 156                 p = isl_printer_print_str(p, "cl_mem dev_");
 157                 p = isl_printer_print_str(p, prog->array[i].name);
 158                 p = isl_printer_print_str(p, ";");
 159                 p = isl_printer_end_line(p);
 160         }
 161         p = isl_printer_start_line(p);
 162         p = isl_printer_end_line(p);
 163         return p;
 164 }
 165
 166 /* Given an array, check whether its positive size guard expression is
 167  * trivial.
 168  */
 169 static int is_array_positive_size_guard_trivial(struct gpu_array_info *array)
 170 {
 171         isl_set *guard;
 172         int is_trivial;
 173
 174         guard = gpu_array_positive_size_guard(array);
 175         is_trivial = isl_set_plain_is_universe(guard);
 176         isl_set_free(guard);
 177         return is_trivial;
 178 }
 179
 180 /* Allocate a device array for array and copy the contents to the device
 181  * if copy is set.
 182  *
 183  * Emit a max-expression to ensure the device array can contain at least one
 184  * element if the array's positive size guard expression is not trivial.
 185  */
 186 static __isl_give isl_printer *allocate_device_array(__isl_take isl_printer *p,
 187         struct gpu_array_info *array, int copy)
 188 {
 189         int need_lower_bound;
 190
 191         p = ppcg_start_block(p);
 192
 193         p = isl_printer_start_line(p);
 194         p = isl_printer_print_str(p, "dev_");
 195         p = isl_printer_print_str(p, array->name);
 196         p = isl_printer_print_str(p, " = clCreateBuffer(context, ");
 197         p = isl_printer_print_str(p, "CL_MEM_READ_WRITE");
 198
 199         if (!copy)
 200                 p = isl_printer_print_str(p, ", ");
 201         else
 202                 p = isl_printer_print_str(p, " | CL_MEM_COPY_HOST_PTR, ");
 203
 204         need_lower_bound = !is_array_positive_size_guard_trivial(array);
 205         if (need_lower_bound) {
 206                 p = isl_printer_print_str(p, "max(sizeof(");
 207                 p = isl_printer_print_str(p, array->type);
 208                 p = isl_printer_print_str(p, "), ");
 209         }
 210         p = gpu_array_info_print_size(p, array);
 211         if (need_lower_bound)
 212                 p = isl_printer_print_str(p, ")");
 213
 214         if (!copy)
 215                 p = isl_printer_print_str(p, ", NULL");
 216         else if (gpu_array_is_scalar(array)) {
 217                 p = isl_printer_print_str(p, ", &");
 218                 p = isl_printer_print_str(p, array->name);
 219         } else {
 220                 p = isl_printer_print_str(p, ", ");
 221                 p = isl_printer_print_str(p, array->name);
 222         }
 223
 224         p = isl_printer_print_str(p, ", &err);");
 225         p = isl_printer_end_line(p);
 226         p = isl_printer_start_line(p);
 227         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 228         p = isl_printer_end_line(p);
 229
 230         p = ppcg_end_block(p);
 231
 232         return p;
 233 }
 234
 235 /* Allocate device arrays and copy the contents of copy_in arrays into device.
 236  */
 237 static __isl_give isl_printer *opencl_allocate_device_arrays(
 238         __isl_take isl_printer *p, struct gpu_prog *prog)
 239 {
 240         int i, j;
 241
 242         for (i = 0; i < prog->n_array; ++i) {
 243                 struct gpu_array_info *array = &prog->array[i];
 244                 isl_space *space;
 245                 isl_set *read_i;
 246                 int empty;
 247
 248                 if (gpu_array_is_read_only_scalar(array))
 249                         continue;
 250
 251                 space = isl_space_copy(array->space);
 252                 read_i = isl_union_set_extract_set(prog->copy_in, space);
 253                 empty = isl_set_fast_is_empty(read_i);
 254                 isl_set_free(read_i);
 255
 256                 p = allocate_device_array(p, array, !empty);
 257         }
 258         p = isl_printer_start_line(p);
 259         p = isl_printer_end_line(p);
 260         return p;
 261 }
 262
 263 /* Print a call to the OpenCL clSetKernelArg() function which sets
 264  * the arguments of the kernel.  arg_name and arg_index are the name and the
 265  * index of the kernel argument.  The index of the leftmost argument of
 266  * the kernel is 0 whereas the index of the rightmost argument of the kernel
 267  * is n - 1, where n is the total number of the kernel arguments.
 268  * read_only_scalar is a boolean that indicates whether the argument is a read
 269  * only scalar.
 270  */
 271 static __isl_give isl_printer *opencl_set_kernel_argument(
 272         __isl_take isl_printer *p, int kernel_id,
 273         const char *arg_name, int arg_index, int read_only_scalar)
 274 {
 275         p = isl_printer_start_line(p);
 276         p = isl_printer_print_str(p,
 277                 "openclCheckReturn(clSetKernelArg(kernel");
 278         p = isl_printer_print_int(p, kernel_id);
 279         p = isl_printer_print_str(p, ", ");
 280         p = isl_printer_print_int(p, arg_index);
 281         p = isl_printer_print_str(p, ", sizeof(");
 282
 283         if (read_only_scalar) {
 284                 p = isl_printer_print_str(p, arg_name);
 285                 p = isl_printer_print_str(p, "), &");
 286         } else
 287                 p = isl_printer_print_str(p, "cl_mem), (void *) &dev_");
 288
 289         p = isl_printer_print_str(p, arg_name);
 290         p = isl_printer_print_str(p, "));");
 291         p = isl_printer_end_line(p);
 292
 293         return p;
 294 }
 295
 296 /* Print the block sizes as a list of the sizes in each
 297  * dimension.
 298  */
 299 static __isl_give isl_printer *opencl_print_block_sizes(
 300         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 301 {
 302         int i;
 303
 304         if (kernel->n_block > 0)
 305                 for (i = 0; i < kernel->n_block; ++i) {
 306                         if (i)
 307                                 p = isl_printer_print_str(p, ", ");
 308                         p = isl_printer_print_int(p, kernel->block_dim[i]);
 309                 }
 310         else
 311                 p = isl_printer_print_str(p, "1");
 312
 313         return p;
 314 }
 315
 316 /* Set the arguments of the OpenCL kernel by printing a call to the OpenCL
 317  * clSetKernelArg() function for each kernel argument.
 318  */
 319 static __isl_give isl_printer *opencl_set_kernel_arguments(
 320         __isl_take isl_printer *p, struct gpu_prog *prog,
 321         struct ppcg_kernel *kernel)
 322 {
 323         int i, n, ro;
 324         unsigned nparam;
 325         isl_space *space;
 326         const char *type;
 327         int arg_index = 0;
 328
 329         for (i = 0; i < prog->n_array; ++i) {
 330                 isl_set *arr;
 331                 int empty;
 332
 333                 space = isl_space_copy(prog->array[i].space);
 334                 arr = isl_union_set_extract_set(kernel->arrays, space);
 335                 empty = isl_set_fast_is_empty(arr);
 336                 isl_set_free(arr);
 337                 if (empty)
 338                         continue;
 339                 ro = gpu_array_is_read_only_scalar(&prog->array[i]);
 340                 opencl_set_kernel_argument(p, kernel->id, prog->array[i].name,
 341                         arg_index, ro);
 342                 arg_index++;
 343         }
 344
 345         space = isl_union_set_get_space(kernel->arrays);
 346         nparam = isl_space_dim(space, isl_dim_param);
 347         for (i = 0; i < nparam; ++i) {
 348                 const char *name;
 349
 350                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 351                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 352                 arg_index++;
 353         }
 354         isl_space_free(space);
 355
 356         n = isl_space_dim(kernel->space, isl_dim_set);
 357         for (i = 0; i < n; ++i) {
 358                 const char *name;
 359                 isl_id *id;
 360
 361                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 362                 opencl_set_kernel_argument(p, kernel->id, name, arg_index, 1);
 363                 arg_index++;
 364         }
 365
 366         return p;
 367 }
 368
 369 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 370  * then print a declaration (including the types of the arguments).
 371  *
 372  * The arguments are printed in the following order
 373  * - the arrays accessed by the kernel
 374  * - the parameters
 375  * - the host loop iterators
 376  */
 377 static __isl_give isl_printer *opencl_print_kernel_arguments(
 378         __isl_take isl_printer *p, struct gpu_prog *prog,
 379         struct ppcg_kernel *kernel, int types)
 380 {
 381         int i, n;
 382         int first = 1;
 383         unsigned nparam;
 384         isl_space *space;
 385         const char *type;
 386
 387         for (i = 0; i < prog->n_array; ++i) {
 388                 isl_set *arr;
 389                 int empty;
 390
 391                 space = isl_space_copy(prog->array[i].space);
 392                 arr = isl_union_set_extract_set(kernel->arrays, space);
 393                 empty = isl_set_fast_is_empty(arr);
 394                 isl_set_free(arr);
 395                 if (empty)
 396                         continue;
 397
 398                 if (!first)
 399                         p = isl_printer_print_str(p, ", ");
 400
 401                 if (types)
 402                         p = gpu_array_info_print_declaration_argument(p,
 403                                 &prog->array[i], "__global");
 404                 else
 405                         p = gpu_array_info_print_call_argument(p,
 406                                 &prog->array[i]);
 407
 408                 first = 0;
 409         }
 410
 411         space = isl_union_set_get_space(kernel->arrays);
 412         nparam = isl_space_dim(space, isl_dim_param);
 413         for (i = 0; i < nparam; ++i) {
 414                 const char *name;
 415
 416                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 417
 418                 if (!first)
 419                         p = isl_printer_print_str(p, ", ");
 420                 if (types)
 421                         p = isl_printer_print_str(p, "int ");
 422                 p = isl_printer_print_str(p, name);
 423
 424                 first = 0;
 425         }
 426         isl_space_free(space);
 427
 428         n = isl_space_dim(kernel->space, isl_dim_set);
 429         type = isl_options_get_ast_iterator_type(prog->ctx);
 430         for (i = 0; i < n; ++i) {
 431                 const char *name;
 432                 isl_id *id;
 433
 434                 if (!first)
 435                         p = isl_printer_print_str(p, ", ");
 436                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 437                 if (types) {
 438                         p = isl_printer_print_str(p, type);
 439                         p = isl_printer_print_str(p, " ");
 440                 }
 441                 p = isl_printer_print_str(p, name);
 442
 443                 first = 0;
 444         }
 445
 446         return p;
 447 }
 448
 449 /* Print the header of the given kernel.
 450  */
 451 static __isl_give isl_printer *opencl_print_kernel_header(
 452         __isl_take isl_printer *p, struct gpu_prog *prog,
 453         struct ppcg_kernel *kernel)
 454 {
 455         p = isl_printer_start_line(p);
 456         p = isl_printer_print_str(p, "__kernel void kernel");
 457         p = isl_printer_print_int(p, kernel->id);
 458         p = isl_printer_print_str(p, "(");
 459         p = opencl_print_kernel_arguments(p, prog, kernel, 1);
 460         p = isl_printer_print_str(p, ")");
 461         p = isl_printer_end_line(p);
 462
 463         return p;
 464 }
 465
 466 /* Unlike the equivalent function in the CUDA backend which prints iterators
 467  * in reverse order to promote coalescing, this function does not print
 468  * iterators in reverse order.  The OpenCL backend currently does not take
 469  * into account any coalescing considerations.
 470  */
 471 static __isl_give isl_printer *opencl_print_kernel_iterators(
 472         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 473 {
 474         int i, n_grid;
 475         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 476         const char *type;
 477
 478         type = isl_options_get_ast_iterator_type(ctx);
 479
 480         n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 481         if (n_grid > 0) {
 482                 p = isl_printer_start_line(p);
 483                 p = isl_printer_print_str(p, type);
 484                 p = isl_printer_print_str(p, " ");
 485                 for (i = 0; i < n_grid; ++i) {
 486                         if (i)
 487                                 p = isl_printer_print_str(p, ", ");
 488                         p = isl_printer_print_str(p, "b");
 489                         p = isl_printer_print_int(p, i);
 490                         p = isl_printer_print_str(p, " = get_group_id(");
 491                         p = isl_printer_print_int(p, i);
 492                         p = isl_printer_print_str(p, ")");
 493                 }
 494                 p = isl_printer_print_str(p, ";");
 495                 p = isl_printer_end_line(p);
 496         }
 497
 498         if (kernel->n_block > 0) {
 499                 p = isl_printer_start_line(p);
 500                 p = isl_printer_print_str(p, type);
 501                 p = isl_printer_print_str(p, " ");
 502                 for (i = 0; i < kernel->n_block; ++i) {
 503                         if (i)
 504                                 p = isl_printer_print_str(p, ", ");
 505                         p = isl_printer_print_str(p, "t");
 506                         p = isl_printer_print_int(p, i);
 507                         p = isl_printer_print_str(p, " = get_local_id(");
 508                         p = isl_printer_print_int(p, i);
 509                         p = isl_printer_print_str(p, ")");
 510                 }
 511                 p = isl_printer_print_str(p, ";");
 512                 p = isl_printer_end_line(p);
 513         }
 514
 515         return p;
 516 }
 517
 518 static __isl_give isl_printer *opencl_print_kernel_var(
 519         __isl_take isl_printer *p, struct ppcg_kernel_var *var)
 520 {
 521         int j;
 522         isl_val *v;
 523
 524         p = isl_printer_start_line(p);
 525         if (var->type == ppcg_access_shared)
 526                 p = isl_printer_print_str(p, "__local ");
 527         p = isl_printer_print_str(p, var->array->type);
 528         p = isl_printer_print_str(p, " ");
 529         p = isl_printer_print_str(p, var->name);
 530         for (j = 0; j < var->array->n_index; ++j) {
 531                 p = isl_printer_print_str(p, "[");
 532                 v = isl_vec_get_element_val(var->size, j);
 533                 p = isl_printer_print_val(p, v);
 534                 p = isl_printer_print_str(p, "]");
 535                 isl_val_free(v);
 536         }
 537         p = isl_printer_print_str(p, ";");
 538         p = isl_printer_end_line(p);
 539
 540         return p;
 541 }
 542
 543 static __isl_give isl_printer *opencl_print_kernel_vars(
 544                 __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 545 {
 546         int i;
 547
 548         for (i = 0; i < kernel->n_var; ++i)
 549                 p = opencl_print_kernel_var(p, &kernel->var[i]);
 550
 551         return p;
 552 }
 553
 554 /* Print a call to barrier() which is a sync statement.
 555  * All work-items in a work-group executing the kernel on a processor must
 556  * execute the barrier() function before any are allowed to continue execution
 557  * beyond the barrier.
 558  * The flag CLK_LOCAL_MEM_FENCE makes the barrier function either flush any
 559  * variables stored in local memory or queue a memory fence to ensure correct
 560  * ordering of memory operations to local memory.
 561  * The flag CLK_GLOBAL_MEM_FENCE makes the barrier function queue a memory
 562  * fence to ensure correct ordering of memory operations to global memory.
 563  */
 564 static __isl_give isl_printer *opencl_print_sync(__isl_take isl_printer *p,
 565         struct ppcg_kernel_stmt *stmt)
 566 {
 567         p = isl_printer_start_line(p);
 568         p = isl_printer_print_str(p,
 569                 "barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
 570         p = isl_printer_end_line(p);
 571
 572         return p;
 573 }
 574
 575 /* This function is called for each user statement in the AST,
 576  * i.e., for each kernel body statement, copy statement or sync statement.
 577  */
 578 static __isl_give isl_printer *opencl_print_kernel_stmt(
 579         __isl_take isl_printer *p,
 580         __isl_take isl_ast_print_options *print_options,
 581         __isl_keep isl_ast_node *node, void *user)
 582 {
 583         isl_id *id;
 584         struct ppcg_kernel_stmt *stmt;
 585
 586         id = isl_ast_node_get_annotation(node);
 587         stmt = isl_id_get_user(id);
 588         isl_id_free(id);
 589
 590         isl_ast_print_options_free(print_options);
 591
 592         switch (stmt->type) {
 593         case ppcg_kernel_copy:
 594                 return ppcg_kernel_print_copy(p, stmt);
 595         case ppcg_kernel_sync:
 596                 return opencl_print_sync(p, stmt);
 597         case ppcg_kernel_domain:
 598                 return ppcg_kernel_print_domain(p, stmt);
 599         }
 600
 601         return p;
 602 }
 603
 604 /* Return true if there is a double array in prog->array or
 605  * if any of the types in prog->scop involve any doubles.
 606  * To check the latter condition, we simply search for the string "double"
 607  * in the type definitions, which may result in false positives.
 608  */
 609 static __isl_give int any_double_elements(struct gpu_prog *prog)
 610 {
 611         int i;
 612
 613         for (i = 0; i < prog->n_array; ++i)
 614                 if (strcmp(prog->array[i].type, "double") == 0)
 615                         return 1;
 616
 617         for (i = 0; i < prog->scop->n_type; ++i) {
 618                 struct pet_type *type = prog->scop->types[i];
 619
 620                 if (strstr(type->definition, "double"))
 621                         return 1;
 622         }
 623
 624         return 0;
 625 }
 626
 627 /* Prints a #pragma to enable support for double floating-point
 628  * precision.  OpenCL 1.0 adds support for double precision floating-point as
 629  * an optional extension. An application that wants to use double will need to
 630  * include the #pragma OPENCL EXTENSION cl_khr_fp64 : enable directive before
 631  * any double precision data type is declared in the kernel code.
 632  */
 633 static __isl_give isl_printer *opencl_enable_double_support(
 634         __isl_take isl_printer *p)
 635 {
 636         int i;
 637
 638         p = isl_printer_start_line(p);
 639         p = isl_printer_print_str(p, "#pragma OPENCL EXTENSION cl_khr_fp64 :"
 640                 " enable");
 641         p = isl_printer_end_line(p);
 642         p = isl_printer_start_line(p);
 643         p = isl_printer_end_line(p);
 644
 645         return p;
 646 }
 647
 648 static void opencl_print_kernel(struct gpu_prog *prog,
 649         struct ppcg_kernel *kernel, struct opencl_info *opencl)
 650 {
 651         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 652         isl_ast_print_options *print_options;
 653         isl_printer *p;
 654
 655         p = isl_printer_to_file(ctx, opencl->kernel_c);
 656         print_options = isl_ast_print_options_alloc(ctx);
 657         print_options = isl_ast_print_options_set_print_user(print_options,
 658                                 &opencl_print_kernel_stmt, NULL);
 659
 660         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 661         p = opencl_print_kernel_header(p, prog, kernel);
 662         p = isl_printer_print_str(p, "{");
 663         p = isl_printer_end_line(p);
 664         p = isl_printer_indent(p, 4);
 665         p = opencl_print_kernel_iterators(p, kernel);
 666         p = opencl_print_kernel_vars(p, kernel);
 667         p = isl_printer_end_line(p);
 668         p = gpu_print_macros(p, kernel->tree);
 669         p = isl_ast_node_print(kernel->tree, p, print_options);
 670         p = isl_printer_print_str(p, "}");
 671         p = isl_printer_end_line(p);
 672         isl_printer_free(p);
 673 }
 674
 675 struct print_host_user_data_opencl {
 676         struct opencl_info *opencl;
 677         struct gpu_prog *prog;
 678 };
 679
 680 /* This function prints the i'th block size multiplied by the i'th grid size,
 681  * where i (a parameter to this function) is one of the possible dimensions of
 682  * grid sizes and block sizes.
 683  * If the dimension of block sizes is not equal to the dimension of grid sizes
 684  * the output is calculated as follows:
 685  *
 686  * Suppose that:
 687  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 688  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 689  *
 690  * The output is:
 691  * If (i > dim2) then the output is block_sizes[i]
 692  * If (i > dim1) then the output is grid_sizes[i]
 693  */
 694 static __isl_give isl_printer *opencl_print_total_number_of_work_items_for_dim(
 695         __isl_take isl_printer *p, struct ppcg_kernel *kernel, int i)
 696 {
 697         int grid_dim, block_dim;
 698
 699         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 700         block_dim = kernel->n_block;
 701
 702         isl_pw_aff *bound_grid;
 703
 704         if (i < min(grid_dim, block_dim)) {
 705                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 706                 p = isl_printer_print_str(p, "(");
 707                 p = isl_printer_print_pw_aff(p, bound_grid);
 708                 p = isl_printer_print_str(p, ") * ");
 709                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 710                 isl_pw_aff_free(bound_grid);
 711         } else if (i >= grid_dim)
 712                 p = isl_printer_print_int(p, kernel->block_dim[i]);
 713         else {
 714                 bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 715                 p = isl_printer_print_pw_aff(p, bound_grid);
 716                 isl_pw_aff_free(bound_grid);
 717         }
 718
 719         return p;
 720 }
 721
 722 /* Print a list that represents the total number of work items.  The list is
 723  * constructed by performing an element-wise multiplication of the block sizes
 724  * and the grid sizes.  To explain how the list is constructed, suppose that:
 725  * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 726  * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 727  *
 728  * The output of this function is constructed as follows:
 729  * If (dim1 > dim2) then the output is the following list:
 730  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim2-1]*block_sizes[dim2-1],
 731  * block_sizes[dim2], ..., block_sizes[dim1-2], block_sizes[dim1-1].
 732  *
 733  * If (dim2 > dim1) then the output is the following list:
 734  * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim1-1] * block_sizes[dim1-1],
 735  * grid_sizes[dim1], grid_sizes[dim2-2], grid_sizes[dim2-1].
 736  *
 737  * To calculate the total number of work items out of the list constructed by
 738  * this function, the user should multiply the elements of the list.
 739  */
 740 static __isl_give isl_printer *opencl_print_total_number_of_work_items_as_list(
 741         __isl_take isl_printer *p, struct ppcg_kernel *kernel)
 742 {
 743         int i;
 744         int grid_dim, block_dim;
 745
 746         grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 747         block_dim = kernel->n_block;
 748
 749         if ((grid_dim <= 0) || (block_dim <= 0)) {
 750                 p = isl_printer_print_str(p, "1");
 751                 return p;
 752         }
 753
 754         for (i = 0; i <= max(grid_dim, block_dim) - 1; i++) {
 755                 if (i > 0)
 756                         p = isl_printer_print_str(p, ", ");
 757
 758                 p = opencl_print_total_number_of_work_items_for_dim(p,
 759                         kernel, i);
 760         }
 761
 762         return p;
 763 }
 764
 765 /* Print the user statement of the host code to "p".
 766  *
 767  * In particular, print a block of statements that defines the grid
 768  * and the work group and then launches the kernel.
 769  *
 770  * A grid is composed of many work groups (blocks), each work group holds
 771  * many work-items (threads).
 772  *
 773  * global_work_size[kernel->n_block] represents the total number of work
 774  * items.  It points to an array of kernel->n_block unsigned
 775  * values that describe the total number of work-items that will execute
 776  * the kernel.  The total number of work-items is computed as:
 777  * global_work_size[0] *...* global_work_size[kernel->n_block - 1].
 778  *
 779  * The size of each work group (i.e. the number of work-items in each work
 780  * group) is described using block_size[kernel->n_block].  The total
 781  * number of work-items in a block (work-group) is computed as:
 782  * block_size[0] *... * block_size[kernel->n_block - 1].
 783  *
 784  * For more information check:
 785  * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 786  */
 787 static __isl_give isl_printer *opencl_print_host_user(
 788         __isl_take isl_printer *p,
 789         __isl_take isl_ast_print_options *print_options,
 790         __isl_keep isl_ast_node *node, void *user)
 791 {
 792         isl_id *id;
 793         struct ppcg_kernel *kernel;
 794         struct print_host_user_data_opencl *data;
 795         int i;
 796
 797         id = isl_ast_node_get_annotation(node);
 798         kernel = isl_id_get_user(id);
 799         isl_id_free(id);
 800
 801         data = (struct print_host_user_data_opencl *) user;
 802
 803         p = isl_printer_start_line(p);
 804         p = isl_printer_print_str(p, "{");
 805         p = isl_printer_end_line(p);
 806         p = isl_printer_indent(p, 2);
 807
 808         p = isl_printer_start_line(p);
 809         p = isl_printer_print_str(p, "size_t global_work_size[");
 810
 811         if (kernel->n_block > 0)
 812                 p = isl_printer_print_int(p, kernel->n_block);
 813         else
 814                 p = isl_printer_print_int(p, 1);
 815
 816         p = isl_printer_print_str(p, "] = {");
 817         p = opencl_print_total_number_of_work_items_as_list(p, kernel);
 818         p = isl_printer_print_str(p, "};");
 819         p = isl_printer_end_line(p);
 820
 821         p = isl_printer_start_line(p);
 822         p = isl_printer_print_str(p, "size_t block_size[");
 823
 824         if (kernel->n_block > 0)
 825                 p = isl_printer_print_int(p, kernel->n_block);
 826         else
 827                 p = isl_printer_print_int(p, 1);
 828
 829         p = isl_printer_print_str(p, "] = {");
 830         p = opencl_print_block_sizes(p, kernel);
 831         p = isl_printer_print_str(p, "};");
 832         p = isl_printer_end_line(p);
 833
 834         p = isl_printer_start_line(p);
 835         p = isl_printer_print_str(p, "cl_kernel kernel");
 836         p = isl_printer_print_int(p, kernel->id);
 837         p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel");
 838         p = isl_printer_print_int(p, kernel->id);
 839         p = isl_printer_print_str(p, "\", &err);");
 840         p = isl_printer_end_line(p);
 841         p = isl_printer_start_line(p);
 842         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 843         p = isl_printer_end_line(p);
 844
 845         opencl_set_kernel_arguments(p, data->prog, kernel);
 846
 847         p = isl_printer_start_line(p);
 848         p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel"
 849                 "(queue, kernel");
 850         p = isl_printer_print_int(p, kernel->id);
 851         p = isl_printer_print_str(p, ", ");
 852         if (kernel->n_block > 0)
 853                 p = isl_printer_print_int(p, kernel->n_block);
 854         else
 855                 p = isl_printer_print_int(p, 1);
 856
 857         p = isl_printer_print_str(p, ", NULL, global_work_size, "
 858                                         "block_size, "
 859                                         "0, NULL, NULL));");
 860         p = isl_printer_end_line(p);
 861         p = isl_printer_start_line(p);
 862         p = isl_printer_print_str(p, "openclCheckReturn("
 863                                         "clReleaseKernel(kernel");
 864         p = isl_printer_print_int(p, kernel->id);
 865         p = isl_printer_print_str(p, "));");
 866         p = isl_printer_end_line(p);
 867         p = isl_printer_start_line(p);
 868         p = isl_printer_print_str(p, "clFinish(queue);");
 869         p = isl_printer_end_line(p);
 870         p = isl_printer_indent(p, -2);
 871         p = isl_printer_start_line(p);
 872         p = isl_printer_print_str(p, "}");
 873         p = isl_printer_end_line(p);
 874
 875         p = isl_printer_start_line(p);
 876         p = isl_printer_end_line(p);
 877
 878         opencl_print_kernel(data->prog, kernel, data->opencl);
 879
 880         isl_ast_print_options_free(print_options);
 881
 882         return p;
 883 }
 884
 885 static __isl_give isl_printer *opencl_print_host_code(
 886         __isl_take isl_printer *p, struct gpu_prog *prog,
 887         __isl_keep isl_ast_node *tree, struct opencl_info *opencl)
 888 {
 889         isl_ast_print_options *print_options;
 890         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 891         struct print_host_user_data_opencl data = { opencl, prog };
 892
 893         print_options = isl_ast_print_options_alloc(ctx);
 894         print_options = isl_ast_print_options_set_print_user(print_options,
 895                                 &opencl_print_host_user, &data);
 896
 897         p = gpu_print_macros(p, tree);
 898         p = isl_ast_node_print(tree, p, print_options);
 899
 900         return p;
 901 }
 902
 903 /* Copy "array" back from the GPU to the host.
 904  */
 905 static __isl_give isl_printer *copy_array_from_device(__isl_take isl_printer *p,
 906         void *user)
 907 {
 908         struct gpu_array_info *array = user;
 909
 910         p = isl_printer_start_line(p);
 911         p = isl_printer_print_str(p, "openclCheckReturn("
 912                                         "clEnqueueReadBuffer(queue,"
 913                                         " dev_");
 914         p = isl_printer_print_str(p, array->name);
 915         p = isl_printer_print_str(p, ", CL_TRUE, 0, ");
 916         p = gpu_array_info_print_size(p, array);
 917
 918         if (gpu_array_is_scalar(array))
 919                 p = isl_printer_print_str(p, ", &");
 920         else
 921                 p = isl_printer_print_str(p, ", ");
 922         p = isl_printer_print_str(p, array->name);
 923         p = isl_printer_print_str(p, ", 0, NULL, NULL));");
 924         p = isl_printer_end_line(p);
 925
 926         return p;
 927 }
 928
 929 /* Copy copy_out arrays back from the GPU to the host.
 930  *
 931  * Only perform the copying for arrays with strictly positive size.
 932  */
 933 static __isl_give isl_printer *opencl_copy_arrays_from_device(
 934         __isl_take isl_printer *p, struct gpu_prog *prog)
 935 {
 936         int i;
 937         isl_union_set *copy_out;
 938         copy_out = isl_union_set_copy(prog->copy_out);
 939
 940         for (i = 0; i < prog->n_array; ++i) {
 941                 struct gpu_array_info *array = &prog->array[i];
 942                 isl_space *space;
 943                 isl_set *copy_out_i;
 944                 isl_set *guard;
 945                 int empty;
 946
 947                 space = isl_space_copy(array->space);
 948                 copy_out_i = isl_union_set_extract_set(copy_out, space);
 949                 empty = isl_set_fast_is_empty(copy_out_i);
 950                 isl_set_free(copy_out_i);
 951                 if (empty)
 952                         continue;
 953
 954                 guard = gpu_array_positive_size_guard(array);
 955                 p = ppcg_print_guarded(p, guard, isl_set_copy(prog->context),
 956                                             &copy_array_from_device, array);
 957         }
 958
 959         isl_union_set_free(copy_out);
 960         p = isl_printer_start_line(p);
 961         p = isl_printer_end_line(p);
 962         return p;
 963 }
 964
 965 /* Create an OpenCL device, context, command queue and build the kernel.
 966  * input is the name of the input file provided to ppcg.
 967  */
 968 static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
 969         const char *input, struct opencl_info *info)
 970 {
 971         int len;
 972
 973         p = isl_printer_start_line(p);
 974         p = isl_printer_print_str(p, "cl_device_id device;");
 975         p = isl_printer_end_line(p);
 976         p = isl_printer_start_line(p);
 977         p = isl_printer_print_str(p, "cl_context context;");
 978         p = isl_printer_end_line(p);
 979         p = isl_printer_start_line(p);
 980         p = isl_printer_print_str(p, "cl_program program;");
 981         p = isl_printer_end_line(p);
 982         p = isl_printer_start_line(p);
 983         p = isl_printer_print_str(p, "cl_command_queue queue;");
 984         p = isl_printer_end_line(p);
 985         p = isl_printer_start_line(p);
 986         p = isl_printer_print_str(p, "cl_int err;");
 987         p = isl_printer_end_line(p);
 988         p = isl_printer_start_line(p);
 989         p = isl_printer_print_str(p, "device = opencl_create_device(");
 990         p = isl_printer_print_int(p, info->options->opencl_use_gpu);
 991         p = isl_printer_print_str(p, ");");
 992         p = isl_printer_end_line(p);
 993         p = isl_printer_start_line(p);
 994         p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1, "
 995                 "&device, NULL, NULL, &err);");
 996         p = isl_printer_end_line(p);
 997         p = isl_printer_start_line(p);
 998         p = isl_printer_print_str(p, "openclCheckReturn(err);");
 999         p = isl_printer_end_line(p);
1000         p = isl_printer_start_line(p);
1001         p = isl_printer_print_str(p, "queue = clCreateCommandQueue"
1002                                         "(context, device, 0, &err);");
1003         p = isl_printer_end_line(p);
1004         p = isl_printer_start_line(p);
1005         p = isl_printer_print_str(p, "openclCheckReturn(err);");
1006         p = isl_printer_end_line(p);
1007
1008         p = isl_printer_start_line(p);
1009         p = isl_printer_print_str(p, "program = opencl_build_program("
1010                                         "context, device, \"");
1011         p = isl_printer_print_str(p, info->kernel_c_name);
1012         p = isl_printer_print_str(p, "\", \"");
1013
1014         if (info->options->opencl_compiler_options)
1015                 p = isl_printer_print_str(p,
1016                                         info->options->opencl_compiler_options);
1017
1018         p = isl_printer_print_str(p, "\");");
1019         p = isl_printer_end_line(p);
1020         p = isl_printer_start_line(p);
1021         p = isl_printer_end_line(p);
1022
1023         return p;
1024 }
1025
1026 static __isl_give isl_printer *opencl_release_cl_objects(
1027         __isl_take isl_printer *p, struct opencl_info *info)
1028 {
1029         p = isl_printer_start_line(p);
1030         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue"
1031                                         "(queue));");
1032         p = isl_printer_end_line(p);
1033         p = isl_printer_start_line(p);
1034         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram"
1035                                         "(program));");
1036         p = isl_printer_end_line(p);
1037         p = isl_printer_start_line(p);
1038         p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext"
1039                                         "(context));");
1040         p = isl_printer_end_line(p);
1041
1042         return p;
1043 }
1044
1045 /* Free the device array corresponding to "array"
1046  */
1047 static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
1048         struct gpu_array_info *array)
1049 {
1050         p = isl_printer_start_line(p);
1051         p = isl_printer_print_str(p, "openclCheckReturn("
1052                                         "clReleaseMemObject(dev_");
1053         p = isl_printer_print_str(p, array->name);
1054         p = isl_printer_print_str(p, "));");
1055         p = isl_printer_end_line(p);
1056
1057         return p;
1058 }
1059
1060 /* Free the device arrays.
1061  */
1062 static __isl_give isl_printer *opencl_release_device_arrays(
1063         __isl_take isl_printer *p, struct gpu_prog *prog)
1064 {
1065         int i, j;
1066
1067         for (i = 0; i < prog->n_array; ++i) {
1068                 struct gpu_array_info *array = &prog->array[i];
1069                 if (gpu_array_is_read_only_scalar(array))
1070                         continue;
1071
1072                 p = release_device_array(p, array);
1073         }
1074         return p;
1075 }
1076
1077 /* Given a gpu_prog "prog" and the corresponding transformed AST
1078  * "tree", print the entire OpenCL code to "p".
1079  */
1080 static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
1081         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
1082         struct gpu_types *types, void *user)
1083 {
1084         struct opencl_info *opencl = user;
1085         isl_printer *kernel;
1086
1087         kernel = isl_printer_to_file(isl_printer_get_ctx(p), opencl->kernel_c);
1088         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
1089         if (any_double_elements(prog))
1090                 kernel = opencl_enable_double_support(kernel);
1091         kernel = gpu_print_types(kernel, types, prog);
1092         isl_printer_free(kernel);
1093
1094         if (!kernel)
1095                 return isl_printer_free(p);
1096
1097         p = ppcg_start_block(p);
1098
1099         p = opencl_print_host_macros(p);
1100
1101         p = opencl_declare_device_arrays(p, prog);
1102         p = opencl_setup(p, opencl->input, opencl);
1103         p = opencl_allocate_device_arrays(p, prog);
1104
1105         p = opencl_print_host_code(p, prog, tree, opencl);
1106
1107         p = opencl_copy_arrays_from_device(p, prog);
1108         p = opencl_release_device_arrays(p, prog);
1109         p = opencl_release_cl_objects(p, opencl);
1110
1111         p = ppcg_end_block(p);
1112
1113         return p;
1114 }
1115
1116 /* Transform the code in the file called "input" by replacing
1117  * all scops by corresponding OpenCL code.
1118  * The host code is written to "output" or a name derived from
1119  * "input" if "output" is NULL.
1120  * The kernel code is placed in separate files with names
1121  * derived from "output" or "input".
1122  *
1123  * We let generate_gpu do all the hard work and then let it call
1124  * us back for printing the AST in print_cuda.
1125  *
1126  * To prepare for this printing, we first open the output files
1127  * and we close them after generate_gpu has finished.
1128  */
1129 int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
1130         const char *input, const char *output)
1131 {
1132         struct opencl_info opencl = { options, input, output };
1133         int r;
1134
1135         r = opencl_open_files(&opencl);
1136
1137         if (r >= 0)
1138                 r = generate_gpu(ctx, input, opencl.host_c, options,
1139                                 &print_opencl, &opencl);
1140
1141         opencl_close_files(&opencl);
1142
1143         return r;
1144 }