cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the MIT license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18 #include "util.h"
  19
  20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  21 {
  22         const char *macros =
  23                 "#define cudaCheckReturn(ret) \\\n"
  24                 "  do { \\\n"
  25                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  26                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  27                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  28                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  29                 "      fflush(stderr); \\\n"
  30                 "    } \\\n"
  31                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  32                 "  } while(0)\n"
  33                 "#define cudaCheckKernel() \\\n"
  34                 "  do { \\\n"
  35                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  36                 "  } while(0)\n\n";
  37
  38         p = isl_printer_print_str(p, macros);
  39         return p;
  40 }
  41
  42 /* Print a declaration for the device array corresponding to "array" on "p".
  43  */
  44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
  45         struct gpu_array_info *array)
  46 {
  47         int i;
  48
  49         p = isl_printer_start_line(p);
  50         p = isl_printer_print_str(p, array->type);
  51         p = isl_printer_print_str(p, " ");
  52         if (!array->linearize && array->n_index > 1)
  53                 p = isl_printer_print_str(p, "(");
  54         p = isl_printer_print_str(p, "*dev_");
  55         p = isl_printer_print_str(p, array->name);
  56         if (!array->linearize && array->n_index > 1) {
  57                 p = isl_printer_print_str(p, ")");
  58                 for (i = 1; i < array->n_index; i++) {
  59                         p = isl_printer_print_str(p, "[");
  60                         p = isl_printer_print_pw_aff(p, array->bound[i]);
  61                         p = isl_printer_print_str(p, "]");
  62                 }
  63         }
  64         p = isl_printer_print_str(p, ";");
  65         p = isl_printer_end_line(p);
  66
  67         return p;
  68 }
  69
  70 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  71         struct gpu_prog *prog)
  72 {
  73         int i;
  74
  75         for (i = 0; i < prog->n_array; ++i) {
  76                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
  77                         continue;
  78
  79                 p = declare_device_array(p, &prog->array[i]);
  80         }
  81         p = isl_printer_start_line(p);
  82         p = isl_printer_end_line(p);
  83         return p;
  84 }
  85
  86 static __isl_give isl_printer *allocate_device_arrays(
  87         __isl_take isl_printer *p, struct gpu_prog *prog)
  88 {
  89         int i;
  90
  91         for (i = 0; i < prog->n_array; ++i) {
  92                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
  93                         continue;
  94                 p = isl_printer_start_line(p);
  95                 p = isl_printer_print_str(p,
  96                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
  97                 p = isl_printer_print_str(p, prog->array[i].name);
  98                 p = isl_printer_print_str(p, ", ");
  99                 p = gpu_array_info_print_size(p, &prog->array[i]);
 100                 p = isl_printer_print_str(p, "));");
 101                 p = isl_printer_end_line(p);
 102         }
 103         p = isl_printer_start_line(p);
 104         p = isl_printer_end_line(p);
 105         return p;
 106 }
 107
 108 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 109         struct gpu_prog *prog)
 110 {
 111         int i;
 112
 113         for (i = 0; i < prog->n_array; ++i) {
 114                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
 115                         continue;
 116                 p = isl_printer_start_line(p);
 117                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 118                 p = isl_printer_print_str(p, prog->array[i].name);
 119                 p = isl_printer_print_str(p, "));");
 120                 p = isl_printer_end_line(p);
 121         }
 122
 123         return p;
 124 }
 125
 126 /* Print code to "p" for copying "array" from the host to the device
 127  * in its entirety.  The bounds on the extent of "array" have
 128  * been precomputed in extract_array_info and are used in
 129  * gpu_array_info_print_size.
 130  */
 131 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
 132         struct gpu_array_info *array)
 133 {
 134         p = isl_printer_start_line(p);
 135         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 136         p = isl_printer_print_str(p, array->name);
 137         p = isl_printer_print_str(p, ", ");
 138
 139         if (gpu_array_is_scalar(array))
 140                 p = isl_printer_print_str(p, "&");
 141         p = isl_printer_print_str(p, array->name);
 142         p = isl_printer_print_str(p, ", ");
 143
 144         p = gpu_array_info_print_size(p, array);
 145         p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 146         p = isl_printer_end_line(p);
 147
 148         return p;
 149 }
 150
 151 /* Print code to "p" for copying "array" back from the device to the host
 152  * in its entirety.  The bounds on the extent of "array" have
 153  * been precomputed in extract_array_info and are used in
 154  * gpu_array_info_print_size.
 155  */
 156 static __isl_give isl_printer *copy_array_from_device(
 157         __isl_take isl_printer *p, struct gpu_array_info *array)
 158 {
 159         p = isl_printer_start_line(p);
 160         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 161         if (gpu_array_is_scalar(array))
 162                 p = isl_printer_print_str(p, "&");
 163         p = isl_printer_print_str(p, array->name);
 164         p = isl_printer_print_str(p, ", dev_");
 165         p = isl_printer_print_str(p, array->name);
 166         p = isl_printer_print_str(p, ", ");
 167         p = gpu_array_info_print_size(p, array);
 168         p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 169         p = isl_printer_end_line(p);
 170
 171         return p;
 172 }
 173
 174 static void print_reverse_list(FILE *out, int len, int *list)
 175 {
 176         int i;
 177
 178         if (!out || len == 0)
 179                 return;
 180
 181         fprintf(out, "(");
 182         for (i = 0; i < len; ++i) {
 183                 if (i)
 184                         fprintf(out, ", ");
 185                 fprintf(out, "%d", list[len - 1 - i]);
 186         }
 187         fprintf(out, ")");
 188 }
 189
 190 /* Print the effective grid size as a list of the sizes in each
 191  * dimension, from innermost to outermost.
 192  */
 193 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 194         struct ppcg_kernel *kernel)
 195 {
 196         int i;
 197         int dim;
 198
 199         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 200         if (dim == 0)
 201                 return p;
 202
 203         p = isl_printer_print_str(p, "(");
 204         for (i = dim - 1; i >= 0; --i) {
 205                 isl_pw_aff *bound;
 206
 207                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 208                 p = isl_printer_print_pw_aff(p, bound);
 209                 isl_pw_aff_free(bound);
 210
 211                 if (i > 0)
 212                         p = isl_printer_print_str(p, ", ");
 213         }
 214
 215         p = isl_printer_print_str(p, ")");
 216
 217         return p;
 218 }
 219
 220 /* Print the grid definition.
 221  */
 222 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 223         struct ppcg_kernel *kernel)
 224 {
 225         p = isl_printer_start_line(p);
 226         p = isl_printer_print_str(p, "dim3 k");
 227         p = isl_printer_print_int(p, kernel->id);
 228         p = isl_printer_print_str(p, "_dimGrid");
 229         p = print_grid_size(p, kernel);
 230         p = isl_printer_print_str(p, ";");
 231         p = isl_printer_end_line(p);
 232
 233         return p;
 234 }
 235
 236 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 237  * then print a declaration (including the types of the arguments).
 238  *
 239  * The arguments are printed in the following order
 240  * - the arrays accessed by the kernel
 241  * - the parameters
 242  * - the host loop iterators
 243  */
 244 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 245         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 246 {
 247         int i, n;
 248         int first = 1;
 249         unsigned nparam;
 250         isl_space *space;
 251         const char *type;
 252
 253         for (i = 0; i < prog->n_array; ++i) {
 254                 int required;
 255
 256                 required = ppcg_kernel_requires_array_argument(kernel, i);
 257                 if (required < 0)
 258                         return isl_printer_free(p);
 259                 if (!required)
 260                         continue;
 261
 262                 if (!first)
 263                         p = isl_printer_print_str(p, ", ");
 264
 265                 if (types)
 266                         p = gpu_array_info_print_declaration_argument(p,
 267                                 &prog->array[i], NULL);
 268                 else
 269                         p = gpu_array_info_print_call_argument(p,
 270                                 &prog->array[i]);
 271
 272                 first = 0;
 273         }
 274
 275         space = isl_union_set_get_space(kernel->arrays);
 276         nparam = isl_space_dim(space, isl_dim_param);
 277         for (i = 0; i < nparam; ++i) {
 278                 const char *name;
 279
 280                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 281
 282                 if (!first)
 283                         p = isl_printer_print_str(p, ", ");
 284                 if (types)
 285                         p = isl_printer_print_str(p, "int ");
 286                 p = isl_printer_print_str(p, name);
 287
 288                 first = 0;
 289         }
 290         isl_space_free(space);
 291
 292         n = isl_space_dim(kernel->space, isl_dim_set);
 293         type = isl_options_get_ast_iterator_type(prog->ctx);
 294         for (i = 0; i < n; ++i) {
 295                 const char *name;
 296
 297                 if (!first)
 298                         p = isl_printer_print_str(p, ", ");
 299                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 300                 if (types) {
 301                         p = isl_printer_print_str(p, type);
 302                         p = isl_printer_print_str(p, " ");
 303                 }
 304                 p = isl_printer_print_str(p, name);
 305
 306                 first = 0;
 307         }
 308
 309         return p;
 310 }
 311
 312 /* Print the header of the given kernel.
 313  */
 314 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 315         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 316 {
 317         p = isl_printer_start_line(p);
 318         p = isl_printer_print_str(p, "__global__ void kernel");
 319         p = isl_printer_print_int(p, kernel->id);
 320         p = isl_printer_print_str(p, "(");
 321         p = print_kernel_arguments(p, prog, kernel, 1);
 322         p = isl_printer_print_str(p, ")");
 323
 324         return p;
 325 }
 326
 327 /* Print the header of the given kernel to both gen->cuda.kernel_h
 328  * and gen->cuda.kernel_c.
 329  */
 330 static void print_kernel_headers(struct gpu_prog *prog,
 331         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 332 {
 333         isl_printer *p;
 334
 335         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 336         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 337         p = print_kernel_header(p, prog, kernel);
 338         p = isl_printer_print_str(p, ";");
 339         p = isl_printer_end_line(p);
 340         isl_printer_free(p);
 341
 342         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 343         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 344         p = print_kernel_header(p, prog, kernel);
 345         p = isl_printer_end_line(p);
 346         isl_printer_free(p);
 347 }
 348
 349 static void print_indent(FILE *dst, int indent)
 350 {
 351         fprintf(dst, "%*s", indent, "");
 352 }
 353
 354 /* Print a list of iterators of type "type" with names "ids" to "out".
 355  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
 356  * In particular, the last iterator is assigned the x identifier
 357  * (the first in the list of cuda identifiers).
 358  */
 359 static void print_iterators(FILE *out, const char *type,
 360         __isl_keep isl_id_list *ids, const char *cuda_dims[])
 361 {
 362         int i, n;
 363
 364         n = isl_id_list_n_id(ids);
 365         if (n <= 0)
 366                 return;
 367         print_indent(out, 4);
 368         fprintf(out, "%s ", type);
 369         for (i = 0; i < n; ++i) {
 370                 isl_id *id;
 371
 372                 if (i)
 373                         fprintf(out, ", ");
 374                 id = isl_id_list_get_id(ids, i);
 375                 fprintf(out, "%s = %s", isl_id_get_name(id),
 376                         cuda_dims[n - 1 - i]);
 377                 isl_id_free(id);
 378         }
 379         fprintf(out, ";\n");
 380 }
 381
 382 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 383 {
 384         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 385         const char *type;
 386         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 387         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 388                                         "threadIdx.z" };
 389
 390         type = isl_options_get_ast_iterator_type(ctx);
 391
 392         print_iterators(out, type, kernel->block_ids, block_dims);
 393         print_iterators(out, type, kernel->thread_ids, thread_dims);
 394 }
 395
 396 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 397         struct ppcg_kernel_var *var)
 398 {
 399         int j;
 400
 401         p = isl_printer_start_line(p);
 402         if (var->type == ppcg_access_shared)
 403                 p = isl_printer_print_str(p, "__shared__ ");
 404         p = isl_printer_print_str(p, var->array->type);
 405         p = isl_printer_print_str(p, " ");
 406         p = isl_printer_print_str(p,  var->name);
 407         for (j = 0; j < var->array->n_index; ++j) {
 408                 isl_val *v;
 409
 410                 p = isl_printer_print_str(p, "[");
 411                 v = isl_vec_get_element_val(var->size, j);
 412                 p = isl_printer_print_val(p, v);
 413                 isl_val_free(v);
 414                 p = isl_printer_print_str(p, "]");
 415         }
 416         p = isl_printer_print_str(p, ";");
 417         p = isl_printer_end_line(p);
 418
 419         return p;
 420 }
 421
 422 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 423         struct ppcg_kernel *kernel)
 424 {
 425         int i;
 426
 427         for (i = 0; i < kernel->n_var; ++i)
 428                 p = print_kernel_var(p, &kernel->var[i]);
 429
 430         return p;
 431 }
 432
 433 /* Print a sync statement.
 434  */
 435 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 436         struct ppcg_kernel_stmt *stmt)
 437 {
 438         p = isl_printer_start_line(p);
 439         p = isl_printer_print_str(p, "__syncthreads();");
 440         p = isl_printer_end_line(p);
 441
 442         return p;
 443 }
 444
 445 /* This function is called for each user statement in the AST,
 446  * i.e., for each kernel body statement, copy statement or sync statement.
 447  */
 448 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 449         __isl_take isl_ast_print_options *print_options,
 450         __isl_keep isl_ast_node *node, void *user)
 451 {
 452         isl_id *id;
 453         struct ppcg_kernel_stmt *stmt;
 454
 455         id = isl_ast_node_get_annotation(node);
 456         stmt = isl_id_get_user(id);
 457         isl_id_free(id);
 458
 459         isl_ast_print_options_free(print_options);
 460
 461         switch (stmt->type) {
 462         case ppcg_kernel_copy:
 463                 return ppcg_kernel_print_copy(p, stmt);
 464         case ppcg_kernel_sync:
 465                 return print_sync(p, stmt);
 466         case ppcg_kernel_domain:
 467                 return ppcg_kernel_print_domain(p, stmt);
 468         }
 469
 470         return p;
 471 }
 472
 473 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 474         struct cuda_info *cuda)
 475 {
 476         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 477         isl_ast_print_options *print_options;
 478         isl_printer *p;
 479
 480         print_kernel_headers(prog, kernel, cuda);
 481         fprintf(cuda->kernel_c, "{\n");
 482         print_kernel_iterators(cuda->kernel_c, kernel);
 483
 484         p = isl_printer_to_file(ctx, cuda->kernel_c);
 485         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 486         p = isl_printer_indent(p, 4);
 487
 488         p = print_kernel_vars(p, kernel);
 489         p = isl_printer_end_line(p);
 490         p = ppcg_set_macro_names(p);
 491         p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
 492         p = ppcg_print_macros(p, kernel->tree);
 493
 494         print_options = isl_ast_print_options_alloc(ctx);
 495         print_options = isl_ast_print_options_set_print_user(print_options,
 496                                                     &print_kernel_stmt, NULL);
 497         p = isl_ast_node_print(kernel->tree, p, print_options);
 498         isl_printer_free(p);
 499
 500         fprintf(cuda->kernel_c, "}\n");
 501 }
 502
 503 /* Print code for initializing the device for execution of the transformed
 504  * code.  This includes declaring locally defined variables as well as
 505  * declaring and allocating the required copies of arrays on the device.
 506  */
 507 static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
 508         struct gpu_prog *prog)
 509 {
 510         p = print_cuda_macros(p);
 511
 512         p = gpu_print_local_declarations(p, prog);
 513         p = declare_device_arrays(p, prog);
 514         p = allocate_device_arrays(p, prog);
 515
 516         return p;
 517 }
 518
 519 /* Print code for clearing the device after execution of the transformed code.
 520  * In particular, free the memory that was allocated on the device.
 521  */
 522 static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
 523         struct gpu_prog *prog)
 524 {
 525         p = free_device_arrays(p, prog);
 526
 527         return p;
 528 }
 529
 530 /* Print a statement for copying an array to or from the device,
 531  * or for initializing or clearing the device.
 532  * The statement identifier of a copying node is called
 533  * "to_device_<array name>" or "from_device_<array name>" and
 534  * its user pointer points to the gpu_array_info of the array
 535  * that needs to be copied.
 536  * The node for initializing the device is called "init_device".
 537  * The node for clearing the device is called "clear_device".
 538  *
 539  * Extract the array (if any) from the identifier and call
 540  * init_device, clear_device, copy_array_to_device or copy_array_from_device.
 541  */
 542 static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
 543         __isl_keep isl_ast_node *node, struct gpu_prog *prog)
 544 {
 545         isl_ast_expr *expr, *arg;
 546         isl_id *id;
 547         const char *name;
 548         struct gpu_array_info *array;
 549
 550         expr = isl_ast_node_user_get_expr(node);
 551         arg = isl_ast_expr_get_op_arg(expr, 0);
 552         id = isl_ast_expr_get_id(arg);
 553         name = isl_id_get_name(id);
 554         array = isl_id_get_user(id);
 555         isl_id_free(id);
 556         isl_ast_expr_free(arg);
 557         isl_ast_expr_free(expr);
 558
 559         if (!name)
 560                 return isl_printer_free(p);
 561         if (!strcmp(name, "init_device"))
 562                 return init_device(p, prog);
 563         if (!strcmp(name, "clear_device"))
 564                 return clear_device(p, prog);
 565         if (!array)
 566                 return isl_printer_free(p);
 567
 568         if (!prefixcmp(name, "to_device"))
 569                 return copy_array_to_device(p, array);
 570         else
 571                 return copy_array_from_device(p, array);
 572 }
 573
 574 struct print_host_user_data {
 575         struct cuda_info *cuda;
 576         struct gpu_prog *prog;
 577 };
 578
 579 /* Print the user statement of the host code to "p".
 580  *
 581  * The host code may contain original user statements, kernel launches,
 582  * statements that copy data to/from the device and statements
 583  * the initialize or clear the device.
 584  * The original user statements and the kernel launches have
 585  * an associated annotation, while the other statements do not.
 586  * The latter are handled by print_device_node.
 587  * The annotation on the user statements is called "user".
 588  *
 589  * In case of a kernel launch, print a block of statements that
 590  * defines the grid and the block and then launches the kernel.
 591  */
 592 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 593         __isl_take isl_ast_print_options *print_options,
 594         __isl_keep isl_ast_node *node, void *user)
 595 {
 596         isl_id *id;
 597         int is_user;
 598         struct ppcg_kernel *kernel;
 599         struct ppcg_kernel_stmt *stmt;
 600         struct print_host_user_data *data;
 601
 602         isl_ast_print_options_free(print_options);
 603
 604         data = (struct print_host_user_data *) user;
 605
 606         id = isl_ast_node_get_annotation(node);
 607         if (!id)
 608                 return print_device_node(p, node, data->prog);
 609
 610         is_user = !strcmp(isl_id_get_name(id), "user");
 611         kernel = is_user ? NULL : isl_id_get_user(id);
 612         stmt = is_user ? isl_id_get_user(id) : NULL;
 613         isl_id_free(id);
 614
 615         if (is_user)
 616                 return ppcg_kernel_print_domain(p, stmt);
 617
 618         p = isl_printer_start_line(p);
 619         p = isl_printer_print_str(p, "{");
 620         p = isl_printer_end_line(p);
 621         p = isl_printer_indent(p, 2);
 622
 623         p = isl_printer_start_line(p);
 624         p = isl_printer_print_str(p, "dim3 k");
 625         p = isl_printer_print_int(p, kernel->id);
 626         p = isl_printer_print_str(p, "_dimBlock");
 627         print_reverse_list(isl_printer_get_file(p),
 628                                 kernel->n_block, kernel->block_dim);
 629         p = isl_printer_print_str(p, ";");
 630         p = isl_printer_end_line(p);
 631
 632         p = print_grid(p, kernel);
 633
 634         p = isl_printer_start_line(p);
 635         p = isl_printer_print_str(p, "kernel");
 636         p = isl_printer_print_int(p, kernel->id);
 637         p = isl_printer_print_str(p, " <<<k");
 638         p = isl_printer_print_int(p, kernel->id);
 639         p = isl_printer_print_str(p, "_dimGrid, k");
 640         p = isl_printer_print_int(p, kernel->id);
 641         p = isl_printer_print_str(p, "_dimBlock>>> (");
 642         p = print_kernel_arguments(p, data->prog, kernel, 0);
 643         p = isl_printer_print_str(p, ");");
 644         p = isl_printer_end_line(p);
 645
 646         p = isl_printer_start_line(p);
 647         p = isl_printer_print_str(p, "cudaCheckKernel();");
 648         p = isl_printer_end_line(p);
 649
 650         p = isl_printer_indent(p, -2);
 651         p = isl_printer_start_line(p);
 652         p = isl_printer_print_str(p, "}");
 653         p = isl_printer_end_line(p);
 654
 655         p = isl_printer_start_line(p);
 656         p = isl_printer_end_line(p);
 657
 658         print_kernel(data->prog, kernel, data->cuda);
 659
 660         return p;
 661 }
 662
 663 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 664         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 665         struct cuda_info *cuda)
 666 {
 667         isl_ast_print_options *print_options;
 668         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 669         struct print_host_user_data data = { cuda, prog };
 670
 671         print_options = isl_ast_print_options_alloc(ctx);
 672         print_options = isl_ast_print_options_set_print_user(print_options,
 673                                                 &print_host_user, &data);
 674
 675         p = ppcg_print_macros(p, tree);
 676         p = isl_ast_node_print(tree, p, print_options);
 677
 678         return p;
 679 }
 680
 681 /* Given a gpu_prog "prog" and the corresponding transformed AST
 682  * "tree", print the entire CUDA code to "p".
 683  * "types" collects the types for which a definition has already
 684  * been printed.
 685  */
 686 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
 687         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 688         struct gpu_types *types, void *user)
 689 {
 690         struct cuda_info *cuda = user;
 691         isl_printer *kernel;
 692
 693         kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
 694         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
 695         kernel = gpu_print_types(kernel, types, prog);
 696         isl_printer_free(kernel);
 697
 698         if (!kernel)
 699                 return isl_printer_free(p);
 700
 701         p = print_host_code(p, prog, tree, cuda);
 702
 703         return p;
 704 }
 705
 706 /* Transform the code in the file called "input" by replacing
 707  * all scops by corresponding CUDA code.
 708  * The names of the output files are derived from "input".
 709  *
 710  * We let generate_gpu do all the hard work and then let it call
 711  * us back for printing the AST in print_cuda.
 712  *
 713  * To prepare for this printing, we first open the output files
 714  * and we close them after generate_gpu has finished.
 715  */
 716 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 717         const char *input)
 718 {
 719         struct cuda_info cuda;
 720         int r;
 721
 722         cuda_open_files(&cuda, input);
 723
 724         r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
 725
 726         cuda_close_files(&cuda);
 727
 728         return r;
 729 }