cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the MIT license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18
  19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) \\\n"
  23                 "  do { \\\n"
  24                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  25                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  26                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  27                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  28                 "      fflush(stderr); \\\n"
  29                 "    } \\\n"
  30                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  31                 "  } while(0)\n"
  32                 "#define cudaCheckKernel() \\\n"
  33                 "  do { \\\n"
  34                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  35                 "  } while(0)\n\n";
  36
  37         p = isl_printer_print_str(p, macros);
  38         return p;
  39 }
  40
  41 /* Print a declaration for the device array corresponding to "array" on "p".
  42  */
  43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
  44         struct gpu_array_info *array)
  45 {
  46         int i;
  47
  48         p = isl_printer_start_line(p);
  49         p = isl_printer_print_str(p, array->type);
  50         p = isl_printer_print_str(p, " ");
  51         if (!array->linearize && array->n_index > 1)
  52                 p = isl_printer_print_str(p, "(");
  53         p = isl_printer_print_str(p, "*dev_");
  54         p = isl_printer_print_str(p, array->name);
  55         if (!array->linearize && array->n_index > 1) {
  56                 p = isl_printer_print_str(p, ")");
  57                 for (i = 1; i < array->n_index; i++) {
  58                         p = isl_printer_print_str(p, "[");
  59                         p = isl_printer_print_pw_aff(p, array->bound[i]);
  60                         p = isl_printer_print_str(p, "]");
  61                 }
  62         }
  63         p = isl_printer_print_str(p, ";");
  64         p = isl_printer_end_line(p);
  65
  66         return p;
  67 }
  68
  69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  70         struct gpu_prog *prog)
  71 {
  72         int i;
  73
  74         for (i = 0; i < prog->n_array; ++i) {
  75                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  76                         continue;
  77                 if (!prog->array[i].accessed)
  78                         continue;
  79
  80                 p = declare_device_array(p, &prog->array[i]);
  81         }
  82         p = isl_printer_start_line(p);
  83         p = isl_printer_end_line(p);
  84         return p;
  85 }
  86
  87 static __isl_give isl_printer *allocate_device_arrays(
  88         __isl_take isl_printer *p, struct gpu_prog *prog)
  89 {
  90         int i;
  91
  92         for (i = 0; i < prog->n_array; ++i) {
  93                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  94                         continue;
  95                 if (!prog->array[i].accessed)
  96                         continue;
  97                 p = isl_printer_start_line(p);
  98                 p = isl_printer_print_str(p,
  99                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
 100                 p = isl_printer_print_str(p, prog->array[i].name);
 101                 p = isl_printer_print_str(p, ", ");
 102                 p = gpu_array_info_print_size(p, &prog->array[i]);
 103                 p = isl_printer_print_str(p, "));");
 104                 p = isl_printer_end_line(p);
 105         }
 106         p = isl_printer_start_line(p);
 107         p = isl_printer_end_line(p);
 108         return p;
 109 }
 110
 111 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
 112         struct gpu_prog *prog)
 113 {
 114         int i;
 115
 116         for (i = 0; i < prog->n_array; ++i) {
 117                 isl_space *space;
 118                 isl_set *read_i;
 119                 int empty;
 120
 121                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 122                         continue;
 123
 124                 space = isl_space_copy(prog->array[i].space);
 125                 read_i = isl_union_set_extract_set(prog->copy_in, space);
 126                 empty = isl_set_plain_is_empty(read_i);
 127                 isl_set_free(read_i);
 128                 if (empty)
 129                         continue;
 130
 131                 p = isl_printer_start_line(p);
 132                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 133                 p = isl_printer_print_str(p, prog->array[i].name);
 134                 p = isl_printer_print_str(p, ", ");
 135
 136                 if (gpu_array_is_scalar(&prog->array[i]))
 137                         p = isl_printer_print_str(p, "&");
 138                 p = isl_printer_print_str(p, prog->array[i].name);
 139                 p = isl_printer_print_str(p, ", ");
 140
 141                 p = gpu_array_info_print_size(p, &prog->array[i]);
 142                 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 143                 p = isl_printer_end_line(p);
 144         }
 145         p = isl_printer_start_line(p);
 146         p = isl_printer_end_line(p);
 147         return p;
 148 }
 149
 150 static void print_reverse_list(FILE *out, int len, int *list)
 151 {
 152         int i;
 153
 154         if (len == 0)
 155                 return;
 156
 157         fprintf(out, "(");
 158         for (i = 0; i < len; ++i) {
 159                 if (i)
 160                         fprintf(out, ", ");
 161                 fprintf(out, "%d", list[len - 1 - i]);
 162         }
 163         fprintf(out, ")");
 164 }
 165
 166 /* Print the effective grid size as a list of the sizes in each
 167  * dimension, from innermost to outermost.
 168  */
 169 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 170         struct ppcg_kernel *kernel)
 171 {
 172         int i;
 173         int dim;
 174
 175         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 176         if (dim == 0)
 177                 return p;
 178
 179         p = isl_printer_print_str(p, "(");
 180         for (i = dim - 1; i >= 0; --i) {
 181                 isl_pw_aff *bound;
 182
 183                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 184                 p = isl_printer_print_pw_aff(p, bound);
 185                 isl_pw_aff_free(bound);
 186
 187                 if (i > 0)
 188                         p = isl_printer_print_str(p, ", ");
 189         }
 190
 191         p = isl_printer_print_str(p, ")");
 192
 193         return p;
 194 }
 195
 196 /* Print the grid definition.
 197  */
 198 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 199         struct ppcg_kernel *kernel)
 200 {
 201         p = isl_printer_start_line(p);
 202         p = isl_printer_print_str(p, "dim3 k");
 203         p = isl_printer_print_int(p, kernel->id);
 204         p = isl_printer_print_str(p, "_dimGrid");
 205         p = print_grid_size(p, kernel);
 206         p = isl_printer_print_str(p, ";");
 207         p = isl_printer_end_line(p);
 208
 209         return p;
 210 }
 211
 212 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 213  * then print a declaration (including the types of the arguments).
 214  *
 215  * The arguments are printed in the following order
 216  * - the arrays accessed by the kernel
 217  * - the parameters
 218  * - the host loop iterators
 219  */
 220 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 221         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 222 {
 223         int i, n;
 224         int first = 1;
 225         unsigned nparam;
 226         isl_space *space;
 227         const char *type;
 228
 229         for (i = 0; i < prog->n_array; ++i) {
 230                 isl_set *arr;
 231                 int empty;
 232
 233                 space = isl_space_copy(prog->array[i].space);
 234                 arr = isl_union_set_extract_set(kernel->arrays, space);
 235                 empty = isl_set_plain_is_empty(arr);
 236                 isl_set_free(arr);
 237                 if (empty)
 238                         continue;
 239
 240                 if (!first)
 241                         p = isl_printer_print_str(p, ", ");
 242
 243                 if (types)
 244                         p = gpu_array_info_print_declaration_argument(p,
 245                                 &prog->array[i], NULL);
 246                 else
 247                         p = gpu_array_info_print_call_argument(p,
 248                                 &prog->array[i]);
 249
 250                 first = 0;
 251         }
 252
 253         space = isl_union_set_get_space(kernel->arrays);
 254         nparam = isl_space_dim(space, isl_dim_param);
 255         for (i = 0; i < nparam; ++i) {
 256                 const char *name;
 257
 258                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 259
 260                 if (!first)
 261                         p = isl_printer_print_str(p, ", ");
 262                 if (types)
 263                         p = isl_printer_print_str(p, "int ");
 264                 p = isl_printer_print_str(p, name);
 265
 266                 first = 0;
 267         }
 268         isl_space_free(space);
 269
 270         n = isl_space_dim(kernel->space, isl_dim_set);
 271         type = isl_options_get_ast_iterator_type(prog->ctx);
 272         for (i = 0; i < n; ++i) {
 273                 const char *name;
 274
 275                 if (!first)
 276                         p = isl_printer_print_str(p, ", ");
 277                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 278                 if (types) {
 279                         p = isl_printer_print_str(p, type);
 280                         p = isl_printer_print_str(p, " ");
 281                 }
 282                 p = isl_printer_print_str(p, name);
 283
 284                 first = 0;
 285         }
 286
 287         return p;
 288 }
 289
 290 /* Print the header of the given kernel.
 291  */
 292 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 293         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 294 {
 295         p = isl_printer_start_line(p);
 296         p = isl_printer_print_str(p, "__global__ void kernel");
 297         p = isl_printer_print_int(p, kernel->id);
 298         p = isl_printer_print_str(p, "(");
 299         p = print_kernel_arguments(p, prog, kernel, 1);
 300         p = isl_printer_print_str(p, ")");
 301
 302         return p;
 303 }
 304
 305 /* Print the header of the given kernel to both gen->cuda.kernel_h
 306  * and gen->cuda.kernel_c.
 307  */
 308 static void print_kernel_headers(struct gpu_prog *prog,
 309         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 310 {
 311         isl_printer *p;
 312
 313         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 314         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 315         p = print_kernel_header(p, prog, kernel);
 316         p = isl_printer_print_str(p, ";");
 317         p = isl_printer_end_line(p);
 318         isl_printer_free(p);
 319
 320         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 321         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 322         p = print_kernel_header(p, prog, kernel);
 323         p = isl_printer_end_line(p);
 324         isl_printer_free(p);
 325 }
 326
 327 static void print_indent(FILE *dst, int indent)
 328 {
 329         fprintf(dst, "%*s", indent, "");
 330 }
 331
 332 /* Print a list of iterators of type "type" with names "ids" to "out".
 333  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
 334  * In particular, the last iterator is assigned the x identifier
 335  * (the first in the list of cuda identifiers).
 336  */
 337 static void print_iterators(FILE *out, const char *type,
 338         __isl_keep isl_id_list *ids, const char *cuda_dims[])
 339 {
 340         int i, n;
 341
 342         n = isl_id_list_n_id(ids);
 343         if (n <= 0)
 344                 return;
 345         print_indent(out, 4);
 346         fprintf(out, "%s ", type);
 347         for (i = 0; i < n; ++i) {
 348                 isl_id *id;
 349
 350                 if (i)
 351                         fprintf(out, ", ");
 352                 id = isl_id_list_get_id(ids, i);
 353                 fprintf(out, "%s = %s", isl_id_get_name(id),
 354                         cuda_dims[n - 1 - i]);
 355                 isl_id_free(id);
 356         }
 357         fprintf(out, ";\n");
 358 }
 359
 360 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 361 {
 362         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 363         const char *type;
 364         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 365         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 366                                         "threadIdx.z" };
 367
 368         type = isl_options_get_ast_iterator_type(ctx);
 369
 370         print_iterators(out, type, kernel->block_ids, block_dims);
 371         print_iterators(out, type, kernel->thread_ids, thread_dims);
 372 }
 373
 374 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 375         struct ppcg_kernel_var *var)
 376 {
 377         int j;
 378
 379         p = isl_printer_start_line(p);
 380         if (var->type == ppcg_access_shared)
 381                 p = isl_printer_print_str(p, "__shared__ ");
 382         p = isl_printer_print_str(p, var->array->type);
 383         p = isl_printer_print_str(p, " ");
 384         p = isl_printer_print_str(p,  var->name);
 385         for (j = 0; j < var->array->n_index; ++j) {
 386                 isl_val *v;
 387
 388                 p = isl_printer_print_str(p, "[");
 389                 v = isl_vec_get_element_val(var->size, j);
 390                 p = isl_printer_print_val(p, v);
 391                 isl_val_free(v);
 392                 p = isl_printer_print_str(p, "]");
 393         }
 394         p = isl_printer_print_str(p, ";");
 395         p = isl_printer_end_line(p);
 396
 397         return p;
 398 }
 399
 400 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 401         struct ppcg_kernel *kernel)
 402 {
 403         int i;
 404
 405         for (i = 0; i < kernel->n_var; ++i)
 406                 p = print_kernel_var(p, &kernel->var[i]);
 407
 408         return p;
 409 }
 410
 411 /* Print a sync statement.
 412  */
 413 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 414         struct ppcg_kernel_stmt *stmt)
 415 {
 416         p = isl_printer_start_line(p);
 417         p = isl_printer_print_str(p, "__syncthreads();");
 418         p = isl_printer_end_line(p);
 419
 420         return p;
 421 }
 422
 423 /* This function is called for each user statement in the AST,
 424  * i.e., for each kernel body statement, copy statement or sync statement.
 425  */
 426 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 427         __isl_take isl_ast_print_options *print_options,
 428         __isl_keep isl_ast_node *node, void *user)
 429 {
 430         isl_id *id;
 431         struct ppcg_kernel_stmt *stmt;
 432
 433         id = isl_ast_node_get_annotation(node);
 434         stmt = isl_id_get_user(id);
 435         isl_id_free(id);
 436
 437         isl_ast_print_options_free(print_options);
 438
 439         switch (stmt->type) {
 440         case ppcg_kernel_copy:
 441                 return ppcg_kernel_print_copy(p, stmt);
 442         case ppcg_kernel_sync:
 443                 return print_sync(p, stmt);
 444         case ppcg_kernel_domain:
 445                 return ppcg_kernel_print_domain(p, stmt);
 446         }
 447
 448         return p;
 449 }
 450
 451 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 452         struct cuda_info *cuda)
 453 {
 454         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 455         isl_ast_print_options *print_options;
 456         isl_printer *p;
 457
 458         print_kernel_headers(prog, kernel, cuda);
 459         fprintf(cuda->kernel_c, "{\n");
 460         print_kernel_iterators(cuda->kernel_c, kernel);
 461
 462         p = isl_printer_to_file(ctx, cuda->kernel_c);
 463         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 464         p = isl_printer_indent(p, 4);
 465
 466         p = print_kernel_vars(p, kernel);
 467         p = isl_printer_end_line(p);
 468         p = gpu_print_macros(p, kernel->tree);
 469
 470         print_options = isl_ast_print_options_alloc(ctx);
 471         print_options = isl_ast_print_options_set_print_user(print_options,
 472                                                     &print_kernel_stmt, NULL);
 473         p = isl_ast_node_print(kernel->tree, p, print_options);
 474         isl_printer_free(p);
 475
 476         fprintf(cuda->kernel_c, "}\n");
 477 }
 478
 479 struct print_host_user_data {
 480         struct cuda_info *cuda;
 481         struct gpu_prog *prog;
 482 };
 483
 484 /* Print the user statement of the host code to "p".
 485  *
 486  * In particular, print a block of statements that defines the grid
 487  * and the block and then launches the kernel.
 488  */
 489 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 490         __isl_take isl_ast_print_options *print_options,
 491         __isl_keep isl_ast_node *node, void *user)
 492 {
 493         isl_id *id;
 494         struct ppcg_kernel *kernel;
 495         struct print_host_user_data *data;
 496
 497         id = isl_ast_node_get_annotation(node);
 498         kernel = isl_id_get_user(id);
 499         isl_id_free(id);
 500
 501         data = (struct print_host_user_data *) user;
 502
 503         p = isl_printer_start_line(p);
 504         p = isl_printer_print_str(p, "{");
 505         p = isl_printer_end_line(p);
 506         p = isl_printer_indent(p, 2);
 507
 508         p = isl_printer_start_line(p);
 509         p = isl_printer_print_str(p, "dim3 k");
 510         p = isl_printer_print_int(p, kernel->id);
 511         p = isl_printer_print_str(p, "_dimBlock");
 512         print_reverse_list(isl_printer_get_file(p),
 513                                 kernel->n_block, kernel->block_dim);
 514         p = isl_printer_print_str(p, ";");
 515         p = isl_printer_end_line(p);
 516
 517         p = print_grid(p, kernel);
 518
 519         p = isl_printer_start_line(p);
 520         p = isl_printer_print_str(p, "kernel");
 521         p = isl_printer_print_int(p, kernel->id);
 522         p = isl_printer_print_str(p, " <<<k");
 523         p = isl_printer_print_int(p, kernel->id);
 524         p = isl_printer_print_str(p, "_dimGrid, k");
 525         p = isl_printer_print_int(p, kernel->id);
 526         p = isl_printer_print_str(p, "_dimBlock>>> (");
 527         p = print_kernel_arguments(p, data->prog, kernel, 0);
 528         p = isl_printer_print_str(p, ");");
 529         p = isl_printer_end_line(p);
 530
 531         p = isl_printer_start_line(p);
 532         p = isl_printer_print_str(p, "cudaCheckKernel();");
 533         p = isl_printer_end_line(p);
 534
 535         p = isl_printer_indent(p, -2);
 536         p = isl_printer_start_line(p);
 537         p = isl_printer_print_str(p, "}");
 538         p = isl_printer_end_line(p);
 539
 540         p = isl_printer_start_line(p);
 541         p = isl_printer_end_line(p);
 542
 543         print_kernel(data->prog, kernel, data->cuda);
 544
 545         isl_ast_print_options_free(print_options);
 546
 547         return p;
 548 }
 549
 550 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 551         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 552         struct cuda_info *cuda)
 553 {
 554         isl_ast_print_options *print_options;
 555         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 556         struct print_host_user_data data = { cuda, prog };
 557
 558         print_options = isl_ast_print_options_alloc(ctx);
 559         print_options = isl_ast_print_options_set_print_user(print_options,
 560                                                 &print_host_user, &data);
 561
 562         p = gpu_print_macros(p, tree);
 563         p = isl_ast_node_print(tree, p, print_options);
 564
 565         return p;
 566 }
 567
 568 /* For each array that needs to be copied out (based on prog->copy_out),
 569  * copy the contents back from the GPU to the host.
 570  *
 571  * If any element of a given array appears in prog->copy_out, then its
 572  * entire extent is in prog->copy_out.  The bounds on this extent have
 573  * been precomputed in extract_array_info and are used in
 574  * gpu_array_info_print_size.
 575  */
 576 static __isl_give isl_printer *copy_arrays_from_device(
 577         __isl_take isl_printer *p, struct gpu_prog *prog)
 578 {
 579         int i;
 580         isl_union_set *copy_out;
 581         copy_out = isl_union_set_copy(prog->copy_out);
 582
 583         for (i = 0; i < prog->n_array; ++i) {
 584                 isl_space *space;
 585                 isl_set *copy_out_i;
 586                 int empty;
 587
 588                 space = isl_space_copy(prog->array[i].space);
 589                 copy_out_i = isl_union_set_extract_set(copy_out, space);
 590                 empty = isl_set_plain_is_empty(copy_out_i);
 591                 isl_set_free(copy_out_i);
 592                 if (empty)
 593                         continue;
 594
 595                 p = isl_printer_start_line(p);
 596                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 597                 if (gpu_array_is_scalar(&prog->array[i]))
 598                         p = isl_printer_print_str(p, "&");
 599                 p = isl_printer_print_str(p, prog->array[i].name);
 600                 p = isl_printer_print_str(p, ", dev_");
 601                 p = isl_printer_print_str(p, prog->array[i].name);
 602                 p = isl_printer_print_str(p, ", ");
 603                 p = gpu_array_info_print_size(p, &prog->array[i]);
 604                 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 605                 p = isl_printer_end_line(p);
 606         }
 607
 608         isl_union_set_free(copy_out);
 609         p = isl_printer_start_line(p);
 610         p = isl_printer_end_line(p);
 611         return p;
 612 }
 613
 614 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 615         struct gpu_prog *prog)
 616 {
 617         int i;
 618
 619         for (i = 0; i < prog->n_array; ++i) {
 620                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 621                         continue;
 622                 if (!prog->array[i].accessed)
 623                         continue;
 624                 p = isl_printer_start_line(p);
 625                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 626                 p = isl_printer_print_str(p, prog->array[i].name);
 627                 p = isl_printer_print_str(p, "));");
 628                 p = isl_printer_end_line(p);
 629         }
 630
 631         return p;
 632 }
 633
 634 /* Given a gpu_prog "prog" and the corresponding transformed AST
 635  * "tree", print the entire CUDA code to "p".
 636  * "types" collects the types for which a definition has already
 637  * been printed.
 638  */
 639 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
 640         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 641         struct gpu_types *types, void *user)
 642 {
 643         struct cuda_info *cuda = user;
 644         isl_printer *kernel;
 645
 646         kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
 647         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
 648         kernel = gpu_print_types(kernel, types, prog);
 649         isl_printer_free(kernel);
 650
 651         if (!kernel)
 652                 return isl_printer_free(p);
 653
 654         p = ppcg_start_block(p);
 655
 656         p = print_cuda_macros(p);
 657
 658         p = declare_device_arrays(p, prog);
 659         p = allocate_device_arrays(p, prog);
 660         p = copy_arrays_to_device(p, prog);
 661
 662         p = print_host_code(p, prog, tree, cuda);
 663
 664         p = copy_arrays_from_device(p, prog);
 665         p = free_device_arrays(p, prog);
 666
 667         p = ppcg_end_block(p);
 668
 669         return p;
 670 }
 671
 672 /* Transform the code in the file called "input" by replacing
 673  * all scops by corresponding CUDA code.
 674  * The names of the output files are derived from "input".
 675  *
 676  * We let generate_gpu do all the hard work and then let it call
 677  * us back for printing the AST in print_cuda.
 678  *
 679  * To prepare for this printing, we first open the output files
 680  * and we close them after generate_gpu has finished.
 681  */
 682 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 683         const char *input)
 684 {
 685         struct cuda_info cuda;
 686         int r;
 687
 688         cuda_open_files(&cuda, input);
 689
 690         r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
 691
 692         cuda_close_files(&cuda);
 693
 694         return r;
 695 }