cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the MIT license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18
  19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) \\\n"
  23                 "  do { \\\n"
  24                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  25                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  26                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  27                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  28                 "      fflush(stderr); \\\n"
  29                 "    } \\\n"
  30                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  31                 "  } while(0)\n"
  32                 "#define cudaCheckKernel() \\\n"
  33                 "  do { \\\n"
  34                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  35                 "  } while(0)\n\n";
  36
  37         p = isl_printer_print_str(p, macros);
  38         return p;
  39 }
  40
  41 /* Print a declaration for the device array corresponding to "array" on "p".
  42  */
  43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
  44         struct gpu_array_info *array)
  45 {
  46         int i;
  47
  48         p = isl_printer_start_line(p);
  49         p = isl_printer_print_str(p, array->type);
  50         p = isl_printer_print_str(p, " ");
  51         if (!array->linearize && array->n_index > 1)
  52                 p = isl_printer_print_str(p, "(");
  53         p = isl_printer_print_str(p, "*dev_");
  54         p = isl_printer_print_str(p, array->name);
  55         if (!array->linearize && array->n_index > 1) {
  56                 p = isl_printer_print_str(p, ")");
  57                 for (i = 1; i < array->n_index; i++) {
  58                         p = isl_printer_print_str(p, "[");
  59                         p = isl_printer_print_pw_aff(p, array->bound[i]);
  60                         p = isl_printer_print_str(p, "]");
  61                 }
  62         }
  63         p = isl_printer_print_str(p, ";");
  64         p = isl_printer_end_line(p);
  65
  66         return p;
  67 }
  68
  69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  70         struct gpu_prog *prog)
  71 {
  72         int i;
  73
  74         for (i = 0; i < prog->n_array; ++i) {
  75                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  76                         continue;
  77
  78                 p = declare_device_array(p, &prog->array[i]);
  79         }
  80         p = isl_printer_start_line(p);
  81         p = isl_printer_end_line(p);
  82         return p;
  83 }
  84
  85 static __isl_give isl_printer *allocate_device_arrays(
  86         __isl_take isl_printer *p, struct gpu_prog *prog)
  87 {
  88         int i;
  89
  90         for (i = 0; i < prog->n_array; ++i) {
  91                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  92                         continue;
  93                 p = isl_printer_start_line(p);
  94                 p = isl_printer_print_str(p,
  95                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
  96                 p = isl_printer_print_str(p, prog->array[i].name);
  97                 p = isl_printer_print_str(p, ", ");
  98                 p = gpu_array_info_print_size(p, &prog->array[i]);
  99                 p = isl_printer_print_str(p, "));");
 100                 p = isl_printer_end_line(p);
 101         }
 102         p = isl_printer_start_line(p);
 103         p = isl_printer_end_line(p);
 104         return p;
 105 }
 106
 107 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
 108         struct gpu_prog *prog)
 109 {
 110         int i;
 111
 112         for (i = 0; i < prog->n_array; ++i) {
 113                 isl_space *dim;
 114                 isl_set *read_i;
 115                 int empty;
 116
 117                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 118                         continue;
 119
 120                 dim = isl_space_copy(prog->array[i].space);
 121                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
 122                 empty = isl_set_plain_is_empty(read_i);
 123                 isl_set_free(read_i);
 124                 if (empty)
 125                         continue;
 126
 127                 p = isl_printer_start_line(p);
 128                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 129                 p = isl_printer_print_str(p, prog->array[i].name);
 130                 p = isl_printer_print_str(p, ", ");
 131
 132                 if (gpu_array_is_scalar(&prog->array[i]))
 133                         p = isl_printer_print_str(p, "&");
 134                 p = isl_printer_print_str(p, prog->array[i].name);
 135                 p = isl_printer_print_str(p, ", ");
 136
 137                 p = gpu_array_info_print_size(p, &prog->array[i]);
 138                 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 139                 p = isl_printer_end_line(p);
 140         }
 141         p = isl_printer_start_line(p);
 142         p = isl_printer_end_line(p);
 143         return p;
 144 }
 145
 146 static void print_reverse_list(FILE *out, int len, int *list)
 147 {
 148         int i;
 149
 150         if (len == 0)
 151                 return;
 152
 153         fprintf(out, "(");
 154         for (i = 0; i < len; ++i) {
 155                 if (i)
 156                         fprintf(out, ", ");
 157                 fprintf(out, "%d", list[len - 1 - i]);
 158         }
 159         fprintf(out, ")");
 160 }
 161
 162 /* Print the effective grid size as a list of the sizes in each
 163  * dimension, from innermost to outermost.
 164  */
 165 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 166         struct ppcg_kernel *kernel)
 167 {
 168         int i;
 169         int dim;
 170
 171         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 172         if (dim == 0)
 173                 return p;
 174
 175         p = isl_printer_print_str(p, "(");
 176         for (i = dim - 1; i >= 0; --i) {
 177                 isl_pw_aff *bound;
 178
 179                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 180                 p = isl_printer_print_pw_aff(p, bound);
 181                 isl_pw_aff_free(bound);
 182
 183                 if (i > 0)
 184                         p = isl_printer_print_str(p, ", ");
 185         }
 186
 187         p = isl_printer_print_str(p, ")");
 188
 189         return p;
 190 }
 191
 192 /* Print the grid definition.
 193  */
 194 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 195         struct ppcg_kernel *kernel)
 196 {
 197         p = isl_printer_start_line(p);
 198         p = isl_printer_print_str(p, "dim3 k");
 199         p = isl_printer_print_int(p, kernel->id);
 200         p = isl_printer_print_str(p, "_dimGrid");
 201         p = print_grid_size(p, kernel);
 202         p = isl_printer_print_str(p, ";");
 203         p = isl_printer_end_line(p);
 204
 205         return p;
 206 }
 207
 208 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 209  * then print a declaration (including the types of the arguments).
 210  *
 211  * The arguments are printed in the following order
 212  * - the arrays accessed by the kernel
 213  * - the parameters
 214  * - the host loop iterators
 215  */
 216 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 217         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 218 {
 219         int i, n;
 220         int first = 1;
 221         unsigned nparam;
 222         isl_space *space;
 223         const char *type;
 224
 225         for (i = 0; i < prog->n_array; ++i) {
 226                 isl_set *arr;
 227                 int empty;
 228
 229                 space = isl_space_copy(prog->array[i].space);
 230                 arr = isl_union_set_extract_set(kernel->arrays, space);
 231                 empty = isl_set_plain_is_empty(arr);
 232                 isl_set_free(arr);
 233                 if (empty)
 234                         continue;
 235
 236                 if (!first)
 237                         p = isl_printer_print_str(p, ", ");
 238
 239                 if (types)
 240                         p = gpu_array_info_print_declaration_argument(p,
 241                                 &prog->array[i], NULL);
 242                 else
 243                         p = gpu_array_info_print_call_argument(p,
 244                                 &prog->array[i]);
 245
 246                 first = 0;
 247         }
 248
 249         space = isl_union_set_get_space(kernel->arrays);
 250         nparam = isl_space_dim(space, isl_dim_param);
 251         for (i = 0; i < nparam; ++i) {
 252                 const char *name;
 253
 254                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 255
 256                 if (!first)
 257                         p = isl_printer_print_str(p, ", ");
 258                 if (types)
 259                         p = isl_printer_print_str(p, "int ");
 260                 p = isl_printer_print_str(p, name);
 261
 262                 first = 0;
 263         }
 264         isl_space_free(space);
 265
 266         n = isl_space_dim(kernel->space, isl_dim_set);
 267         type = isl_options_get_ast_iterator_type(prog->ctx);
 268         for (i = 0; i < n; ++i) {
 269                 const char *name;
 270
 271                 if (!first)
 272                         p = isl_printer_print_str(p, ", ");
 273                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 274                 if (types) {
 275                         p = isl_printer_print_str(p, type);
 276                         p = isl_printer_print_str(p, " ");
 277                 }
 278                 p = isl_printer_print_str(p, name);
 279
 280                 first = 0;
 281         }
 282
 283         return p;
 284 }
 285
 286 /* Print the header of the given kernel.
 287  */
 288 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 289         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 290 {
 291         p = isl_printer_start_line(p);
 292         p = isl_printer_print_str(p, "__global__ void kernel");
 293         p = isl_printer_print_int(p, kernel->id);
 294         p = isl_printer_print_str(p, "(");
 295         p = print_kernel_arguments(p, prog, kernel, 1);
 296         p = isl_printer_print_str(p, ")");
 297
 298         return p;
 299 }
 300
 301 /* Print the header of the given kernel to both gen->cuda.kernel_h
 302  * and gen->cuda.kernel_c.
 303  */
 304 static void print_kernel_headers(struct gpu_prog *prog,
 305         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 306 {
 307         isl_printer *p;
 308
 309         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 310         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 311         p = print_kernel_header(p, prog, kernel);
 312         p = isl_printer_print_str(p, ";");
 313         p = isl_printer_end_line(p);
 314         isl_printer_free(p);
 315
 316         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 317         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 318         p = print_kernel_header(p, prog, kernel);
 319         p = isl_printer_end_line(p);
 320         isl_printer_free(p);
 321 }
 322
 323 static void print_indent(FILE *dst, int indent)
 324 {
 325         fprintf(dst, "%*s", indent, "");
 326 }
 327
 328 /* Print a list of iterators of type "type" with names "ids" to "out".
 329  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
 330  * In particular, the last iterator is assigned the x identifier
 331  * (the first in the list of cuda identifiers).
 332  */
 333 static void print_iterators(FILE *out, const char *type,
 334         __isl_keep isl_id_list *ids, const char *cuda_dims[])
 335 {
 336         int i, n;
 337
 338         n = isl_id_list_n_id(ids);
 339         if (n <= 0)
 340                 return;
 341         print_indent(out, 4);
 342         fprintf(out, "%s ", type);
 343         for (i = 0; i < n; ++i) {
 344                 isl_id *id;
 345
 346                 if (i)
 347                         fprintf(out, ", ");
 348                 id = isl_id_list_get_id(ids, i);
 349                 fprintf(out, "%s%d = %s", isl_id_get_name(id), i,
 350                         cuda_dims[n - 1 - i]);
 351                 isl_id_free(id);
 352         }
 353         fprintf(out, ";\n");
 354 }
 355
 356 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 357 {
 358         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 359         const char *type;
 360         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 361         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 362                                         "threadIdx.z" };
 363
 364         type = isl_options_get_ast_iterator_type(ctx);
 365
 366         print_iterators(out, type, kernel->block_ids, block_dims);
 367         print_iterators(out, type, kernel->thread_ids, thread_dims);
 368 }
 369
 370 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 371         struct ppcg_kernel_var *var)
 372 {
 373         int j;
 374
 375         p = isl_printer_start_line(p);
 376         if (var->type == ppcg_access_shared)
 377                 p = isl_printer_print_str(p, "__shared__ ");
 378         p = isl_printer_print_str(p, var->array->type);
 379         p = isl_printer_print_str(p, " ");
 380         p = isl_printer_print_str(p,  var->name);
 381         for (j = 0; j < var->array->n_index; ++j) {
 382                 isl_val *v;
 383
 384                 p = isl_printer_print_str(p, "[");
 385                 v = isl_vec_get_element_val(var->size, j);
 386                 p = isl_printer_print_val(p, v);
 387                 isl_val_free(v);
 388                 p = isl_printer_print_str(p, "]");
 389         }
 390         p = isl_printer_print_str(p, ";");
 391         p = isl_printer_end_line(p);
 392
 393         return p;
 394 }
 395
 396 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 397         struct ppcg_kernel *kernel)
 398 {
 399         int i;
 400
 401         for (i = 0; i < kernel->n_var; ++i)
 402                 p = print_kernel_var(p, &kernel->var[i]);
 403
 404         return p;
 405 }
 406
 407 /* Print a sync statement.
 408  */
 409 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 410         struct ppcg_kernel_stmt *stmt)
 411 {
 412         p = isl_printer_start_line(p);
 413         p = isl_printer_print_str(p, "__syncthreads();");
 414         p = isl_printer_end_line(p);
 415
 416         return p;
 417 }
 418
 419 /* This function is called for each user statement in the AST,
 420  * i.e., for each kernel body statement, copy statement or sync statement.
 421  */
 422 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 423         __isl_take isl_ast_print_options *print_options,
 424         __isl_keep isl_ast_node *node, void *user)
 425 {
 426         isl_id *id;
 427         struct ppcg_kernel_stmt *stmt;
 428
 429         id = isl_ast_node_get_annotation(node);
 430         stmt = isl_id_get_user(id);
 431         isl_id_free(id);
 432
 433         isl_ast_print_options_free(print_options);
 434
 435         switch (stmt->type) {
 436         case ppcg_kernel_copy:
 437                 return ppcg_kernel_print_copy(p, stmt);
 438         case ppcg_kernel_sync:
 439                 return print_sync(p, stmt);
 440         case ppcg_kernel_domain:
 441                 return ppcg_kernel_print_domain(p, stmt);
 442         }
 443
 444         return p;
 445 }
 446
 447 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 448         struct cuda_info *cuda)
 449 {
 450         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 451         isl_ast_print_options *print_options;
 452         isl_printer *p;
 453
 454         print_kernel_headers(prog, kernel, cuda);
 455         fprintf(cuda->kernel_c, "{\n");
 456         print_kernel_iterators(cuda->kernel_c, kernel);
 457
 458         p = isl_printer_to_file(ctx, cuda->kernel_c);
 459         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 460         p = isl_printer_indent(p, 4);
 461
 462         p = print_kernel_vars(p, kernel);
 463         p = isl_printer_end_line(p);
 464         p = gpu_print_macros(p, kernel->tree);
 465
 466         print_options = isl_ast_print_options_alloc(ctx);
 467         print_options = isl_ast_print_options_set_print_user(print_options,
 468                                                     &print_kernel_stmt, NULL);
 469         p = isl_ast_node_print(kernel->tree, p, print_options);
 470         isl_printer_free(p);
 471
 472         fprintf(cuda->kernel_c, "}\n");
 473 }
 474
 475 struct print_host_user_data {
 476         struct cuda_info *cuda;
 477         struct gpu_prog *prog;
 478 };
 479
 480 /* Print the user statement of the host code to "p".
 481  *
 482  * In particular, print a block of statements that defines the grid
 483  * and the block and then launches the kernel.
 484  */
 485 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 486         __isl_take isl_ast_print_options *print_options,
 487         __isl_keep isl_ast_node *node, void *user)
 488 {
 489         isl_id *id;
 490         struct ppcg_kernel *kernel;
 491         struct print_host_user_data *data;
 492
 493         id = isl_ast_node_get_annotation(node);
 494         kernel = isl_id_get_user(id);
 495         isl_id_free(id);
 496
 497         data = (struct print_host_user_data *) user;
 498
 499         p = isl_printer_start_line(p);
 500         p = isl_printer_print_str(p, "{");
 501         p = isl_printer_end_line(p);
 502         p = isl_printer_indent(p, 2);
 503
 504         p = isl_printer_start_line(p);
 505         p = isl_printer_print_str(p, "dim3 k");
 506         p = isl_printer_print_int(p, kernel->id);
 507         p = isl_printer_print_str(p, "_dimBlock");
 508         print_reverse_list(isl_printer_get_file(p),
 509                                 kernel->n_block, kernel->block_dim);
 510         p = isl_printer_print_str(p, ";");
 511         p = isl_printer_end_line(p);
 512
 513         p = print_grid(p, kernel);
 514
 515         p = isl_printer_start_line(p);
 516         p = isl_printer_print_str(p, "kernel");
 517         p = isl_printer_print_int(p, kernel->id);
 518         p = isl_printer_print_str(p, " <<<k");
 519         p = isl_printer_print_int(p, kernel->id);
 520         p = isl_printer_print_str(p, "_dimGrid, k");
 521         p = isl_printer_print_int(p, kernel->id);
 522         p = isl_printer_print_str(p, "_dimBlock>>> (");
 523         p = print_kernel_arguments(p, data->prog, kernel, 0);
 524         p = isl_printer_print_str(p, ");");
 525         p = isl_printer_end_line(p);
 526
 527         p = isl_printer_start_line(p);
 528         p = isl_printer_print_str(p, "cudaCheckKernel();");
 529         p = isl_printer_end_line(p);
 530
 531         p = isl_printer_indent(p, -2);
 532         p = isl_printer_start_line(p);
 533         p = isl_printer_print_str(p, "}");
 534         p = isl_printer_end_line(p);
 535
 536         p = isl_printer_start_line(p);
 537         p = isl_printer_end_line(p);
 538
 539         print_kernel(data->prog, kernel, data->cuda);
 540
 541         isl_ast_print_options_free(print_options);
 542
 543         return p;
 544 }
 545
 546 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 547         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 548         struct cuda_info *cuda)
 549 {
 550         isl_ast_print_options *print_options;
 551         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 552         struct print_host_user_data data = { cuda, prog };
 553
 554         print_options = isl_ast_print_options_alloc(ctx);
 555         print_options = isl_ast_print_options_set_print_user(print_options,
 556                                                 &print_host_user, &data);
 557
 558         p = gpu_print_macros(p, tree);
 559         p = isl_ast_node_print(tree, p, print_options);
 560
 561         return p;
 562 }
 563
 564 /* For each array that needs to be copied out (based on prog->copy_out),
 565  * copy the contents back from the GPU to the host.
 566  *
 567  * If any element of a given array appears in prog->copy_out, then its
 568  * entire extent is in prog->copy_out.  The bounds on this extent have
 569  * been precomputed in extract_array_info and are used in
 570  * gpu_array_info_print_size.
 571  */
 572 static __isl_give isl_printer *copy_arrays_from_device(
 573         __isl_take isl_printer *p, struct gpu_prog *prog)
 574 {
 575         int i;
 576         isl_union_set *copy_out;
 577         copy_out = isl_union_set_copy(prog->copy_out);
 578
 579         for (i = 0; i < prog->n_array; ++i) {
 580                 isl_space *dim;
 581                 isl_set *copy_out_i;
 582                 int empty;
 583
 584                 dim = isl_space_copy(prog->array[i].space);
 585                 copy_out_i = isl_union_set_extract_set(copy_out, dim);
 586                 empty = isl_set_plain_is_empty(copy_out_i);
 587                 isl_set_free(copy_out_i);
 588                 if (empty)
 589                         continue;
 590
 591                 p = isl_printer_start_line(p);
 592                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 593                 if (gpu_array_is_scalar(&prog->array[i]))
 594                         p = isl_printer_print_str(p, "&");
 595                 p = isl_printer_print_str(p, prog->array[i].name);
 596                 p = isl_printer_print_str(p, ", dev_");
 597                 p = isl_printer_print_str(p, prog->array[i].name);
 598                 p = isl_printer_print_str(p, ", ");
 599                 p = gpu_array_info_print_size(p, &prog->array[i]);
 600                 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 601                 p = isl_printer_end_line(p);
 602         }
 603
 604         isl_union_set_free(copy_out);
 605         p = isl_printer_start_line(p);
 606         p = isl_printer_end_line(p);
 607         return p;
 608 }
 609
 610 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 611         struct gpu_prog *prog)
 612 {
 613         int i;
 614
 615         for (i = 0; i < prog->n_array; ++i) {
 616                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 617                         continue;
 618                 p = isl_printer_start_line(p);
 619                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 620                 p = isl_printer_print_str(p, prog->array[i].name);
 621                 p = isl_printer_print_str(p, "));");
 622                 p = isl_printer_end_line(p);
 623         }
 624
 625         return p;
 626 }
 627
 628 /* Given a gpu_prog "prog" and the corresponding transformed AST
 629  * "tree", print the entire CUDA code to "p".
 630  * "types" collects the types for which a definition has already
 631  * been printed.
 632  */
 633 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
 634         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 635         struct gpu_types *types, void *user)
 636 {
 637         struct cuda_info *cuda = user;
 638         isl_printer *kernel;
 639
 640         kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
 641         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
 642         kernel = gpu_print_types(kernel, types, prog);
 643         isl_printer_free(kernel);
 644
 645         if (!kernel)
 646                 return isl_printer_free(p);
 647
 648         p = ppcg_start_block(p);
 649
 650         p = print_cuda_macros(p);
 651
 652         p = declare_device_arrays(p, prog);
 653         p = allocate_device_arrays(p, prog);
 654         p = copy_arrays_to_device(p, prog);
 655
 656         p = print_host_code(p, prog, tree, cuda);
 657
 658         p = copy_arrays_from_device(p, prog);
 659         p = free_device_arrays(p, prog);
 660
 661         p = ppcg_end_block(p);
 662
 663         return p;
 664 }
 665
 666 /* Transform the code in the file called "input" by replacing
 667  * all scops by corresponding CUDA code.
 668  * The names of the output files are derived from "input".
 669  *
 670  * We let generate_gpu do all the hard work and then let it call
 671  * us back for printing the AST in print_cuda.
 672  *
 673  * To prepare for this printing, we first open the output files
 674  * and we close them after generate_gpu has finished.
 675  */
 676 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 677         const char *input)
 678 {
 679         struct cuda_info cuda;
 680         int r;
 681
 682         cuda_open_files(&cuda, input);
 683
 684         r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
 685
 686         cuda_close_files(&cuda);
 687
 688         return r;
 689 }