cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the MIT license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18
  19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) \\\n"
  23                 "  do { \\\n"
  24                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  25                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  26                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  27                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  28                 "      fflush(stderr); \\\n"
  29                 "    } \\\n"
  30                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  31                 "  } while(0)\n"
  32                 "#define cudaCheckKernel() \\\n"
  33                 "  do { \\\n"
  34                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  35                 "  } while(0)\n\n";
  36
  37         p = isl_printer_print_str(p, macros);
  38         return p;
  39 }
  40
  41 /* Print a declaration for the device array corresponding to "array" on "p".
  42  */
  43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
  44         struct gpu_array_info *array)
  45 {
  46         int i;
  47
  48         p = isl_printer_start_line(p);
  49         p = isl_printer_print_str(p, array->type);
  50         p = isl_printer_print_str(p, " ");
  51         if (!array->linearize && array->n_index > 1)
  52                 p = isl_printer_print_str(p, "(");
  53         p = isl_printer_print_str(p, "*dev_");
  54         p = isl_printer_print_str(p, array->name);
  55         if (!array->linearize && array->n_index > 1) {
  56                 p = isl_printer_print_str(p, ")");
  57                 for (i = 1; i < array->n_index; i++) {
  58                         p = isl_printer_print_str(p, "[");
  59                         p = isl_printer_print_pw_aff(p, array->bound[i]);
  60                         p = isl_printer_print_str(p, "]");
  61                 }
  62         }
  63         p = isl_printer_print_str(p, ";");
  64         p = isl_printer_end_line(p);
  65
  66         return p;
  67 }
  68
  69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  70         struct gpu_prog *prog)
  71 {
  72         int i;
  73
  74         for (i = 0; i < prog->n_array; ++i) {
  75                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  76                         continue;
  77                 if (!prog->array[i].accessed)
  78                         continue;
  79
  80                 p = declare_device_array(p, &prog->array[i]);
  81         }
  82         p = isl_printer_start_line(p);
  83         p = isl_printer_end_line(p);
  84         return p;
  85 }
  86
  87 static __isl_give isl_printer *allocate_device_arrays(
  88         __isl_take isl_printer *p, struct gpu_prog *prog)
  89 {
  90         int i;
  91
  92         for (i = 0; i < prog->n_array; ++i) {
  93                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  94                         continue;
  95                 if (!prog->array[i].accessed)
  96                         continue;
  97                 p = isl_printer_start_line(p);
  98                 p = isl_printer_print_str(p,
  99                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
 100                 p = isl_printer_print_str(p, prog->array[i].name);
 101                 p = isl_printer_print_str(p, ", ");
 102                 p = gpu_array_info_print_size(p, &prog->array[i]);
 103                 p = isl_printer_print_str(p, "));");
 104                 p = isl_printer_end_line(p);
 105         }
 106         p = isl_printer_start_line(p);
 107         p = isl_printer_end_line(p);
 108         return p;
 109 }
 110
 111 /* Print code to "p" for copying "array" from the host to the device
 112  * in its entirety.  The bounds on the extent of "array" have
 113  * been precomputed in extract_array_info and are used in
 114  * gpu_array_info_print_size.
 115  */
 116 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
 117         struct gpu_array_info *array)
 118 {
 119         p = isl_printer_start_line(p);
 120         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 121         p = isl_printer_print_str(p, array->name);
 122         p = isl_printer_print_str(p, ", ");
 123
 124         if (gpu_array_is_scalar(array))
 125                 p = isl_printer_print_str(p, "&");
 126         p = isl_printer_print_str(p, array->name);
 127         p = isl_printer_print_str(p, ", ");
 128
 129         p = gpu_array_info_print_size(p, array);
 130         p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 131         p = isl_printer_end_line(p);
 132
 133         return p;
 134 }
 135
 136 /* Print code to "p" for copying "array" back from the device to the host
 137  * in its entirety.  The bounds on the extent of "array" have
 138  * been precomputed in extract_array_info and are used in
 139  * gpu_array_info_print_size.
 140  */
 141 static __isl_give isl_printer *copy_array_from_device(
 142         __isl_take isl_printer *p, struct gpu_array_info *array)
 143 {
 144         p = isl_printer_start_line(p);
 145         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 146         if (gpu_array_is_scalar(array))
 147                 p = isl_printer_print_str(p, "&");
 148         p = isl_printer_print_str(p, array->name);
 149         p = isl_printer_print_str(p, ", dev_");
 150         p = isl_printer_print_str(p, array->name);
 151         p = isl_printer_print_str(p, ", ");
 152         p = gpu_array_info_print_size(p, array);
 153         p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 154         p = isl_printer_end_line(p);
 155
 156         return p;
 157 }
 158
 159 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
 160         struct gpu_prog *prog)
 161 {
 162         int i;
 163
 164         for (i = 0; i < prog->n_array; ++i) {
 165                 isl_space *space;
 166                 isl_set *read_i;
 167                 int empty;
 168
 169                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 170                         continue;
 171
 172                 space = isl_space_copy(prog->array[i].space);
 173                 read_i = isl_union_set_extract_set(prog->copy_in, space);
 174                 empty = isl_set_plain_is_empty(read_i);
 175                 isl_set_free(read_i);
 176                 if (empty)
 177                         continue;
 178
 179                 p = copy_array_to_device(p, &prog->array[i]);
 180         }
 181         p = isl_printer_start_line(p);
 182         p = isl_printer_end_line(p);
 183         return p;
 184 }
 185
 186 static void print_reverse_list(FILE *out, int len, int *list)
 187 {
 188         int i;
 189
 190         if (len == 0)
 191                 return;
 192
 193         fprintf(out, "(");
 194         for (i = 0; i < len; ++i) {
 195                 if (i)
 196                         fprintf(out, ", ");
 197                 fprintf(out, "%d", list[len - 1 - i]);
 198         }
 199         fprintf(out, ")");
 200 }
 201
 202 /* Print the effective grid size as a list of the sizes in each
 203  * dimension, from innermost to outermost.
 204  */
 205 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 206         struct ppcg_kernel *kernel)
 207 {
 208         int i;
 209         int dim;
 210
 211         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 212         if (dim == 0)
 213                 return p;
 214
 215         p = isl_printer_print_str(p, "(");
 216         for (i = dim - 1; i >= 0; --i) {
 217                 isl_pw_aff *bound;
 218
 219                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 220                 p = isl_printer_print_pw_aff(p, bound);
 221                 isl_pw_aff_free(bound);
 222
 223                 if (i > 0)
 224                         p = isl_printer_print_str(p, ", ");
 225         }
 226
 227         p = isl_printer_print_str(p, ")");
 228
 229         return p;
 230 }
 231
 232 /* Print the grid definition.
 233  */
 234 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 235         struct ppcg_kernel *kernel)
 236 {
 237         p = isl_printer_start_line(p);
 238         p = isl_printer_print_str(p, "dim3 k");
 239         p = isl_printer_print_int(p, kernel->id);
 240         p = isl_printer_print_str(p, "_dimGrid");
 241         p = print_grid_size(p, kernel);
 242         p = isl_printer_print_str(p, ";");
 243         p = isl_printer_end_line(p);
 244
 245         return p;
 246 }
 247
 248 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 249  * then print a declaration (including the types of the arguments).
 250  *
 251  * The arguments are printed in the following order
 252  * - the arrays accessed by the kernel
 253  * - the parameters
 254  * - the host loop iterators
 255  */
 256 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 257         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 258 {
 259         int i, n;
 260         int first = 1;
 261         unsigned nparam;
 262         isl_space *space;
 263         const char *type;
 264
 265         for (i = 0; i < prog->n_array; ++i) {
 266                 isl_set *arr;
 267                 int empty;
 268
 269                 space = isl_space_copy(prog->array[i].space);
 270                 arr = isl_union_set_extract_set(kernel->arrays, space);
 271                 empty = isl_set_plain_is_empty(arr);
 272                 isl_set_free(arr);
 273                 if (empty)
 274                         continue;
 275
 276                 if (!first)
 277                         p = isl_printer_print_str(p, ", ");
 278
 279                 if (types)
 280                         p = gpu_array_info_print_declaration_argument(p,
 281                                 &prog->array[i], NULL);
 282                 else
 283                         p = gpu_array_info_print_call_argument(p,
 284                                 &prog->array[i]);
 285
 286                 first = 0;
 287         }
 288
 289         space = isl_union_set_get_space(kernel->arrays);
 290         nparam = isl_space_dim(space, isl_dim_param);
 291         for (i = 0; i < nparam; ++i) {
 292                 const char *name;
 293
 294                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 295
 296                 if (!first)
 297                         p = isl_printer_print_str(p, ", ");
 298                 if (types)
 299                         p = isl_printer_print_str(p, "int ");
 300                 p = isl_printer_print_str(p, name);
 301
 302                 first = 0;
 303         }
 304         isl_space_free(space);
 305
 306         n = isl_space_dim(kernel->space, isl_dim_set);
 307         type = isl_options_get_ast_iterator_type(prog->ctx);
 308         for (i = 0; i < n; ++i) {
 309                 const char *name;
 310
 311                 if (!first)
 312                         p = isl_printer_print_str(p, ", ");
 313                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 314                 if (types) {
 315                         p = isl_printer_print_str(p, type);
 316                         p = isl_printer_print_str(p, " ");
 317                 }
 318                 p = isl_printer_print_str(p, name);
 319
 320                 first = 0;
 321         }
 322
 323         return p;
 324 }
 325
 326 /* Print the header of the given kernel.
 327  */
 328 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 329         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 330 {
 331         p = isl_printer_start_line(p);
 332         p = isl_printer_print_str(p, "__global__ void kernel");
 333         p = isl_printer_print_int(p, kernel->id);
 334         p = isl_printer_print_str(p, "(");
 335         p = print_kernel_arguments(p, prog, kernel, 1);
 336         p = isl_printer_print_str(p, ")");
 337
 338         return p;
 339 }
 340
 341 /* Print the header of the given kernel to both gen->cuda.kernel_h
 342  * and gen->cuda.kernel_c.
 343  */
 344 static void print_kernel_headers(struct gpu_prog *prog,
 345         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 346 {
 347         isl_printer *p;
 348
 349         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 350         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 351         p = print_kernel_header(p, prog, kernel);
 352         p = isl_printer_print_str(p, ";");
 353         p = isl_printer_end_line(p);
 354         isl_printer_free(p);
 355
 356         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 357         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 358         p = print_kernel_header(p, prog, kernel);
 359         p = isl_printer_end_line(p);
 360         isl_printer_free(p);
 361 }
 362
 363 static void print_indent(FILE *dst, int indent)
 364 {
 365         fprintf(dst, "%*s", indent, "");
 366 }
 367
 368 /* Print a list of iterators of type "type" with names "ids" to "out".
 369  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
 370  * In particular, the last iterator is assigned the x identifier
 371  * (the first in the list of cuda identifiers).
 372  */
 373 static void print_iterators(FILE *out, const char *type,
 374         __isl_keep isl_id_list *ids, const char *cuda_dims[])
 375 {
 376         int i, n;
 377
 378         n = isl_id_list_n_id(ids);
 379         if (n <= 0)
 380                 return;
 381         print_indent(out, 4);
 382         fprintf(out, "%s ", type);
 383         for (i = 0; i < n; ++i) {
 384                 isl_id *id;
 385
 386                 if (i)
 387                         fprintf(out, ", ");
 388                 id = isl_id_list_get_id(ids, i);
 389                 fprintf(out, "%s = %s", isl_id_get_name(id),
 390                         cuda_dims[n - 1 - i]);
 391                 isl_id_free(id);
 392         }
 393         fprintf(out, ";\n");
 394 }
 395
 396 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 397 {
 398         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 399         const char *type;
 400         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 401         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 402                                         "threadIdx.z" };
 403
 404         type = isl_options_get_ast_iterator_type(ctx);
 405
 406         print_iterators(out, type, kernel->block_ids, block_dims);
 407         print_iterators(out, type, kernel->thread_ids, thread_dims);
 408 }
 409
 410 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 411         struct ppcg_kernel_var *var)
 412 {
 413         int j;
 414
 415         p = isl_printer_start_line(p);
 416         if (var->type == ppcg_access_shared)
 417                 p = isl_printer_print_str(p, "__shared__ ");
 418         p = isl_printer_print_str(p, var->array->type);
 419         p = isl_printer_print_str(p, " ");
 420         p = isl_printer_print_str(p,  var->name);
 421         for (j = 0; j < var->array->n_index; ++j) {
 422                 isl_val *v;
 423
 424                 p = isl_printer_print_str(p, "[");
 425                 v = isl_vec_get_element_val(var->size, j);
 426                 p = isl_printer_print_val(p, v);
 427                 isl_val_free(v);
 428                 p = isl_printer_print_str(p, "]");
 429         }
 430         p = isl_printer_print_str(p, ";");
 431         p = isl_printer_end_line(p);
 432
 433         return p;
 434 }
 435
 436 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 437         struct ppcg_kernel *kernel)
 438 {
 439         int i;
 440
 441         for (i = 0; i < kernel->n_var; ++i)
 442                 p = print_kernel_var(p, &kernel->var[i]);
 443
 444         return p;
 445 }
 446
 447 /* Print a sync statement.
 448  */
 449 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 450         struct ppcg_kernel_stmt *stmt)
 451 {
 452         p = isl_printer_start_line(p);
 453         p = isl_printer_print_str(p, "__syncthreads();");
 454         p = isl_printer_end_line(p);
 455
 456         return p;
 457 }
 458
 459 /* This function is called for each user statement in the AST,
 460  * i.e., for each kernel body statement, copy statement or sync statement.
 461  */
 462 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 463         __isl_take isl_ast_print_options *print_options,
 464         __isl_keep isl_ast_node *node, void *user)
 465 {
 466         isl_id *id;
 467         struct ppcg_kernel_stmt *stmt;
 468
 469         id = isl_ast_node_get_annotation(node);
 470         stmt = isl_id_get_user(id);
 471         isl_id_free(id);
 472
 473         isl_ast_print_options_free(print_options);
 474
 475         switch (stmt->type) {
 476         case ppcg_kernel_copy:
 477                 return ppcg_kernel_print_copy(p, stmt);
 478         case ppcg_kernel_sync:
 479                 return print_sync(p, stmt);
 480         case ppcg_kernel_domain:
 481                 return ppcg_kernel_print_domain(p, stmt);
 482         }
 483
 484         return p;
 485 }
 486
 487 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 488         struct cuda_info *cuda)
 489 {
 490         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 491         isl_ast_print_options *print_options;
 492         isl_printer *p;
 493
 494         print_kernel_headers(prog, kernel, cuda);
 495         fprintf(cuda->kernel_c, "{\n");
 496         print_kernel_iterators(cuda->kernel_c, kernel);
 497
 498         p = isl_printer_to_file(ctx, cuda->kernel_c);
 499         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 500         p = isl_printer_indent(p, 4);
 501
 502         p = print_kernel_vars(p, kernel);
 503         p = isl_printer_end_line(p);
 504         p = gpu_print_macros(p, kernel->tree);
 505
 506         print_options = isl_ast_print_options_alloc(ctx);
 507         print_options = isl_ast_print_options_set_print_user(print_options,
 508                                                     &print_kernel_stmt, NULL);
 509         p = isl_ast_node_print(kernel->tree, p, print_options);
 510         isl_printer_free(p);
 511
 512         fprintf(cuda->kernel_c, "}\n");
 513 }
 514
 515 struct print_host_user_data {
 516         struct cuda_info *cuda;
 517         struct gpu_prog *prog;
 518 };
 519
 520 /* Print the user statement of the host code to "p".
 521  *
 522  * In particular, print a block of statements that defines the grid
 523  * and the block and then launches the kernel.
 524  */
 525 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 526         __isl_take isl_ast_print_options *print_options,
 527         __isl_keep isl_ast_node *node, void *user)
 528 {
 529         isl_id *id;
 530         struct ppcg_kernel *kernel;
 531         struct print_host_user_data *data;
 532
 533         id = isl_ast_node_get_annotation(node);
 534         kernel = isl_id_get_user(id);
 535         isl_id_free(id);
 536
 537         data = (struct print_host_user_data *) user;
 538
 539         p = isl_printer_start_line(p);
 540         p = isl_printer_print_str(p, "{");
 541         p = isl_printer_end_line(p);
 542         p = isl_printer_indent(p, 2);
 543
 544         p = isl_printer_start_line(p);
 545         p = isl_printer_print_str(p, "dim3 k");
 546         p = isl_printer_print_int(p, kernel->id);
 547         p = isl_printer_print_str(p, "_dimBlock");
 548         print_reverse_list(isl_printer_get_file(p),
 549                                 kernel->n_block, kernel->block_dim);
 550         p = isl_printer_print_str(p, ";");
 551         p = isl_printer_end_line(p);
 552
 553         p = print_grid(p, kernel);
 554
 555         p = isl_printer_start_line(p);
 556         p = isl_printer_print_str(p, "kernel");
 557         p = isl_printer_print_int(p, kernel->id);
 558         p = isl_printer_print_str(p, " <<<k");
 559         p = isl_printer_print_int(p, kernel->id);
 560         p = isl_printer_print_str(p, "_dimGrid, k");
 561         p = isl_printer_print_int(p, kernel->id);
 562         p = isl_printer_print_str(p, "_dimBlock>>> (");
 563         p = print_kernel_arguments(p, data->prog, kernel, 0);
 564         p = isl_printer_print_str(p, ");");
 565         p = isl_printer_end_line(p);
 566
 567         p = isl_printer_start_line(p);
 568         p = isl_printer_print_str(p, "cudaCheckKernel();");
 569         p = isl_printer_end_line(p);
 570
 571         p = isl_printer_indent(p, -2);
 572         p = isl_printer_start_line(p);
 573         p = isl_printer_print_str(p, "}");
 574         p = isl_printer_end_line(p);
 575
 576         p = isl_printer_start_line(p);
 577         p = isl_printer_end_line(p);
 578
 579         print_kernel(data->prog, kernel, data->cuda);
 580
 581         isl_ast_print_options_free(print_options);
 582
 583         return p;
 584 }
 585
 586 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 587         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 588         struct cuda_info *cuda)
 589 {
 590         isl_ast_print_options *print_options;
 591         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 592         struct print_host_user_data data = { cuda, prog };
 593
 594         print_options = isl_ast_print_options_alloc(ctx);
 595         print_options = isl_ast_print_options_set_print_user(print_options,
 596                                                 &print_host_user, &data);
 597
 598         p = gpu_print_macros(p, tree);
 599         p = isl_ast_node_print(tree, p, print_options);
 600
 601         return p;
 602 }
 603
 604 /* For each array that needs to be copied out (based on prog->copy_out),
 605  * copy the contents back from the GPU to the host.
 606  *
 607  * If any element of a given array appears in prog->copy_out, then its
 608  * entire extent is in prog->copy_out.  The bounds on this extent have
 609  * been precomputed in extract_array_info and are used in
 610  * gpu_array_info_print_size.
 611  */
 612 static __isl_give isl_printer *copy_arrays_from_device(
 613         __isl_take isl_printer *p, struct gpu_prog *prog)
 614 {
 615         int i;
 616         isl_union_set *copy_out;
 617         copy_out = isl_union_set_copy(prog->copy_out);
 618
 619         for (i = 0; i < prog->n_array; ++i) {
 620                 isl_space *space;
 621                 isl_set *copy_out_i;
 622                 int empty;
 623
 624                 space = isl_space_copy(prog->array[i].space);
 625                 copy_out_i = isl_union_set_extract_set(copy_out, space);
 626                 empty = isl_set_plain_is_empty(copy_out_i);
 627                 isl_set_free(copy_out_i);
 628                 if (empty)
 629                         continue;
 630
 631                 p = copy_array_from_device(p, &prog->array[i]);
 632         }
 633
 634         isl_union_set_free(copy_out);
 635         p = isl_printer_start_line(p);
 636         p = isl_printer_end_line(p);
 637         return p;
 638 }
 639
 640 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 641         struct gpu_prog *prog)
 642 {
 643         int i;
 644
 645         for (i = 0; i < prog->n_array; ++i) {
 646                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 647                         continue;
 648                 if (!prog->array[i].accessed)
 649                         continue;
 650                 p = isl_printer_start_line(p);
 651                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 652                 p = isl_printer_print_str(p, prog->array[i].name);
 653                 p = isl_printer_print_str(p, "));");
 654                 p = isl_printer_end_line(p);
 655         }
 656
 657         return p;
 658 }
 659
 660 /* Given a gpu_prog "prog" and the corresponding transformed AST
 661  * "tree", print the entire CUDA code to "p".
 662  * "types" collects the types for which a definition has already
 663  * been printed.
 664  */
 665 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
 666         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 667         struct gpu_types *types, void *user)
 668 {
 669         struct cuda_info *cuda = user;
 670         isl_printer *kernel;
 671
 672         kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
 673         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
 674         kernel = gpu_print_types(kernel, types, prog);
 675         isl_printer_free(kernel);
 676
 677         if (!kernel)
 678                 return isl_printer_free(p);
 679
 680         p = ppcg_start_block(p);
 681
 682         p = print_cuda_macros(p);
 683
 684         p = declare_device_arrays(p, prog);
 685         p = allocate_device_arrays(p, prog);
 686         p = copy_arrays_to_device(p, prog);
 687
 688         p = print_host_code(p, prog, tree, cuda);
 689
 690         p = copy_arrays_from_device(p, prog);
 691         p = free_device_arrays(p, prog);
 692
 693         p = ppcg_end_block(p);
 694
 695         return p;
 696 }
 697
 698 /* Transform the code in the file called "input" by replacing
 699  * all scops by corresponding CUDA code.
 700  * The names of the output files are derived from "input".
 701  *
 702  * We let generate_gpu do all the hard work and then let it call
 703  * us back for printing the AST in print_cuda.
 704  *
 705  * To prepare for this printing, we first open the output files
 706  * and we close them after generate_gpu has finished.
 707  */
 708 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 709         const char *input)
 710 {
 711         struct cuda_info cuda;
 712         int r;
 713
 714         cuda_open_files(&cuda, input);
 715
 716         r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
 717
 718         cuda_close_files(&cuda);
 719
 720         return r;
 721 }