cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "pet_printer.h"
  17 #include "print.h"
  18 #include "schedule.h"
  19
  20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  21 {
  22         const char *macros =
  23                 "#define cudaCheckReturn(ret) \\\n"
  24                 "  do { \\\n"
  25                 "    cudaError_t e = (ret); \\\n"
  26                 "    if (e != cudaSuccess) { \\\n"
  27                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  28                 "cudaGetErrorString(e)); \\\n"
  29                 "      fflush(stderr); \\\n"
  30                 "    } \\\n"
  31                 "    assert(e == cudaSuccess); \\\n"
  32                 "  } while(0)\n"
  33                 "#define cudaCheckKernel() \\\n"
  34                 "  do { \\\n"
  35                 "    cudaError_t e = cudaGetLastError(); \\\n"
  36                 "    cudaCheckReturn(e); \\\n"
  37                 "  } while(0)\n\n";
  38
  39         p = isl_printer_print_str(p, macros);
  40         return p;
  41 }
  42
  43 static __isl_give isl_printer *print_array_size(__isl_take isl_printer *prn,
  44         struct gpu_array_info *array)
  45 {
  46         int i;
  47
  48         for (i = 0; i < array->n_index; ++i) {
  49                 prn = isl_printer_print_str(prn, "(");
  50                 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
  51                 prn = isl_printer_print_str(prn, ") * ");
  52         }
  53         prn = isl_printer_print_str(prn, "sizeof(");
  54         prn = isl_printer_print_str(prn, array->type);
  55         prn = isl_printer_print_str(prn, ")");
  56
  57         return prn;
  58 }
  59
  60 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  61         struct gpu_prog *prog)
  62 {
  63         int i;
  64
  65         for (i = 0; i < prog->n_array; ++i) {
  66                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  67                         continue;
  68                 p = isl_printer_start_line(p);
  69                 p = isl_printer_print_str(p, prog->array[i].type);
  70                 p = isl_printer_print_str(p, " *dev_");
  71                 p = isl_printer_print_str(p, prog->array[i].name);
  72                 p = isl_printer_print_str(p, ";");
  73                 p = isl_printer_end_line(p);
  74         }
  75         p = isl_printer_start_line(p);
  76         p = isl_printer_end_line(p);
  77         return p;
  78 }
  79
  80 static __isl_give isl_printer *allocate_device_arrays(
  81         __isl_take isl_printer *p, struct gpu_prog *prog)
  82 {
  83         int i;
  84
  85         for (i = 0; i < prog->n_array; ++i) {
  86                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  87                         continue;
  88                 p = isl_printer_start_line(p);
  89                 p = isl_printer_print_str(p,
  90                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
  91                 p = isl_printer_print_str(p, prog->array[i].name);
  92                 p = isl_printer_print_str(p, ", ");
  93                 p = print_array_size(p, &prog->array[i]);
  94                 p = isl_printer_print_str(p, "));");
  95                 p = isl_printer_end_line(p);
  96         }
  97         p = isl_printer_start_line(p);
  98         p = isl_printer_end_line(p);
  99         return p;
 100 }
 101
 102 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
 103         struct gpu_prog *prog)
 104 {
 105         int i;
 106
 107         for (i = 0; i < prog->n_array; ++i) {
 108                 isl_space *dim;
 109                 isl_set *read_i;
 110                 int empty;
 111
 112                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 113                         continue;
 114
 115                 dim = isl_space_copy(prog->array[i].dim);
 116                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
 117                 empty = isl_set_fast_is_empty(read_i);
 118                 isl_set_free(read_i);
 119                 if (empty)
 120                         continue;
 121
 122                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 123                 p = isl_printer_print_str(p, prog->array[i].name);
 124                 p = isl_printer_print_str(p, ", ");
 125
 126                 if (gpu_array_is_scalar(&prog->array[i]))
 127                         p = isl_printer_print_str(p, "&");
 128                 p = isl_printer_print_str(p, prog->array[i].name);
 129                 p = isl_printer_print_str(p, ", ");
 130
 131                 p = print_array_size(p, &prog->array[i]);
 132                 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 133                 p = isl_printer_end_line(p);
 134         }
 135         p = isl_printer_start_line(p);
 136         p = isl_printer_end_line(p);
 137         return p;
 138 }
 139
 140 static void print_reverse_list(FILE *out, int len, int *list)
 141 {
 142         int i;
 143
 144         if (len == 0)
 145                 return;
 146
 147         fprintf(out, "(");
 148         for (i = 0; i < len; ++i) {
 149                 if (i)
 150                         fprintf(out, ", ");
 151                 fprintf(out, "%d", list[len - 1 - i]);
 152         }
 153         fprintf(out, ")");
 154 }
 155
 156 /* Print the effective grid size as a list of the sizes in each
 157  * dimension, from innermost to outermost.
 158  */
 159 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 160         struct ppcg_kernel *kernel)
 161 {
 162         int i;
 163         int dim;
 164
 165         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 166         if (dim == 0)
 167                 return p;
 168
 169         p = isl_printer_print_str(p, "(");
 170         for (i = dim - 1; i >= 0; --i) {
 171                 isl_pw_aff *bound;
 172
 173                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 174                 p = isl_printer_print_pw_aff(p, bound);
 175                 isl_pw_aff_free(bound);
 176
 177                 if (i > 0)
 178                         p = isl_printer_print_str(p, ", ");
 179         }
 180
 181         p = isl_printer_print_str(p, ")");
 182
 183         return p;
 184 }
 185
 186 /* Print the grid definition.
 187  */
 188 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 189         struct ppcg_kernel *kernel)
 190 {
 191         p = isl_printer_start_line(p);
 192         p = isl_printer_print_str(p, "dim3 k");
 193         p = isl_printer_print_int(p, kernel->id);
 194         p = isl_printer_print_str(p, "_dimGrid");
 195         p = print_grid_size(p, kernel);
 196         p = isl_printer_print_str(p, ";");
 197         p = isl_printer_end_line(p);
 198
 199         return p;
 200 }
 201
 202 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 203  * then print a declaration (including the types of the arguments).
 204  *
 205  * The arguments are printed in the following order
 206  * - the arrays accessed by the kernel
 207  * - the parameters
 208  * - the host loop iterators
 209  */
 210 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 211         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 212 {
 213         int i, n;
 214         int first = 1;
 215         unsigned nparam;
 216         isl_space *space;
 217         const char *type;
 218
 219         for (i = 0; i < prog->n_array; ++i) {
 220                 isl_set *arr;
 221                 int empty;
 222
 223                 space = isl_space_copy(prog->array[i].dim);
 224                 arr = isl_union_set_extract_set(kernel->arrays, space);
 225                 empty = isl_set_fast_is_empty(arr);
 226                 isl_set_free(arr);
 227                 if (empty)
 228                         continue;
 229
 230                 if (!first)
 231                         p = isl_printer_print_str(p, ", ");
 232
 233                 if (types) {
 234                         p = isl_printer_print_str(p, prog->array[i].type);
 235                         p = isl_printer_print_str(p, " ");
 236                 }
 237
 238                 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
 239                         p = isl_printer_print_str(p, prog->array[i].name);
 240                 } else {
 241                         if (types)
 242                                 p = isl_printer_print_str(p, "*");
 243                         else
 244                                 p = isl_printer_print_str(p, "dev_");
 245                         p = isl_printer_print_str(p, prog->array[i].name);
 246                 }
 247
 248                 first = 0;
 249         }
 250
 251         space = isl_union_set_get_space(kernel->arrays);
 252         nparam = isl_space_dim(space, isl_dim_param);
 253         for (i = 0; i < nparam; ++i) {
 254                 const char *name;
 255
 256                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 257
 258                 if (!first)
 259                         p = isl_printer_print_str(p, ", ");
 260                 if (types)
 261                         p = isl_printer_print_str(p, "int ");
 262                 p = isl_printer_print_str(p, name);
 263
 264                 first = 0;
 265         }
 266         isl_space_free(space);
 267
 268         n = isl_space_dim(kernel->space, isl_dim_set);
 269         type = isl_options_get_ast_iterator_type(prog->ctx);
 270         for (i = 0; i < n; ++i) {
 271                 const char *name;
 272                 isl_id *id;
 273
 274                 if (!first)
 275                         p = isl_printer_print_str(p, ", ");
 276                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 277                 if (types) {
 278                         p = isl_printer_print_str(p, type);
 279                         p = isl_printer_print_str(p, " ");
 280                 }
 281                 p = isl_printer_print_str(p, name);
 282
 283                 first = 0;
 284         }
 285
 286         return p;
 287 }
 288
 289 /* Print the header of the given kernel.
 290  */
 291 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 292         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 293 {
 294         p = isl_printer_start_line(p);
 295         p = isl_printer_print_str(p, "__global__ void kernel");
 296         p = isl_printer_print_int(p, kernel->id);
 297         p = isl_printer_print_str(p, "(");
 298         p = print_kernel_arguments(p, prog, kernel, 1);
 299         p = isl_printer_print_str(p, ")");
 300
 301         return p;
 302 }
 303
 304 /* Print the header of the given kernel to both gen->cuda.kernel_h
 305  * and gen->cuda.kernel_c.
 306  */
 307 static void print_kernel_headers(struct gpu_prog *prog,
 308         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 309 {
 310         isl_printer *p;
 311
 312         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 313         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 314         p = print_kernel_header(p, prog, kernel);
 315         p = isl_printer_print_str(p, ";");
 316         p = isl_printer_end_line(p);
 317         isl_printer_free(p);
 318
 319         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 320         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 321         p = print_kernel_header(p, prog, kernel);
 322         p = isl_printer_end_line(p);
 323         isl_printer_free(p);
 324 }
 325
 326 static void print_indent(FILE *dst, int indent)
 327 {
 328         fprintf(dst, "%*s", indent, "");
 329 }
 330
 331 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 332 {
 333         int i;
 334         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 335         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 336                                         "threadIdx.z" };
 337
 338         if (kernel->n_grid > 0) {
 339                 print_indent(out, 4);
 340                 fprintf(out, "int ");
 341                 for (i = 0; i < kernel->n_grid; ++i) {
 342                         if (i)
 343                                 fprintf(out, ", ");
 344                         fprintf(out, "b%d = %s",
 345                                 i, block_dims[kernel->n_grid - 1 - i]);
 346                 }
 347                 fprintf(out, ";\n");
 348         }
 349
 350         if (kernel->n_block > 0) {
 351                 print_indent(out, 4);
 352                 fprintf(out, "int ");
 353                 for (i = 0; i < kernel->n_block; ++i) {
 354                         if (i)
 355                                 fprintf(out, ", ");
 356                         fprintf(out, "t%d = %s",
 357                                 i, thread_dims[kernel->n_block - 1 - i]);
 358                 }
 359                 fprintf(out, ";\n");
 360         }
 361 }
 362
 363 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
 364 {
 365         int j;
 366         isl_int v;
 367
 368         print_indent(out, 4);
 369         if (var->type == ppcg_access_shared)
 370                 fprintf(out, "__shared__ ");
 371         fprintf(out, "%s %s", var->array->type, var->name);
 372         isl_int_init(v);
 373         for (j = 0; j < var->array->n_index; ++j) {
 374                 fprintf(out, "[");
 375                 isl_vec_get_element(var->size, j, &v);
 376                 isl_int_print(out, v, 0);
 377                 fprintf(out, "]");
 378         }
 379         isl_int_clear(v);
 380         fprintf(out, ";\n");
 381 }
 382
 383 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
 384 {
 385         int i;
 386
 387         for (i = 0; i < kernel->n_var; ++i)
 388                 print_kernel_var(out, &kernel->var[i]);
 389 }
 390
 391 /* Print an access to the element in the private/shared memory copy
 392  * described by "stmt".  The index of the copy is recorded in
 393  * stmt->local_index as a "call" to the array.
 394  */
 395 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
 396         struct ppcg_kernel_stmt *stmt)
 397 {
 398         int i;
 399         isl_ast_expr *expr;
 400         struct gpu_array_info *array = stmt->u.c.array;
 401
 402         expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 0);
 403         p = isl_printer_print_ast_expr(p, expr);
 404         isl_ast_expr_free(expr);
 405
 406         for (i = 0; i < array->n_index; ++i) {
 407                 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 1 + i);
 408
 409                 p = isl_printer_print_str(p, "[");
 410                 p = isl_printer_print_ast_expr(p, expr);
 411                 p = isl_printer_print_str(p, "]");
 412
 413                 isl_ast_expr_free(expr);
 414         }
 415
 416         return p;
 417 }
 418
 419 /* Print an access to the element in the global memory copy
 420  * described by "stmt".  The index of the copy is recorded in
 421  * stmt->index as a "call" to the array.
 422  *
 423  * The copy in global memory has been linearized, so we need to take
 424  * the array size into account.
 425  */
 426 static __isl_give isl_printer *stmt_print_global_index(
 427         __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
 428 {
 429         int i;
 430         struct gpu_array_info *array = stmt->u.c.array;
 431         isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
 432
 433         if (gpu_array_is_scalar(array)) {
 434                 if (!array->read_only)
 435                         p = isl_printer_print_str(p, "*");
 436                 p = isl_printer_print_str(p, array->name);
 437                 return p;
 438         }
 439
 440         p = isl_printer_print_str(p, array->name);
 441         p = isl_printer_print_str(p, "[");
 442         for (i = 0; i + 1 < array->n_index; ++i)
 443                 p = isl_printer_print_str(p, "(");
 444         for (i = 0; i < array->n_index; ++i) {
 445                 isl_ast_expr *expr;
 446                 expr = isl_ast_expr_get_op_arg(stmt->u.c.index, 1 + i);
 447                 if (i) {
 448                         isl_pw_aff *bound_i;
 449                         bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 450                         p = isl_printer_print_str(p, ") * (");
 451                         p = isl_printer_print_pw_aff(p, bound_i);
 452                         p = isl_printer_print_str(p, ") + ");
 453                         isl_pw_aff_free(bound_i);
 454                 }
 455                 p = isl_printer_print_ast_expr(p, expr);
 456                 isl_ast_expr_free(expr);
 457         }
 458         p = isl_printer_print_str(p, "]");
 459
 460         return p;
 461 }
 462
 463 /* Print a copy statement.
 464  *
 465  * A read copy statement is printed as
 466  *
 467  *      local = global;
 468  *
 469  * while a write copy statement is printed as
 470  *
 471  *      global = local;
 472  */
 473 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
 474         struct ppcg_kernel_stmt *stmt)
 475 {
 476         p = isl_printer_start_line(p);
 477         if (stmt->u.c.read) {
 478                 p = stmt_print_local_index(p, stmt);
 479                 p = isl_printer_print_str(p, " = ");
 480                 p = stmt_print_global_index(p, stmt);
 481         } else {
 482                 p = stmt_print_global_index(p, stmt);
 483                 p = isl_printer_print_str(p, " = ");
 484                 p = stmt_print_local_index(p, stmt);
 485         }
 486         p = isl_printer_print_str(p, ";");
 487         p = isl_printer_end_line(p);
 488
 489         return p;
 490 }
 491
 492 /* Print a sync statement.
 493  */
 494 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 495         struct ppcg_kernel_stmt *stmt)
 496 {
 497         p = isl_printer_start_line(p);
 498         p = isl_printer_print_str(p, "__syncthreads();");
 499         p = isl_printer_end_line(p);
 500
 501         return p;
 502 }
 503
 504 /* Print an access based on the information in "access".
 505  * If this an access to global memory, then the index expression
 506  * is linearized.
 507  *
 508  * If access->array is NULL, then we are
 509  * accessing an iterator in the original program.
 510  */
 511 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
 512         struct ppcg_kernel_access *access)
 513 {
 514         int i;
 515         unsigned n_index;
 516         struct gpu_array_info *array;
 517         isl_pw_aff_list *bound;
 518
 519         array = access->array;
 520         bound = array ? access->local_array->bound : NULL;
 521         if (!array)
 522                 p = isl_printer_print_str(p, "(");
 523         else {
 524                 if (access->type == ppcg_access_global &&
 525                     gpu_array_is_scalar(array) && !array->read_only)
 526                         p = isl_printer_print_str(p, "*");
 527                 p = isl_printer_print_str(p, access->local_name);
 528                 if (gpu_array_is_scalar(array))
 529                         return p;
 530                 p = isl_printer_print_str(p, "[");
 531         }
 532
 533         n_index = isl_ast_expr_list_n_ast_expr(access->index);
 534         if (access->type == ppcg_access_global)
 535                 for (i = 0; i + 1 < n_index; ++i)
 536                         p = isl_printer_print_str(p, "(");
 537
 538         for (i = 0; i < n_index; ++i) {
 539                 isl_ast_expr *index;
 540
 541                 index = isl_ast_expr_list_get_ast_expr(access->index, i);
 542                 if (array && i) {
 543                         if (access->type == ppcg_access_global) {
 544                                 isl_pw_aff *bound_i;
 545                                 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 546                                 p = isl_printer_print_str(p, ") * (");
 547                                 p = isl_printer_print_pw_aff(p, bound_i);
 548                                 p = isl_printer_print_str(p, ") + ");
 549                                 isl_pw_aff_free(bound_i);
 550                         } else
 551                                 p = isl_printer_print_str(p, "][");
 552                 }
 553                 p = isl_printer_print_ast_expr(p, index);
 554                 isl_ast_expr_free(index);
 555         }
 556         if (!array)
 557                 p = isl_printer_print_str(p, ")");
 558         else
 559                 p = isl_printer_print_str(p, "]");
 560
 561         return p;
 562 }
 563
 564 struct cuda_access_print_info {
 565         int i;
 566         struct ppcg_kernel_stmt *stmt;
 567 };
 568
 569 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
 570  * with the pet printer. This means that whenever the pet printer prints a
 571  * pet access expression we have the corresponding cuda access available and can
 572  * print the modified access.
 573  */
 574 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
 575         struct pet_expr *expr, void *usr)
 576 {
 577         struct cuda_access_print_info *info =
 578                 (struct cuda_access_print_info *) usr;
 579
 580         p = print_access(p, &info->stmt->u.d.access[info->i]);
 581         info->i++;
 582
 583         return p;
 584 }
 585
 586 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
 587         struct ppcg_kernel_stmt *stmt)
 588 {
 589         struct cuda_access_print_info info;
 590
 591         info.i = 0;
 592         info.stmt = stmt;
 593
 594         p = isl_printer_start_line(p);
 595         p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
 596         p = isl_printer_print_str(p, ";");
 597         p = isl_printer_end_line(p);
 598
 599         return p;
 600 }
 601
 602 /* This function is called for each user statement in the AST,
 603  * i.e., for each kernel body statement, copy statement or sync statement.
 604  */
 605 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 606         __isl_take isl_ast_print_options *print_options,
 607         __isl_keep isl_ast_node *node, void *user)
 608 {
 609         isl_id *id;
 610         struct ppcg_kernel_stmt *stmt;
 611
 612         id = isl_ast_node_get_annotation(node);
 613         stmt = isl_id_get_user(id);
 614         isl_id_free(id);
 615
 616         isl_ast_print_options_free(print_options);
 617
 618         switch (stmt->type) {
 619         case ppcg_kernel_copy:
 620                 return print_copy(p, stmt);
 621         case ppcg_kernel_sync:
 622                 return print_sync(p, stmt);
 623         case ppcg_kernel_domain:
 624                 return print_stmt_body(p, stmt);
 625         }
 626
 627         return p;
 628 }
 629
 630 static int print_macro(enum isl_ast_op_type type, void *user)
 631 {
 632         isl_printer **p = user;
 633
 634         if (type == isl_ast_op_fdiv_q)
 635                 return 0;
 636
 637         *p = isl_ast_op_type_print_macro(type, *p);
 638
 639         return 0;
 640 }
 641
 642 /* Print the required macros for "node", including one for floord.
 643  * We always print a macro for floord as it may also appear in the statements.
 644  */
 645 static __isl_give isl_printer *print_macros(
 646         __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
 647 {
 648         p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
 649         if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
 650                 return isl_printer_free(p);
 651         return p;
 652 }
 653
 654 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 655         struct cuda_info *cuda)
 656 {
 657         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 658         isl_ast_print_options *print_options;
 659         isl_printer *p;
 660
 661         print_kernel_headers(prog, kernel, cuda);
 662         fprintf(cuda->kernel_c, "{\n");
 663         print_kernel_iterators(cuda->kernel_c, kernel);
 664         print_kernel_vars(cuda->kernel_c, kernel);
 665         fprintf(cuda->kernel_c, "\n");
 666
 667         print_options = isl_ast_print_options_alloc(ctx);
 668         print_options = isl_ast_print_options_set_print_user(print_options,
 669                                                     &print_kernel_stmt, NULL);
 670
 671         p = isl_printer_to_file(ctx, cuda->kernel_c);
 672         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 673         p = isl_printer_indent(p, 4);
 674         p = print_macros(kernel->tree, p);
 675         p = isl_ast_node_print(kernel->tree, p, print_options);
 676         isl_printer_free(p);
 677
 678         fprintf(cuda->kernel_c, "}\n");
 679 }
 680
 681 struct print_host_user_data {
 682         struct cuda_info *cuda;
 683         struct gpu_prog *prog;
 684 };
 685
 686 /* Print the user statement of the host code to "p".
 687  *
 688  * In particular, print a block of statements that defines the grid
 689  * and the block and then launches the kernel.
 690  */
 691 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 692         __isl_take isl_ast_print_options *print_options,
 693         __isl_keep isl_ast_node *node, void *user)
 694 {
 695         isl_id *id;
 696         struct ppcg_kernel *kernel;
 697         struct print_host_user_data *data;
 698
 699         id = isl_ast_node_get_annotation(node);
 700         kernel = isl_id_get_user(id);
 701         isl_id_free(id);
 702
 703         data = (struct print_host_user_data *) user;
 704
 705         p = isl_printer_start_line(p);
 706         p = isl_printer_print_str(p, "{");
 707         p = isl_printer_end_line(p);
 708         p = isl_printer_indent(p, 2);
 709
 710         p = isl_printer_start_line(p);
 711         p = isl_printer_print_str(p, "dim3 k");
 712         p = isl_printer_print_int(p, kernel->id);
 713         p = isl_printer_print_str(p, "_dimBlock");
 714         print_reverse_list(isl_printer_get_file(p),
 715                                 kernel->n_block, kernel->block_dim);
 716         p = isl_printer_print_str(p, ";");
 717         p = isl_printer_end_line(p);
 718
 719         p = print_grid(p, kernel);
 720
 721         p = isl_printer_start_line(p);
 722         p = isl_printer_print_str(p, "kernel");
 723         p = isl_printer_print_int(p, kernel->id);
 724         p = isl_printer_print_str(p, " <<<k");
 725         p = isl_printer_print_int(p, kernel->id);
 726         p = isl_printer_print_str(p, "_dimGrid, k");
 727         p = isl_printer_print_int(p, kernel->id);
 728         p = isl_printer_print_str(p, "_dimBlock>>> (");
 729         p = print_kernel_arguments(p, data->prog, kernel, 0);
 730         p = isl_printer_print_str(p, ");");
 731         p = isl_printer_end_line(p);
 732
 733         p = isl_printer_start_line(p);
 734         p = isl_printer_print_str(p, "cudaCheckKernel();");
 735         p = isl_printer_end_line(p);
 736
 737         p = isl_printer_indent(p, -2);
 738         p = isl_printer_start_line(p);
 739         p = isl_printer_print_str(p, "}");
 740         p = isl_printer_end_line(p);
 741
 742         p = isl_printer_start_line(p);
 743         p = isl_printer_end_line(p);
 744
 745         print_kernel(data->prog, kernel, data->cuda);
 746
 747         isl_ast_print_options_free(print_options);
 748
 749         return p;
 750 }
 751
 752 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 753         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 754         struct cuda_info *cuda)
 755 {
 756         isl_ast_print_options *print_options;
 757         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 758         struct print_host_user_data data = { cuda, prog };
 759
 760         print_options = isl_ast_print_options_alloc(ctx);
 761         print_options = isl_ast_print_options_set_print_user(print_options,
 762                                                 &print_host_user, &data);
 763
 764         p = print_macros(tree, p);
 765         p = isl_ast_node_print(tree, p, print_options);
 766
 767         return p;
 768 }
 769
 770 /* For each array that is written anywhere in the gpu_prog,
 771  * copy the contents back from the GPU to the host.
 772  *
 773  * Arrays that are not visible outside the corresponding scop
 774  * do not need to be copied back.
 775  */
 776 static __isl_give isl_printer *copy_arrays_from_device(
 777         __isl_take isl_printer *p, struct gpu_prog *prog)
 778 {
 779         int i;
 780         isl_union_set *write;
 781         write = isl_union_map_range(isl_union_map_copy(prog->write));
 782
 783         for (i = 0; i < prog->n_array; ++i) {
 784                 isl_space *dim;
 785                 isl_set *write_i;
 786                 int empty;
 787
 788                 if (prog->array[i].local)
 789                         continue;
 790
 791                 dim = isl_space_copy(prog->array[i].dim);
 792                 write_i = isl_union_set_extract_set(write, dim);
 793                 empty = isl_set_fast_is_empty(write_i);
 794                 isl_set_free(write_i);
 795                 if (empty)
 796                         continue;
 797
 798                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 799                 if (gpu_array_is_scalar(&prog->array[i]))
 800                         p = isl_printer_print_str(p, "&");
 801                 p = isl_printer_print_str(p, prog->array[i].name);
 802                 p = isl_printer_print_str(p, ", dev_");
 803                 p = isl_printer_print_str(p, prog->array[i].name);
 804                 p = isl_printer_print_str(p, ", ");
 805                 p = print_array_size(p, &prog->array[i]);
 806                 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 807                 p = isl_printer_end_line(p);
 808         }
 809
 810         isl_union_set_free(write);
 811         p = isl_printer_start_line(p);
 812         p = isl_printer_end_line(p);
 813         return p;
 814 }
 815
 816 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 817         struct gpu_prog *prog)
 818 {
 819         int i;
 820
 821         for (i = 0; i < prog->n_array; ++i) {
 822                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 823                         continue;
 824                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 825                 p = isl_printer_print_str(p, prog->array[i].name);
 826                 p = isl_printer_print_str(p, "));");
 827                 p = isl_printer_end_line(p);
 828         }
 829
 830         return p;
 831 }
 832
 833 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
 834         struct ppcg_options *options, const char *input)
 835 {
 836         struct cuda_info cuda;
 837         struct gpu_prog *prog;
 838         isl_ast_node *tree;
 839         isl_printer *p;
 840
 841         if (!scop)
 842                 return -1;
 843
 844         scop->context = add_context_from_str(scop->context, options->ctx);
 845
 846         prog = gpu_prog_alloc(ctx, scop);
 847
 848         tree = generate_gpu(ctx, prog, options);
 849
 850         cuda_open_files(&cuda, input);
 851
 852         p = isl_printer_to_file(ctx, cuda.host_c);
 853         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 854         p = ppcg_print_exposed_declarations(p, scop);
 855         p = ppcg_start_block(p);
 856
 857         p = print_cuda_macros(p);
 858
 859         p = declare_device_arrays(p, prog);
 860         p = allocate_device_arrays(p, prog);
 861         p = copy_arrays_to_device(p, prog);
 862
 863         p = print_host_code(p, prog, tree, &cuda);
 864         isl_ast_node_free(tree);
 865
 866         p = copy_arrays_from_device(p, prog);
 867         p = free_device_arrays(p, prog);
 868
 869         p = ppcg_end_block(p);
 870         isl_printer_free(p);
 871
 872         cuda_close_files(&cuda);
 873
 874         gpu_prog_free(prog);
 875
 876         return 0;
 877 }