cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "pet_printer.h"
  17 #include "print.h"
  18 #include "schedule.h"
  19
  20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  21 {
  22         const char *macros =
  23                 "#define cudaCheckReturn(ret) \\\n"
  24                 "  do { \\\n"
  25                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  26                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  27                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  28                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  29                 "      fflush(stderr); \\\n"
  30                 "    } \\\n"
  31                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  32                 "  } while(0)\n"
  33                 "#define cudaCheckKernel() \\\n"
  34                 "  do { \\\n"
  35                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  36                 "  } while(0)\n\n";
  37
  38         p = isl_printer_print_str(p, macros);
  39         return p;
  40 }
  41
  42 static __isl_give isl_printer *print_array_size(__isl_take isl_printer *prn,
  43         struct gpu_array_info *array)
  44 {
  45         int i;
  46
  47         for (i = 0; i < array->n_index; ++i) {
  48                 prn = isl_printer_print_str(prn, "(");
  49                 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
  50                 prn = isl_printer_print_str(prn, ") * ");
  51         }
  52         prn = isl_printer_print_str(prn, "sizeof(");
  53         prn = isl_printer_print_str(prn, array->type);
  54         prn = isl_printer_print_str(prn, ")");
  55
  56         return prn;
  57 }
  58
  59 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  60         struct gpu_prog *prog)
  61 {
  62         int i;
  63
  64         for (i = 0; i < prog->n_array; ++i) {
  65                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  66                         continue;
  67                 p = isl_printer_start_line(p);
  68                 p = isl_printer_print_str(p, prog->array[i].type);
  69                 p = isl_printer_print_str(p, " *dev_");
  70                 p = isl_printer_print_str(p, prog->array[i].name);
  71                 p = isl_printer_print_str(p, ";");
  72                 p = isl_printer_end_line(p);
  73         }
  74         p = isl_printer_start_line(p);
  75         p = isl_printer_end_line(p);
  76         return p;
  77 }
  78
  79 static __isl_give isl_printer *allocate_device_arrays(
  80         __isl_take isl_printer *p, struct gpu_prog *prog)
  81 {
  82         int i;
  83
  84         for (i = 0; i < prog->n_array; ++i) {
  85                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  86                         continue;
  87                 p = isl_printer_start_line(p);
  88                 p = isl_printer_print_str(p,
  89                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
  90                 p = isl_printer_print_str(p, prog->array[i].name);
  91                 p = isl_printer_print_str(p, ", ");
  92                 p = print_array_size(p, &prog->array[i]);
  93                 p = isl_printer_print_str(p, "));");
  94                 p = isl_printer_end_line(p);
  95         }
  96         p = isl_printer_start_line(p);
  97         p = isl_printer_end_line(p);
  98         return p;
  99 }
 100
 101 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
 102         struct gpu_prog *prog)
 103 {
 104         int i;
 105
 106         for (i = 0; i < prog->n_array; ++i) {
 107                 isl_space *dim;
 108                 isl_set *read_i;
 109                 int empty;
 110
 111                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 112                         continue;
 113
 114                 dim = isl_space_copy(prog->array[i].dim);
 115                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
 116                 empty = isl_set_fast_is_empty(read_i);
 117                 isl_set_free(read_i);
 118                 if (empty)
 119                         continue;
 120
 121                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 122                 p = isl_printer_print_str(p, prog->array[i].name);
 123                 p = isl_printer_print_str(p, ", ");
 124
 125                 if (gpu_array_is_scalar(&prog->array[i]))
 126                         p = isl_printer_print_str(p, "&");
 127                 p = isl_printer_print_str(p, prog->array[i].name);
 128                 p = isl_printer_print_str(p, ", ");
 129
 130                 p = print_array_size(p, &prog->array[i]);
 131                 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 132                 p = isl_printer_end_line(p);
 133         }
 134         p = isl_printer_start_line(p);
 135         p = isl_printer_end_line(p);
 136         return p;
 137 }
 138
 139 static void print_reverse_list(FILE *out, int len, int *list)
 140 {
 141         int i;
 142
 143         if (len == 0)
 144                 return;
 145
 146         fprintf(out, "(");
 147         for (i = 0; i < len; ++i) {
 148                 if (i)
 149                         fprintf(out, ", ");
 150                 fprintf(out, "%d", list[len - 1 - i]);
 151         }
 152         fprintf(out, ")");
 153 }
 154
 155 /* Print the effective grid size as a list of the sizes in each
 156  * dimension, from innermost to outermost.
 157  */
 158 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 159         struct ppcg_kernel *kernel)
 160 {
 161         int i;
 162         int dim;
 163
 164         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 165         if (dim == 0)
 166                 return p;
 167
 168         p = isl_printer_print_str(p, "(");
 169         for (i = dim - 1; i >= 0; --i) {
 170                 isl_pw_aff *bound;
 171
 172                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 173                 p = isl_printer_print_pw_aff(p, bound);
 174                 isl_pw_aff_free(bound);
 175
 176                 if (i > 0)
 177                         p = isl_printer_print_str(p, ", ");
 178         }
 179
 180         p = isl_printer_print_str(p, ")");
 181
 182         return p;
 183 }
 184
 185 /* Print the grid definition.
 186  */
 187 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 188         struct ppcg_kernel *kernel)
 189 {
 190         p = isl_printer_start_line(p);
 191         p = isl_printer_print_str(p, "dim3 k");
 192         p = isl_printer_print_int(p, kernel->id);
 193         p = isl_printer_print_str(p, "_dimGrid");
 194         p = print_grid_size(p, kernel);
 195         p = isl_printer_print_str(p, ";");
 196         p = isl_printer_end_line(p);
 197
 198         return p;
 199 }
 200
 201 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 202  * then print a declaration (including the types of the arguments).
 203  *
 204  * The arguments are printed in the following order
 205  * - the arrays accessed by the kernel
 206  * - the parameters
 207  * - the host loop iterators
 208  */
 209 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 210         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 211 {
 212         int i, n;
 213         int first = 1;
 214         unsigned nparam;
 215         isl_space *space;
 216         const char *type;
 217
 218         for (i = 0; i < prog->n_array; ++i) {
 219                 isl_set *arr;
 220                 int empty;
 221
 222                 space = isl_space_copy(prog->array[i].dim);
 223                 arr = isl_union_set_extract_set(kernel->arrays, space);
 224                 empty = isl_set_fast_is_empty(arr);
 225                 isl_set_free(arr);
 226                 if (empty)
 227                         continue;
 228
 229                 if (!first)
 230                         p = isl_printer_print_str(p, ", ");
 231
 232                 if (types) {
 233                         p = isl_printer_print_str(p, prog->array[i].type);
 234                         p = isl_printer_print_str(p, " ");
 235                 }
 236
 237                 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
 238                         p = isl_printer_print_str(p, prog->array[i].name);
 239                 } else {
 240                         if (types)
 241                                 p = isl_printer_print_str(p, "*");
 242                         else
 243                                 p = isl_printer_print_str(p, "dev_");
 244                         p = isl_printer_print_str(p, prog->array[i].name);
 245                 }
 246
 247                 first = 0;
 248         }
 249
 250         space = isl_union_set_get_space(kernel->arrays);
 251         nparam = isl_space_dim(space, isl_dim_param);
 252         for (i = 0; i < nparam; ++i) {
 253                 const char *name;
 254
 255                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 256
 257                 if (!first)
 258                         p = isl_printer_print_str(p, ", ");
 259                 if (types)
 260                         p = isl_printer_print_str(p, "int ");
 261                 p = isl_printer_print_str(p, name);
 262
 263                 first = 0;
 264         }
 265         isl_space_free(space);
 266
 267         n = isl_space_dim(kernel->space, isl_dim_set);
 268         type = isl_options_get_ast_iterator_type(prog->ctx);
 269         for (i = 0; i < n; ++i) {
 270                 const char *name;
 271                 isl_id *id;
 272
 273                 if (!first)
 274                         p = isl_printer_print_str(p, ", ");
 275                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 276                 if (types) {
 277                         p = isl_printer_print_str(p, type);
 278                         p = isl_printer_print_str(p, " ");
 279                 }
 280                 p = isl_printer_print_str(p, name);
 281
 282                 first = 0;
 283         }
 284
 285         return p;
 286 }
 287
 288 /* Print the header of the given kernel.
 289  */
 290 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 291         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 292 {
 293         p = isl_printer_start_line(p);
 294         p = isl_printer_print_str(p, "__global__ void kernel");
 295         p = isl_printer_print_int(p, kernel->id);
 296         p = isl_printer_print_str(p, "(");
 297         p = print_kernel_arguments(p, prog, kernel, 1);
 298         p = isl_printer_print_str(p, ")");
 299
 300         return p;
 301 }
 302
 303 /* Print the header of the given kernel to both gen->cuda.kernel_h
 304  * and gen->cuda.kernel_c.
 305  */
 306 static void print_kernel_headers(struct gpu_prog *prog,
 307         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 308 {
 309         isl_printer *p;
 310
 311         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 312         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 313         p = print_kernel_header(p, prog, kernel);
 314         p = isl_printer_print_str(p, ";");
 315         p = isl_printer_end_line(p);
 316         isl_printer_free(p);
 317
 318         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 319         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 320         p = print_kernel_header(p, prog, kernel);
 321         p = isl_printer_end_line(p);
 322         isl_printer_free(p);
 323 }
 324
 325 static void print_indent(FILE *dst, int indent)
 326 {
 327         fprintf(dst, "%*s", indent, "");
 328 }
 329
 330 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 331 {
 332         int i;
 333         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 334         const char *type;
 335         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 336         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 337                                         "threadIdx.z" };
 338
 339         type = isl_options_get_ast_iterator_type(ctx);
 340
 341         if (kernel->n_grid > 0) {
 342                 print_indent(out, 4);
 343                 fprintf(out, "%s ", type);
 344                 for (i = 0; i < kernel->n_grid; ++i) {
 345                         if (i)
 346                                 fprintf(out, ", ");
 347                         fprintf(out, "b%d = %s",
 348                                 i, block_dims[kernel->n_grid - 1 - i]);
 349                 }
 350                 fprintf(out, ";\n");
 351         }
 352
 353         if (kernel->n_block > 0) {
 354                 print_indent(out, 4);
 355                 fprintf(out, "%s ", type);
 356                 for (i = 0; i < kernel->n_block; ++i) {
 357                         if (i)
 358                                 fprintf(out, ", ");
 359                         fprintf(out, "t%d = %s",
 360                                 i, thread_dims[kernel->n_block - 1 - i]);
 361                 }
 362                 fprintf(out, ";\n");
 363         }
 364 }
 365
 366 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
 367 {
 368         int j;
 369         isl_int v;
 370
 371         print_indent(out, 4);
 372         if (var->type == ppcg_access_shared)
 373                 fprintf(out, "__shared__ ");
 374         fprintf(out, "%s %s", var->array->type, var->name);
 375         isl_int_init(v);
 376         for (j = 0; j < var->array->n_index; ++j) {
 377                 fprintf(out, "[");
 378                 isl_vec_get_element(var->size, j, &v);
 379                 isl_int_print(out, v, 0);
 380                 fprintf(out, "]");
 381         }
 382         isl_int_clear(v);
 383         fprintf(out, ";\n");
 384 }
 385
 386 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
 387 {
 388         int i;
 389
 390         for (i = 0; i < kernel->n_var; ++i)
 391                 print_kernel_var(out, &kernel->var[i]);
 392 }
 393
 394 /* Print an access to the element in the private/shared memory copy
 395  * described by "stmt".  The index of the copy is recorded in
 396  * stmt->local_index as a "call" to the array.
 397  */
 398 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
 399         struct ppcg_kernel_stmt *stmt)
 400 {
 401         int i;
 402         isl_ast_expr *expr;
 403         struct gpu_array_info *array = stmt->u.c.array;
 404
 405         expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 0);
 406         p = isl_printer_print_ast_expr(p, expr);
 407         isl_ast_expr_free(expr);
 408
 409         for (i = 0; i < array->n_index; ++i) {
 410                 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 1 + i);
 411
 412                 p = isl_printer_print_str(p, "[");
 413                 p = isl_printer_print_ast_expr(p, expr);
 414                 p = isl_printer_print_str(p, "]");
 415
 416                 isl_ast_expr_free(expr);
 417         }
 418
 419         return p;
 420 }
 421
 422 /* Print an access to the element in the global memory copy
 423  * described by "stmt".  The index of the copy is recorded in
 424  * stmt->index as a "call" to the array.
 425  *
 426  * The copy in global memory has been linearized, so we need to take
 427  * the array size into account.
 428  */
 429 static __isl_give isl_printer *stmt_print_global_index(
 430         __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
 431 {
 432         int i;
 433         struct gpu_array_info *array = stmt->u.c.array;
 434         isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
 435
 436         if (gpu_array_is_scalar(array)) {
 437                 if (!array->read_only)
 438                         p = isl_printer_print_str(p, "*");
 439                 p = isl_printer_print_str(p, array->name);
 440                 return p;
 441         }
 442
 443         p = isl_printer_print_str(p, array->name);
 444         p = isl_printer_print_str(p, "[");
 445         for (i = 0; i + 1 < array->n_index; ++i)
 446                 p = isl_printer_print_str(p, "(");
 447         for (i = 0; i < array->n_index; ++i) {
 448                 isl_ast_expr *expr;
 449                 expr = isl_ast_expr_get_op_arg(stmt->u.c.index, 1 + i);
 450                 if (i) {
 451                         isl_pw_aff *bound_i;
 452                         bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 453                         p = isl_printer_print_str(p, ") * (");
 454                         p = isl_printer_print_pw_aff(p, bound_i);
 455                         p = isl_printer_print_str(p, ") + (");
 456                         isl_pw_aff_free(bound_i);
 457                 }
 458                 p = isl_printer_print_ast_expr(p, expr);
 459                 if (i)
 460                         p = isl_printer_print_str(p, ")");
 461                 isl_ast_expr_free(expr);
 462         }
 463         p = isl_printer_print_str(p, "]");
 464
 465         return p;
 466 }
 467
 468 /* Print a copy statement.
 469  *
 470  * A read copy statement is printed as
 471  *
 472  *      local = global;
 473  *
 474  * while a write copy statement is printed as
 475  *
 476  *      global = local;
 477  */
 478 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
 479         struct ppcg_kernel_stmt *stmt)
 480 {
 481         p = isl_printer_start_line(p);
 482         if (stmt->u.c.read) {
 483                 p = stmt_print_local_index(p, stmt);
 484                 p = isl_printer_print_str(p, " = ");
 485                 p = stmt_print_global_index(p, stmt);
 486         } else {
 487                 p = stmt_print_global_index(p, stmt);
 488                 p = isl_printer_print_str(p, " = ");
 489                 p = stmt_print_local_index(p, stmt);
 490         }
 491         p = isl_printer_print_str(p, ";");
 492         p = isl_printer_end_line(p);
 493
 494         return p;
 495 }
 496
 497 /* Print a sync statement.
 498  */
 499 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 500         struct ppcg_kernel_stmt *stmt)
 501 {
 502         p = isl_printer_start_line(p);
 503         p = isl_printer_print_str(p, "__syncthreads();");
 504         p = isl_printer_end_line(p);
 505
 506         return p;
 507 }
 508
 509 /* Print an access based on the information in "access".
 510  * If this an access to global memory, then the index expression
 511  * is linearized.
 512  *
 513  * If access->array is NULL, then we are
 514  * accessing an iterator in the original program.
 515  */
 516 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
 517         struct ppcg_kernel_access *access)
 518 {
 519         int i;
 520         unsigned n_index;
 521         struct gpu_array_info *array;
 522         isl_pw_aff_list *bound;
 523
 524         array = access->array;
 525         bound = array ? access->local_array->bound : NULL;
 526         if (!array)
 527                 p = isl_printer_print_str(p, "(");
 528         else {
 529                 if (access->type == ppcg_access_global &&
 530                     gpu_array_is_scalar(array) && !array->read_only)
 531                         p = isl_printer_print_str(p, "*");
 532                 p = isl_printer_print_str(p, access->local_name);
 533                 if (gpu_array_is_scalar(array))
 534                         return p;
 535                 p = isl_printer_print_str(p, "[");
 536         }
 537
 538         n_index = isl_ast_expr_list_n_ast_expr(access->index);
 539         if (access->type == ppcg_access_global)
 540                 for (i = 0; i + 1 < n_index; ++i)
 541                         p = isl_printer_print_str(p, "(");
 542
 543         for (i = 0; i < n_index; ++i) {
 544                 isl_ast_expr *index;
 545
 546                 index = isl_ast_expr_list_get_ast_expr(access->index, i);
 547                 if (array && i) {
 548                         if (access->type == ppcg_access_global) {
 549                                 isl_pw_aff *bound_i;
 550                                 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 551                                 p = isl_printer_print_str(p, ") * (");
 552                                 p = isl_printer_print_pw_aff(p, bound_i);
 553                                 p = isl_printer_print_str(p, ") + ");
 554                                 isl_pw_aff_free(bound_i);
 555                         } else
 556                                 p = isl_printer_print_str(p, "][");
 557                 }
 558                 p = isl_printer_print_ast_expr(p, index);
 559                 isl_ast_expr_free(index);
 560         }
 561         if (!array)
 562                 p = isl_printer_print_str(p, ")");
 563         else
 564                 p = isl_printer_print_str(p, "]");
 565
 566         return p;
 567 }
 568
 569 struct cuda_access_print_info {
 570         int i;
 571         struct ppcg_kernel_stmt *stmt;
 572 };
 573
 574 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
 575  * with the pet printer. This means that whenever the pet printer prints a
 576  * pet access expression we have the corresponding cuda access available and can
 577  * print the modified access.
 578  */
 579 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
 580         struct pet_expr *expr, void *usr)
 581 {
 582         struct cuda_access_print_info *info =
 583                 (struct cuda_access_print_info *) usr;
 584
 585         p = print_access(p, &info->stmt->u.d.access[info->i]);
 586         info->i++;
 587
 588         return p;
 589 }
 590
 591 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
 592         struct ppcg_kernel_stmt *stmt)
 593 {
 594         struct cuda_access_print_info info;
 595
 596         info.i = 0;
 597         info.stmt = stmt;
 598
 599         p = isl_printer_start_line(p);
 600         p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
 601         p = isl_printer_print_str(p, ";");
 602         p = isl_printer_end_line(p);
 603
 604         return p;
 605 }
 606
 607 /* This function is called for each user statement in the AST,
 608  * i.e., for each kernel body statement, copy statement or sync statement.
 609  */
 610 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 611         __isl_take isl_ast_print_options *print_options,
 612         __isl_keep isl_ast_node *node, void *user)
 613 {
 614         isl_id *id;
 615         struct ppcg_kernel_stmt *stmt;
 616
 617         id = isl_ast_node_get_annotation(node);
 618         stmt = isl_id_get_user(id);
 619         isl_id_free(id);
 620
 621         isl_ast_print_options_free(print_options);
 622
 623         switch (stmt->type) {
 624         case ppcg_kernel_copy:
 625                 return print_copy(p, stmt);
 626         case ppcg_kernel_sync:
 627                 return print_sync(p, stmt);
 628         case ppcg_kernel_domain:
 629                 return print_stmt_body(p, stmt);
 630         }
 631
 632         return p;
 633 }
 634
 635 static int print_macro(enum isl_ast_op_type type, void *user)
 636 {
 637         isl_printer **p = user;
 638
 639         if (type == isl_ast_op_fdiv_q)
 640                 return 0;
 641
 642         *p = isl_ast_op_type_print_macro(type, *p);
 643
 644         return 0;
 645 }
 646
 647 /* Print the required macros for "node", including one for floord.
 648  * We always print a macro for floord as it may also appear in the statements.
 649  */
 650 static __isl_give isl_printer *print_macros(
 651         __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
 652 {
 653         p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
 654         if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
 655                 return isl_printer_free(p);
 656         return p;
 657 }
 658
 659 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 660         struct cuda_info *cuda)
 661 {
 662         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 663         isl_ast_print_options *print_options;
 664         isl_printer *p;
 665
 666         print_kernel_headers(prog, kernel, cuda);
 667         fprintf(cuda->kernel_c, "{\n");
 668         print_kernel_iterators(cuda->kernel_c, kernel);
 669         print_kernel_vars(cuda->kernel_c, kernel);
 670         fprintf(cuda->kernel_c, "\n");
 671
 672         print_options = isl_ast_print_options_alloc(ctx);
 673         print_options = isl_ast_print_options_set_print_user(print_options,
 674                                                     &print_kernel_stmt, NULL);
 675
 676         p = isl_printer_to_file(ctx, cuda->kernel_c);
 677         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 678         p = isl_printer_indent(p, 4);
 679         p = print_macros(kernel->tree, p);
 680         p = isl_ast_node_print(kernel->tree, p, print_options);
 681         isl_printer_free(p);
 682
 683         fprintf(cuda->kernel_c, "}\n");
 684 }
 685
 686 struct print_host_user_data {
 687         struct cuda_info *cuda;
 688         struct gpu_prog *prog;
 689 };
 690
 691 /* Print the user statement of the host code to "p".
 692  *
 693  * In particular, print a block of statements that defines the grid
 694  * and the block and then launches the kernel.
 695  */
 696 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 697         __isl_take isl_ast_print_options *print_options,
 698         __isl_keep isl_ast_node *node, void *user)
 699 {
 700         isl_id *id;
 701         struct ppcg_kernel *kernel;
 702         struct print_host_user_data *data;
 703
 704         id = isl_ast_node_get_annotation(node);
 705         kernel = isl_id_get_user(id);
 706         isl_id_free(id);
 707
 708         data = (struct print_host_user_data *) user;
 709
 710         p = isl_printer_start_line(p);
 711         p = isl_printer_print_str(p, "{");
 712         p = isl_printer_end_line(p);
 713         p = isl_printer_indent(p, 2);
 714
 715         p = isl_printer_start_line(p);
 716         p = isl_printer_print_str(p, "dim3 k");
 717         p = isl_printer_print_int(p, kernel->id);
 718         p = isl_printer_print_str(p, "_dimBlock");
 719         print_reverse_list(isl_printer_get_file(p),
 720                                 kernel->n_block, kernel->block_dim);
 721         p = isl_printer_print_str(p, ";");
 722         p = isl_printer_end_line(p);
 723
 724         p = print_grid(p, kernel);
 725
 726         p = isl_printer_start_line(p);
 727         p = isl_printer_print_str(p, "kernel");
 728         p = isl_printer_print_int(p, kernel->id);
 729         p = isl_printer_print_str(p, " <<<k");
 730         p = isl_printer_print_int(p, kernel->id);
 731         p = isl_printer_print_str(p, "_dimGrid, k");
 732         p = isl_printer_print_int(p, kernel->id);
 733         p = isl_printer_print_str(p, "_dimBlock>>> (");
 734         p = print_kernel_arguments(p, data->prog, kernel, 0);
 735         p = isl_printer_print_str(p, ");");
 736         p = isl_printer_end_line(p);
 737
 738         p = isl_printer_start_line(p);
 739         p = isl_printer_print_str(p, "cudaCheckKernel();");
 740         p = isl_printer_end_line(p);
 741
 742         p = isl_printer_indent(p, -2);
 743         p = isl_printer_start_line(p);
 744         p = isl_printer_print_str(p, "}");
 745         p = isl_printer_end_line(p);
 746
 747         p = isl_printer_start_line(p);
 748         p = isl_printer_end_line(p);
 749
 750         print_kernel(data->prog, kernel, data->cuda);
 751
 752         isl_ast_print_options_free(print_options);
 753
 754         return p;
 755 }
 756
 757 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 758         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 759         struct cuda_info *cuda)
 760 {
 761         isl_ast_print_options *print_options;
 762         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 763         struct print_host_user_data data = { cuda, prog };
 764
 765         print_options = isl_ast_print_options_alloc(ctx);
 766         print_options = isl_ast_print_options_set_print_user(print_options,
 767                                                 &print_host_user, &data);
 768
 769         p = print_macros(tree, p);
 770         p = isl_ast_node_print(tree, p, print_options);
 771
 772         return p;
 773 }
 774
 775 /* For each array that needs to be copied out (based on prog->copy_out),
 776  * copy the contents back from the GPU to the host.
 777  *
 778  * If any element of a given array appears in prog->copy_out, then its
 779  * entire extent is in prog->copy_out.  The bounds on this extent have
 780  * been precomputed in extract_array_info and are used in print_array_size.
 781  */
 782 static __isl_give isl_printer *copy_arrays_from_device(
 783         __isl_take isl_printer *p, struct gpu_prog *prog)
 784 {
 785         int i;
 786         isl_union_set *copy_out;
 787         copy_out = isl_union_set_copy(prog->copy_out);
 788
 789         for (i = 0; i < prog->n_array; ++i) {
 790                 isl_space *dim;
 791                 isl_set *copy_out_i;
 792                 int empty;
 793
 794                 dim = isl_space_copy(prog->array[i].dim);
 795                 copy_out_i = isl_union_set_extract_set(copy_out, dim);
 796                 empty = isl_set_fast_is_empty(copy_out_i);
 797                 isl_set_free(copy_out_i);
 798                 if (empty)
 799                         continue;
 800
 801                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 802                 if (gpu_array_is_scalar(&prog->array[i]))
 803                         p = isl_printer_print_str(p, "&");
 804                 p = isl_printer_print_str(p, prog->array[i].name);
 805                 p = isl_printer_print_str(p, ", dev_");
 806                 p = isl_printer_print_str(p, prog->array[i].name);
 807                 p = isl_printer_print_str(p, ", ");
 808                 p = print_array_size(p, &prog->array[i]);
 809                 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 810                 p = isl_printer_end_line(p);
 811         }
 812
 813         isl_union_set_free(copy_out);
 814         p = isl_printer_start_line(p);
 815         p = isl_printer_end_line(p);
 816         return p;
 817 }
 818
 819 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 820         struct gpu_prog *prog)
 821 {
 822         int i;
 823
 824         for (i = 0; i < prog->n_array; ++i) {
 825                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 826                         continue;
 827                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 828                 p = isl_printer_print_str(p, prog->array[i].name);
 829                 p = isl_printer_print_str(p, "));");
 830                 p = isl_printer_end_line(p);
 831         }
 832
 833         return p;
 834 }
 835
 836 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
 837         struct ppcg_options *options, const char *input)
 838 {
 839         struct cuda_info cuda;
 840         struct gpu_prog *prog;
 841         isl_ast_node *tree;
 842         isl_printer *p;
 843
 844         if (!scop)
 845                 return -1;
 846
 847         prog = gpu_prog_alloc(ctx, scop);
 848
 849         tree = generate_gpu(ctx, prog, options);
 850
 851         cuda.start = scop->start;
 852         cuda.end = scop->end;
 853         cuda_open_files(&cuda, input);
 854
 855         p = isl_printer_to_file(ctx, cuda.host_c);
 856         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 857         p = ppcg_print_exposed_declarations(p, scop);
 858         p = ppcg_start_block(p);
 859
 860         p = print_cuda_macros(p);
 861
 862         p = declare_device_arrays(p, prog);
 863         p = allocate_device_arrays(p, prog);
 864         p = copy_arrays_to_device(p, prog);
 865
 866         p = print_host_code(p, prog, tree, &cuda);
 867         isl_ast_node_free(tree);
 868
 869         p = copy_arrays_from_device(p, prog);
 870         p = free_device_arrays(p, prog);
 871
 872         p = ppcg_end_block(p);
 873         isl_printer_free(p);
 874
 875         cuda_close_files(&cuda);
 876
 877         gpu_prog_free(prog);
 878
 879         return 0;
 880 }