cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "pet_printer.h"
  17 #include "schedule.h"
  18
  19 void print_cuda_macros(FILE *file)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
  23                 "#define cudaCheckKernel()"
  24                 " assert(cudaGetLastError() == cudaSuccess)\n\n";
  25         fputs(macros, file);
  26 }
  27
  28 static void print_array_size(isl_ctx *ctx, FILE *out,
  29         struct gpu_array_info *array)
  30 {
  31         int i;
  32         isl_printer *prn;
  33
  34         prn = isl_printer_to_file(ctx, out);
  35         prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
  36         for (i = 0; i < array->n_index; ++i) {
  37                 prn = isl_printer_print_str(prn, "(");
  38                 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
  39                 prn = isl_printer_print_str(prn, ") * ");
  40         }
  41         prn = isl_printer_print_str(prn, "sizeof(");
  42         prn = isl_printer_print_str(prn, array->type);
  43         prn = isl_printer_print_str(prn, ")");
  44         isl_printer_free(prn);
  45 }
  46
  47 static void declare_device_arrays(FILE *out, struct gpu_prog *prog)
  48 {
  49         int i;
  50
  51         for (i = 0; i < prog->n_array; ++i) {
  52                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  53                         continue;
  54                 fprintf(out, "%s *dev_%s;\n",
  55                         prog->array[i].type, prog->array[i].name);
  56         }
  57         fprintf(out, "\n");
  58 }
  59
  60 static void allocate_device_arrays(FILE *out, struct gpu_prog *prog)
  61 {
  62         int i;
  63
  64         for (i = 0; i < prog->n_array; ++i) {
  65                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  66                         continue;
  67                 fprintf(out,
  68                         "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
  69                         prog->array[i].name);
  70                 print_array_size(prog->ctx, out, &prog->array[i]);
  71                 fprintf(out, "));\n");
  72         }
  73         fprintf(out, "\n");
  74 }
  75
  76 static void copy_arrays_to_device(FILE *out, struct gpu_prog *prog)
  77 {
  78         int i;
  79
  80         for (i = 0; i < prog->n_array; ++i) {
  81                 isl_space *dim;
  82                 isl_set *read_i;
  83                 int empty;
  84
  85                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  86                         continue;
  87
  88                 dim = isl_space_copy(prog->array[i].dim);
  89                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
  90                 empty = isl_set_fast_is_empty(read_i);
  91                 isl_set_free(read_i);
  92                 if (empty)
  93                         continue;
  94
  95                 fprintf(out, "cudaCheckReturn(cudaMemcpy(dev_%s,",
  96                         prog->array[i].name);
  97
  98                 if (gpu_array_is_scalar(&prog->array[i]))
  99                         fprintf(out, " &%s, ", prog->array[i].name);
 100                 else
 101                         fprintf(out, " %s, ", prog->array[i].name);
 102
 103                 print_array_size(prog->ctx, out, &prog->array[i]);
 104                 fprintf(out, ", cudaMemcpyHostToDevice));\n");
 105         }
 106         fprintf(out, "\n");
 107 }
 108
 109 static void print_reverse_list(FILE *out, int len, int *list)
 110 {
 111         int i;
 112
 113         if (len == 0)
 114                 return;
 115
 116         fprintf(out, "(");
 117         for (i = 0; i < len; ++i) {
 118                 if (i)
 119                         fprintf(out, ", ");
 120                 fprintf(out, "%d", list[len - 1 - i]);
 121         }
 122         fprintf(out, ")");
 123 }
 124
 125 /* Print the effective grid size as a list of the sizes in each
 126  * dimension, from innermost to outermost.
 127  */
 128 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 129         struct ppcg_kernel *kernel)
 130 {
 131         int i;
 132         int dim;
 133
 134         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 135         if (dim == 0)
 136                 return p;
 137
 138         p = isl_printer_print_str(p, "(");
 139         for (i = dim - 1; i >= 0; --i) {
 140                 isl_pw_aff *bound;
 141
 142                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 143                 p = isl_printer_print_pw_aff(p, bound);
 144                 isl_pw_aff_free(bound);
 145
 146                 if (i > 0)
 147                         p = isl_printer_print_str(p, ", ");
 148         }
 149
 150         p = isl_printer_print_str(p, ")");
 151
 152         return p;
 153 }
 154
 155 /* Print the grid definition.
 156  */
 157 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 158         struct ppcg_kernel *kernel)
 159 {
 160         p = isl_printer_start_line(p);
 161         p = isl_printer_print_str(p, "dim3 k");
 162         p = isl_printer_print_int(p, kernel->id);
 163         p = isl_printer_print_str(p, "_dimGrid");
 164         p = print_grid_size(p, kernel);
 165         p = isl_printer_print_str(p, ";");
 166         p = isl_printer_end_line(p);
 167
 168         return p;
 169 }
 170
 171 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 172  * then print a declaration (including the types of the arguments).
 173  *
 174  * The arguments are printed in the following order
 175  * - the arrays accessed by the kernel
 176  * - the parameters
 177  * - the host loop iterators
 178  */
 179 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 180         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 181 {
 182         int i, n;
 183         int first = 1;
 184         unsigned nparam;
 185         isl_space *space;
 186         const char *type;
 187
 188         for (i = 0; i < prog->n_array; ++i) {
 189                 isl_set *arr;
 190                 int empty;
 191
 192                 space = isl_space_copy(prog->array[i].dim);
 193                 arr = isl_union_set_extract_set(kernel->arrays, space);
 194                 empty = isl_set_fast_is_empty(arr);
 195                 isl_set_free(arr);
 196                 if (empty)
 197                         continue;
 198
 199                 if (!first)
 200                         p = isl_printer_print_str(p, ", ");
 201
 202                 if (types) {
 203                         p = isl_printer_print_str(p, prog->array[i].type);
 204                         p = isl_printer_print_str(p, " ");
 205                 }
 206
 207                 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
 208                         p = isl_printer_print_str(p, prog->array[i].name);
 209                 } else {
 210                         if (types)
 211                                 p = isl_printer_print_str(p, "*");
 212                         else
 213                                 p = isl_printer_print_str(p, "dev_");
 214                         p = isl_printer_print_str(p, prog->array[i].name);
 215                 }
 216
 217                 first = 0;
 218         }
 219
 220         space = isl_union_set_get_space(kernel->arrays);
 221         nparam = isl_space_dim(space, isl_dim_param);
 222         for (i = 0; i < nparam; ++i) {
 223                 const char *name;
 224
 225                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 226
 227                 if (!first)
 228                         p = isl_printer_print_str(p, ", ");
 229                 if (types)
 230                         p = isl_printer_print_str(p, "int ");
 231                 p = isl_printer_print_str(p, name);
 232
 233                 first = 0;
 234         }
 235         isl_space_free(space);
 236
 237         n = isl_space_dim(kernel->space, isl_dim_set);
 238         type = isl_options_get_ast_iterator_type(prog->ctx);
 239         for (i = 0; i < n; ++i) {
 240                 const char *name;
 241                 isl_id *id;
 242
 243                 if (!first)
 244                         p = isl_printer_print_str(p, ", ");
 245                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 246                 if (types) {
 247                         p = isl_printer_print_str(p, type);
 248                         p = isl_printer_print_str(p, " ");
 249                 }
 250                 p = isl_printer_print_str(p, name);
 251
 252                 first = 0;
 253         }
 254
 255         return p;
 256 }
 257
 258 /* Print the header of the given kernel.
 259  */
 260 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 261         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 262 {
 263         p = isl_printer_start_line(p);
 264         p = isl_printer_print_str(p, "__global__ void kernel");
 265         p = isl_printer_print_int(p, kernel->id);
 266         p = isl_printer_print_str(p, "(");
 267         p = print_kernel_arguments(p, prog, kernel, 1);
 268         p = isl_printer_print_str(p, ")");
 269
 270         return p;
 271 }
 272
 273 /* Print the header of the given kernel to both gen->cuda.kernel_h
 274  * and gen->cuda.kernel_c.
 275  */
 276 static void print_kernel_headers(struct gpu_prog *prog,
 277         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 278 {
 279         isl_printer *p;
 280
 281         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 282         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 283         p = print_kernel_header(p, prog, kernel);
 284         p = isl_printer_print_str(p, ";");
 285         p = isl_printer_end_line(p);
 286         isl_printer_free(p);
 287
 288         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 289         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 290         p = print_kernel_header(p, prog, kernel);
 291         p = isl_printer_end_line(p);
 292         isl_printer_free(p);
 293 }
 294
 295 static void print_indent(FILE *dst, int indent)
 296 {
 297         fprintf(dst, "%*s", indent, "");
 298 }
 299
 300 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 301 {
 302         int i;
 303         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 304         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 305                                         "threadIdx.z" };
 306
 307         if (kernel->n_grid > 0) {
 308                 print_indent(out, 4);
 309                 fprintf(out, "int ");
 310                 for (i = 0; i < kernel->n_grid; ++i) {
 311                         if (i)
 312                                 fprintf(out, ", ");
 313                         fprintf(out, "b%d = %s",
 314                                 i, block_dims[kernel->n_grid - 1 - i]);
 315                 }
 316                 fprintf(out, ";\n");
 317         }
 318
 319         if (kernel->n_block > 0) {
 320                 print_indent(out, 4);
 321                 fprintf(out, "int ");
 322                 for (i = 0; i < kernel->n_block; ++i) {
 323                         if (i)
 324                                 fprintf(out, ", ");
 325                         fprintf(out, "t%d = %s",
 326                                 i, thread_dims[kernel->n_block - 1 - i]);
 327                 }
 328                 fprintf(out, ";\n");
 329         }
 330 }
 331
 332 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
 333 {
 334         int j;
 335         isl_int v;
 336
 337         print_indent(out, 4);
 338         if (var->type == ppcg_access_shared)
 339                 fprintf(out, "__shared__ ");
 340         fprintf(out, "%s %s", var->array->type, var->name);
 341         isl_int_init(v);
 342         for (j = 0; j < var->array->n_index; ++j) {
 343                 fprintf(out, "[");
 344                 isl_vec_get_element(var->size, j, &v);
 345                 isl_int_print(out, v, 0);
 346                 fprintf(out, "]");
 347         }
 348         isl_int_clear(v);
 349         fprintf(out, ";\n");
 350 }
 351
 352 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
 353 {
 354         int i;
 355
 356         for (i = 0; i < kernel->n_var; ++i)
 357                 print_kernel_var(out, &kernel->var[i]);
 358 }
 359
 360 /* Print an access to the element in the private/shared memory copy
 361  * described by "stmt".  The index of the copy is recorded in
 362  * stmt->local_index as a "call" to the array.
 363  */
 364 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
 365         struct ppcg_kernel_stmt *stmt)
 366 {
 367         int i;
 368         isl_ast_expr *expr;
 369         struct gpu_array_info *array = stmt->u.c.array;
 370
 371         expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 0);
 372         p = isl_printer_print_ast_expr(p, expr);
 373         isl_ast_expr_free(expr);
 374
 375         for (i = 0; i < array->n_index; ++i) {
 376                 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 1 + i);
 377
 378                 p = isl_printer_print_str(p, "[");
 379                 p = isl_printer_print_ast_expr(p, expr);
 380                 p = isl_printer_print_str(p, "]");
 381
 382                 isl_ast_expr_free(expr);
 383         }
 384
 385         return p;
 386 }
 387
 388 /* Print an access to the element in the global memory copy
 389  * described by "stmt".  The index of the copy is recorded in
 390  * stmt->index as a "call" to the array.
 391  *
 392  * The copy in global memory has been linearized, so we need to take
 393  * the array size into account.
 394  */
 395 static __isl_give isl_printer *stmt_print_global_index(
 396         __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
 397 {
 398         int i;
 399         struct gpu_array_info *array = stmt->u.c.array;
 400         isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
 401
 402         if (gpu_array_is_scalar(array)) {
 403                 if (!array->read_only)
 404                         p = isl_printer_print_str(p, "*");
 405                 p = isl_printer_print_str(p, array->name);
 406                 return p;
 407         }
 408
 409         p = isl_printer_print_str(p, array->name);
 410         p = isl_printer_print_str(p, "[");
 411         for (i = 0; i + 1 < array->n_index; ++i)
 412                 p = isl_printer_print_str(p, "(");
 413         for (i = 0; i < array->n_index; ++i) {
 414                 isl_ast_expr *expr;
 415                 expr = isl_ast_expr_get_op_arg(stmt->u.c.index, 1 + i);
 416                 if (i) {
 417                         isl_pw_aff *bound_i;
 418                         bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 419                         p = isl_printer_print_str(p, ") * (");
 420                         p = isl_printer_print_pw_aff(p, bound_i);
 421                         p = isl_printer_print_str(p, ") + ");
 422                         isl_pw_aff_free(bound_i);
 423                 }
 424                 p = isl_printer_print_ast_expr(p, expr);
 425                 isl_ast_expr_free(expr);
 426         }
 427         p = isl_printer_print_str(p, "]");
 428
 429         return p;
 430 }
 431
 432 /* Print a copy statement.
 433  *
 434  * A read copy statement is printed as
 435  *
 436  *      local = global;
 437  *
 438  * while a write copy statement is printed as
 439  *
 440  *      global = local;
 441  */
 442 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
 443         struct ppcg_kernel_stmt *stmt)
 444 {
 445         p = isl_printer_start_line(p);
 446         if (stmt->u.c.read) {
 447                 p = stmt_print_local_index(p, stmt);
 448                 p = isl_printer_print_str(p, " = ");
 449                 p = stmt_print_global_index(p, stmt);
 450         } else {
 451                 p = stmt_print_global_index(p, stmt);
 452                 p = isl_printer_print_str(p, " = ");
 453                 p = stmt_print_local_index(p, stmt);
 454         }
 455         p = isl_printer_print_str(p, ";");
 456         p = isl_printer_end_line(p);
 457
 458         return p;
 459 }
 460
 461 /* Print a sync statement.
 462  */
 463 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 464         struct ppcg_kernel_stmt *stmt)
 465 {
 466         p = isl_printer_start_line(p);
 467         p = isl_printer_print_str(p, "__syncthreads();");
 468         p = isl_printer_end_line(p);
 469
 470         return p;
 471 }
 472
 473 /* Print an access based on the information in "access".
 474  * If this an access to global memory, then the index expression
 475  * is linearized.
 476  *
 477  * If access->array is NULL, then we are
 478  * accessing an iterator in the original program.
 479  */
 480 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
 481         struct ppcg_kernel_access *access)
 482 {
 483         int i;
 484         unsigned n_index;
 485         struct gpu_array_info *array;
 486         isl_pw_aff_list *bound;
 487
 488         array = access->array;
 489         bound = array ? access->local_array->bound : NULL;
 490         if (!array)
 491                 p = isl_printer_print_str(p, "(");
 492         else {
 493                 if (access->type == ppcg_access_global &&
 494                     gpu_array_is_scalar(array) && !array->read_only)
 495                         p = isl_printer_print_str(p, "*");
 496                 p = isl_printer_print_str(p, access->local_name);
 497                 if (gpu_array_is_scalar(array))
 498                         return p;
 499                 p = isl_printer_print_str(p, "[");
 500         }
 501
 502         n_index = isl_ast_expr_list_n_ast_expr(access->index);
 503         if (access->type == ppcg_access_global)
 504                 for (i = 0; i + 1 < n_index; ++i)
 505                         p = isl_printer_print_str(p, "(");
 506
 507         for (i = 0; i < n_index; ++i) {
 508                 isl_ast_expr *index;
 509
 510                 index = isl_ast_expr_list_get_ast_expr(access->index, i);
 511                 if (array && i) {
 512                         if (access->type == ppcg_access_global) {
 513                                 isl_pw_aff *bound_i;
 514                                 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 515                                 p = isl_printer_print_str(p, ") * (");
 516                                 p = isl_printer_print_pw_aff(p, bound_i);
 517                                 p = isl_printer_print_str(p, ") + ");
 518                                 isl_pw_aff_free(bound_i);
 519                         } else
 520                                 p = isl_printer_print_str(p, "][");
 521                 }
 522                 p = isl_printer_print_ast_expr(p, index);
 523                 isl_ast_expr_free(index);
 524         }
 525         if (!array)
 526                 p = isl_printer_print_str(p, ")");
 527         else
 528                 p = isl_printer_print_str(p, "]");
 529
 530         return p;
 531 }
 532
 533 struct cuda_access_print_info {
 534         int i;
 535         struct ppcg_kernel_stmt *stmt;
 536 };
 537
 538 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
 539  * with the pet printer. This means that whenever the pet printer prints a
 540  * pet access expression we have the corresponding cuda access available and can
 541  * print the modified access.
 542  */
 543 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
 544         struct pet_expr *expr, void *usr)
 545 {
 546         struct cuda_access_print_info *info =
 547                 (struct cuda_access_print_info *) usr;
 548
 549         p = print_access(p, &info->stmt->u.d.access[info->i]);
 550         info->i++;
 551
 552         return p;
 553 }
 554
 555 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
 556         struct ppcg_kernel_stmt *stmt)
 557 {
 558         struct cuda_access_print_info info;
 559
 560         info.i = 0;
 561         info.stmt = stmt;
 562
 563         p = isl_printer_start_line(p);
 564         p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
 565         p = isl_printer_print_str(p, ";");
 566         p = isl_printer_end_line(p);
 567
 568         return p;
 569 }
 570
 571 /* This function is called for each user statement in the AST,
 572  * i.e., for each kernel body statement, copy statement or sync statement.
 573  */
 574 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 575         __isl_keep isl_ast_node *node, void *user)
 576 {
 577         isl_id *id;
 578         struct ppcg_kernel_stmt *stmt;
 579
 580         id = isl_ast_node_get_annotation(node);
 581         stmt = isl_id_get_user(id);
 582         isl_id_free(id);
 583
 584         switch (stmt->type) {
 585         case ppcg_kernel_copy:
 586                 return print_copy(p, stmt);
 587         case ppcg_kernel_sync:
 588                 return print_sync(p, stmt);
 589         case ppcg_kernel_domain:
 590                 return print_stmt_body(p, stmt);
 591         }
 592
 593         return p;
 594 }
 595
 596 static int print_macro(enum isl_ast_op_type type, void *user)
 597 {
 598         isl_printer **p = user;
 599
 600         if (type == isl_ast_op_fdiv_q)
 601                 return 0;
 602
 603         *p = isl_ast_op_type_print_macro(type, *p);
 604
 605         return 0;
 606 }
 607
 608 /* Print the required macros for "node", including one for floord.
 609  * We always print a macro for floord as it may also appear in the statements.
 610  */
 611 static __isl_give isl_printer *print_macros(
 612         __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
 613 {
 614         p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
 615         if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
 616                 return isl_printer_free(p);
 617         return p;
 618 }
 619
 620 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 621         struct cuda_info *cuda)
 622 {
 623         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 624         isl_ast_print_options *print_options;
 625         isl_printer *p;
 626
 627         print_kernel_headers(prog, kernel, cuda);
 628         fprintf(cuda->kernel_c, "{\n");
 629         print_kernel_iterators(cuda->kernel_c, kernel);
 630         print_kernel_vars(cuda->kernel_c, kernel);
 631         fprintf(cuda->kernel_c, "\n");
 632
 633         print_options = isl_ast_print_options_alloc(ctx);
 634         print_options = isl_ast_print_options_set_print_user(print_options,
 635                                                     &print_kernel_stmt, NULL);
 636
 637         p = isl_printer_to_file(ctx, cuda->kernel_c);
 638         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 639         p = isl_printer_indent(p, 4);
 640         p = print_macros(kernel->tree, p);
 641         p = isl_ast_node_print(kernel->tree, p, print_options);
 642         isl_printer_free(p);
 643
 644         isl_ast_print_options_free(print_options);
 645
 646         fprintf(cuda->kernel_c, "}\n");
 647 }
 648
 649 struct print_host_user_data {
 650         struct cuda_info *cuda;
 651         struct gpu_prog *prog;
 652 };
 653
 654 /* Print the user statement of the host code to "p".
 655  *
 656  * In particular, print a block of statements that defines the grid
 657  * and the block and then launches the kernel.
 658  */
 659 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 660         __isl_keep isl_ast_node *node, void *user)
 661 {
 662         isl_id *id;
 663         struct ppcg_kernel *kernel;
 664         struct print_host_user_data *data;
 665
 666         id = isl_ast_node_get_annotation(node);
 667         kernel = isl_id_get_user(id);
 668         isl_id_free(id);
 669
 670         data = (struct print_host_user_data *) user;
 671
 672         p = isl_printer_start_line(p);
 673         p = isl_printer_print_str(p, "{");
 674         p = isl_printer_end_line(p);
 675         p = isl_printer_indent(p, 2);
 676
 677         p = isl_printer_start_line(p);
 678         p = isl_printer_print_str(p, "dim3 k");
 679         p = isl_printer_print_int(p, kernel->id);
 680         p = isl_printer_print_str(p, "_dimBlock");
 681         print_reverse_list(isl_printer_get_file(p),
 682                                 kernel->n_block, kernel->block_dim);
 683         p = isl_printer_print_str(p, ";");
 684         p = isl_printer_end_line(p);
 685
 686         p = print_grid(p, kernel);
 687
 688         p = isl_printer_start_line(p);
 689         p = isl_printer_print_str(p, "kernel");
 690         p = isl_printer_print_int(p, kernel->id);
 691         p = isl_printer_print_str(p, " <<<k");
 692         p = isl_printer_print_int(p, kernel->id);
 693         p = isl_printer_print_str(p, "_dimGrid, k");
 694         p = isl_printer_print_int(p, kernel->id);
 695         p = isl_printer_print_str(p, "_dimBlock>>> (");
 696         p = print_kernel_arguments(p, data->prog, kernel, 0);
 697         p = isl_printer_print_str(p, ");");
 698         p = isl_printer_end_line(p);
 699
 700         p = isl_printer_start_line(p);
 701         p = isl_printer_print_str(p, "cudaCheckKernel();");
 702         p = isl_printer_end_line(p);
 703
 704         p = isl_printer_indent(p, -2);
 705         p = isl_printer_start_line(p);
 706         p = isl_printer_print_str(p, "}");
 707         p = isl_printer_end_line(p);
 708
 709         p = isl_printer_start_line(p);
 710         p = isl_printer_end_line(p);
 711
 712         print_kernel(data->prog, kernel, data->cuda);
 713
 714         return p;
 715 }
 716
 717 static void print_host_code(FILE *out, struct gpu_prog *prog,
 718         __isl_keep isl_ast_node *tree, struct cuda_info *cuda)
 719 {
 720         isl_ast_print_options *print_options;
 721         isl_printer *p;
 722         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 723         struct print_host_user_data data = { cuda, prog };
 724
 725         print_options = isl_ast_print_options_alloc(ctx);
 726         print_options = isl_ast_print_options_set_print_user(print_options,
 727                                                 &print_host_user, &data);
 728
 729         p = isl_printer_to_file(ctx, out);
 730         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 731         p = print_macros(tree, p);
 732         p = isl_ast_node_print(tree, p, print_options);
 733         isl_printer_free(p);
 734
 735         isl_ast_print_options_free(print_options);
 736 }
 737
 738 static void copy_arrays_from_device(FILE *out, struct gpu_prog *prog)
 739 {
 740         int i;
 741         isl_union_set *write;
 742         write = isl_union_map_range(isl_union_map_copy(prog->write));
 743
 744         for (i = 0; i < prog->n_array; ++i) {
 745                 isl_space *dim;
 746                 isl_set *write_i;
 747                 int empty;
 748
 749                 dim = isl_space_copy(prog->array[i].dim);
 750                 write_i = isl_union_set_extract_set(write, dim);
 751                 empty = isl_set_fast_is_empty(write_i);
 752                 isl_set_free(write_i);
 753                 if (empty)
 754                         continue;
 755
 756                 fprintf(out, "cudaCheckReturn(cudaMemcpy(");
 757                 if (gpu_array_is_scalar(&prog->array[i]))
 758                         fprintf(out, "&%s, ", prog->array[i].name);
 759                 else
 760                         fprintf(out, "%s, ", prog->array[i].name);
 761                 fprintf(out, "dev_%s, ",  prog->array[i].name);
 762                 print_array_size(prog->ctx, out, &prog->array[i]);
 763                 fprintf(out, ", cudaMemcpyDeviceToHost));\n");
 764         }
 765
 766         isl_union_set_free(write);
 767         fprintf(out, "\n");
 768 }
 769
 770 static void free_device_arrays(FILE *out, struct gpu_prog *prog)
 771 {
 772         int i;
 773
 774         for (i = 0; i < prog->n_array; ++i) {
 775                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 776                         continue;
 777                 fprintf(out, "cudaCheckReturn(cudaFree(dev_%s));\n",
 778                         prog->array[i].name);
 779         }
 780 }
 781
 782 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
 783         struct ppcg_options *options, const char *input)
 784 {
 785         struct cuda_info cuda;
 786         struct gpu_prog *prog;
 787         isl_ast_node *tree;
 788
 789         if (!scop)
 790                 return -1;
 791
 792         scop->context = add_context_from_str(scop->context, options->ctx);
 793
 794         prog = gpu_prog_alloc(ctx, scop);
 795
 796         tree = generate_gpu(ctx, prog, options);
 797
 798         cuda_open_files(&cuda, input);
 799
 800         fprintf(cuda.host_c, "{\n");
 801
 802         print_cuda_macros(cuda.host_c);
 803
 804         declare_device_arrays(cuda.host_c, prog);
 805         allocate_device_arrays(cuda.host_c, prog);
 806         copy_arrays_to_device(cuda.host_c, prog);
 807
 808         print_host_code(cuda.host_c, prog, tree, &cuda);
 809         isl_ast_node_free(tree);
 810
 811         copy_arrays_from_device(cuda.host_c, prog);
 812         free_device_arrays(cuda.host_c, prog);
 813
 814         fprintf(cuda.host_c, "}\n");
 815
 816         cuda_close_files(&cuda);
 817
 818         gpu_prog_free(prog);
 819
 820         return 0;
 821 }