cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "pet_printer.h"
  17 #include "schedule.h"
  18
  19 void print_cuda_macros(FILE *file)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
  23                 "#define cudaCheckKernel()"
  24                 " assert(cudaGetLastError() == cudaSuccess)\n\n";
  25         fputs(macros, file);
  26 }
  27
  28 static void print_array_size(isl_ctx *ctx, FILE *out,
  29         struct gpu_array_info *array)
  30 {
  31         int i;
  32         isl_printer *prn;
  33
  34         prn = isl_printer_to_file(ctx, out);
  35         prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
  36         for (i = 0; i < array->n_index; ++i) {
  37                 prn = isl_printer_print_str(prn, "(");
  38                 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
  39                 prn = isl_printer_print_str(prn, ") * ");
  40         }
  41         prn = isl_printer_print_str(prn, "sizeof(");
  42         prn = isl_printer_print_str(prn, array->type);
  43         prn = isl_printer_print_str(prn, ")");
  44         isl_printer_free(prn);
  45 }
  46
  47 static void declare_device_arrays(FILE *out, struct gpu_prog *prog)
  48 {
  49         int i;
  50
  51         for (i = 0; i < prog->n_array; ++i) {
  52                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  53                         continue;
  54                 fprintf(out, "%s *dev_%s;\n",
  55                         prog->array[i].type, prog->array[i].name);
  56         }
  57         fprintf(out, "\n");
  58 }
  59
  60 static void allocate_device_arrays(FILE *out, struct gpu_prog *prog)
  61 {
  62         int i;
  63
  64         for (i = 0; i < prog->n_array; ++i) {
  65                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  66                         continue;
  67                 fprintf(out,
  68                         "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
  69                         prog->array[i].name);
  70                 print_array_size(prog->ctx, out, &prog->array[i]);
  71                 fprintf(out, "));\n");
  72         }
  73         fprintf(out, "\n");
  74 }
  75
  76 static void copy_arrays_to_device(FILE *out, struct gpu_prog *prog)
  77 {
  78         int i;
  79
  80         for (i = 0; i < prog->n_array; ++i) {
  81                 isl_space *dim;
  82                 isl_set *read_i;
  83                 int empty;
  84
  85                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  86                         continue;
  87
  88                 dim = isl_space_copy(prog->array[i].dim);
  89                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
  90                 empty = isl_set_fast_is_empty(read_i);
  91                 isl_set_free(read_i);
  92                 if (empty)
  93                         continue;
  94
  95                 fprintf(out, "cudaCheckReturn(cudaMemcpy(dev_%s,",
  96                         prog->array[i].name);
  97
  98                 if (gpu_array_is_scalar(&prog->array[i]))
  99                         fprintf(out, " &%s, ", prog->array[i].name);
 100                 else
 101                         fprintf(out, " %s, ", prog->array[i].name);
 102
 103                 print_array_size(prog->ctx, out, &prog->array[i]);
 104                 fprintf(out, ", cudaMemcpyHostToDevice));\n");
 105         }
 106         fprintf(out, "\n");
 107 }
 108
 109 static void print_reverse_list(FILE *out, int len, int *list)
 110 {
 111         int i;
 112
 113         if (len == 0)
 114                 return;
 115
 116         fprintf(out, "(");
 117         for (i = 0; i < len; ++i) {
 118                 if (i)
 119                         fprintf(out, ", ");
 120                 fprintf(out, "%d", list[len - 1 - i]);
 121         }
 122         fprintf(out, ")");
 123 }
 124
 125 /* Print the effective grid size as a list of the sizes in each
 126  * dimension, from innermost to outermost.
 127  *
 128  * The grid size specified by the user or set by default
 129  * in read_grid_sizes() and applied in tile_schedule(),
 130  * may be too large for the given code in the sense that
 131  * it may contain blocks that don't need to execute anything.
 132  * We therefore don't print this grid size, but instead the
 133  * smallest grid size that ensures that all blocks that actually
 134  * execute code are included in the grid.
 135  *
 136  * For each block dimension, we compute the maximal value of the block id
 137  * and add one.
 138  */
 139 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 140         struct ppcg_kernel *kernel)
 141 {
 142         int i;
 143         int dim;
 144
 145         dim = isl_set_dim(kernel->grid, isl_dim_set);
 146         if (dim == 0)
 147                 return p;
 148
 149         p = isl_printer_print_str(p, "(");
 150         for (i = dim - 1; i >= 0; --i) {
 151                 isl_space *space;
 152                 isl_aff *one;
 153                 isl_pw_aff *bound ;
 154
 155                 bound = isl_set_dim_max(isl_set_copy(kernel->grid), i);
 156                 bound = isl_pw_aff_coalesce(bound);
 157                 bound = isl_pw_aff_gist(bound, isl_set_copy(kernel->context));
 158
 159                 space = isl_pw_aff_get_domain_space(bound);
 160                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
 161                 one = isl_aff_add_constant_si(one, 1);
 162                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
 163                 p = isl_printer_print_pw_aff(p, bound);
 164                 isl_pw_aff_free(bound);
 165
 166                 if (i > 0)
 167                         p = isl_printer_print_str(p, ", ");
 168         }
 169
 170         p = isl_printer_print_str(p, ")");
 171
 172         return p;
 173 }
 174
 175 /* Print the grid definition.
 176  */
 177 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 178         struct ppcg_kernel *kernel)
 179 {
 180         p = isl_printer_start_line(p);
 181         p = isl_printer_print_str(p, "dim3 k");
 182         p = isl_printer_print_int(p, kernel->id);
 183         p = isl_printer_print_str(p, "_dimGrid");
 184         p = print_grid_size(p, kernel);
 185         p = isl_printer_print_str(p, ";");
 186         p = isl_printer_end_line(p);
 187
 188         return p;
 189 }
 190
 191 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 192  * then print a declaration (including the types of the arguments).
 193  *
 194  * The arguments are printed in the following order
 195  * - the arrays accessed by the kernel
 196  * - the parameters
 197  * - the host loop iterators
 198  */
 199 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 200         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 201 {
 202         int i, n;
 203         int first = 1;
 204         unsigned nparam;
 205         isl_space *space;
 206         const char *type;
 207
 208         for (i = 0; i < prog->n_array; ++i) {
 209                 isl_set *arr;
 210                 int empty;
 211
 212                 space = isl_space_copy(prog->array[i].dim);
 213                 arr = isl_union_set_extract_set(kernel->arrays, space);
 214                 empty = isl_set_fast_is_empty(arr);
 215                 isl_set_free(arr);
 216                 if (empty)
 217                         continue;
 218
 219                 if (!first)
 220                         p = isl_printer_print_str(p, ", ");
 221
 222                 if (types) {
 223                         p = isl_printer_print_str(p, prog->array[i].type);
 224                         p = isl_printer_print_str(p, " ");
 225                 }
 226
 227                 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
 228                         p = isl_printer_print_str(p, prog->array[i].name);
 229                 } else {
 230                         if (types)
 231                                 p = isl_printer_print_str(p, "*");
 232                         else
 233                                 p = isl_printer_print_str(p, "dev_");
 234                         p = isl_printer_print_str(p, prog->array[i].name);
 235                 }
 236
 237                 first = 0;
 238         }
 239
 240         space = isl_union_set_get_space(kernel->arrays);
 241         nparam = isl_space_dim(space, isl_dim_param);
 242         for (i = 0; i < nparam; ++i) {
 243                 const char *name;
 244
 245                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 246
 247                 if (!first)
 248                         p = isl_printer_print_str(p, ", ");
 249                 if (types)
 250                         p = isl_printer_print_str(p, "int ");
 251                 p = isl_printer_print_str(p, name);
 252
 253                 first = 0;
 254         }
 255         isl_space_free(space);
 256
 257         n = isl_space_dim(kernel->space, isl_dim_set);
 258         type = isl_options_get_ast_iterator_type(prog->ctx);
 259         for (i = 0; i < n; ++i) {
 260                 const char *name;
 261                 isl_id *id;
 262
 263                 if (!first)
 264                         p = isl_printer_print_str(p, ", ");
 265                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 266                 if (types) {
 267                         p = isl_printer_print_str(p, type);
 268                         p = isl_printer_print_str(p, " ");
 269                 }
 270                 p = isl_printer_print_str(p, name);
 271
 272                 first = 0;
 273         }
 274
 275         return p;
 276 }
 277
 278 /* Print the header of the given kernel.
 279  */
 280 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 281         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 282 {
 283         p = isl_printer_start_line(p);
 284         p = isl_printer_print_str(p, "__global__ void kernel");
 285         p = isl_printer_print_int(p, kernel->id);
 286         p = isl_printer_print_str(p, "(");
 287         p = print_kernel_arguments(p, prog, kernel, 1);
 288         p = isl_printer_print_str(p, ")");
 289
 290         return p;
 291 }
 292
 293 /* Print the header of the given kernel to both gen->cuda.kernel_h
 294  * and gen->cuda.kernel_c.
 295  */
 296 static void print_kernel_headers(struct gpu_prog *prog,
 297         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 298 {
 299         isl_printer *p;
 300
 301         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 302         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 303         p = print_kernel_header(p, prog, kernel);
 304         p = isl_printer_print_str(p, ";");
 305         p = isl_printer_end_line(p);
 306         isl_printer_free(p);
 307
 308         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 309         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 310         p = print_kernel_header(p, prog, kernel);
 311         p = isl_printer_end_line(p);
 312         isl_printer_free(p);
 313 }
 314
 315 static void print_indent(FILE *dst, int indent)
 316 {
 317         fprintf(dst, "%*s", indent, "");
 318 }
 319
 320 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 321 {
 322         int i;
 323         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 324         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 325                                         "threadIdx.z" };
 326
 327         if (kernel->n_grid > 0) {
 328                 print_indent(out, 4);
 329                 fprintf(out, "int ");
 330                 for (i = 0; i < kernel->n_grid; ++i) {
 331                         if (i)
 332                                 fprintf(out, ", ");
 333                         fprintf(out, "b%d = %s",
 334                                 i, block_dims[kernel->n_grid - 1 - i]);
 335                 }
 336                 fprintf(out, ";\n");
 337         }
 338
 339         if (kernel->n_block > 0) {
 340                 print_indent(out, 4);
 341                 fprintf(out, "int ");
 342                 for (i = 0; i < kernel->n_block; ++i) {
 343                         if (i)
 344                                 fprintf(out, ", ");
 345                         fprintf(out, "t%d = %s",
 346                                 i, thread_dims[kernel->n_block - 1 - i]);
 347                 }
 348                 fprintf(out, ";\n");
 349         }
 350 }
 351
 352 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
 353 {
 354         int j;
 355         isl_int v;
 356
 357         print_indent(out, 4);
 358         if (var->type == ppcg_access_shared)
 359                 fprintf(out, "__shared__ ");
 360         fprintf(out, "%s %s", var->array->type, var->name);
 361         isl_int_init(v);
 362         for (j = 0; j < var->array->n_index; ++j) {
 363                 fprintf(out, "[");
 364                 isl_vec_get_element(var->size, j, &v);
 365                 isl_int_print(out, v, 0);
 366                 fprintf(out, "]");
 367         }
 368         isl_int_clear(v);
 369         fprintf(out, ";\n");
 370 }
 371
 372 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
 373 {
 374         int i;
 375
 376         for (i = 0; i < kernel->n_var; ++i)
 377                 print_kernel_var(out, &kernel->var[i]);
 378 }
 379
 380 /* Print an access to the element in the private/shared memory copy
 381  * described by "stmt".  The index of the copy is recorded in
 382  * stmt->local_index.
 383  */
 384 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
 385         struct ppcg_kernel_stmt *stmt)
 386 {
 387         int i;
 388         const char *name;
 389         struct gpu_array_info *array = stmt->u.c.array;
 390
 391         name = isl_pw_multi_aff_get_tuple_name(stmt->u.c.local_index,
 392                                                 isl_dim_out);
 393         p = isl_printer_print_str(p, name);
 394         for (i = 0; i < array->n_index; ++i) {
 395                 isl_pw_aff *pa;
 396                 pa = isl_pw_multi_aff_get_pw_aff(stmt->u.c.local_index, i);
 397
 398                 p = isl_printer_print_str(p, "[");
 399                 p = isl_printer_print_pw_aff(p, pa);
 400                 p = isl_printer_print_str(p, "]");
 401
 402                 isl_pw_aff_free(pa);
 403         }
 404
 405         return p;
 406 }
 407
 408 /* Print an access to the element in the global memory copy
 409  * described by "stmt".  The index of the copy is recorded in
 410  * stmt->index.
 411  *
 412  * The copy in global memory has been linearized, so we need to take
 413  * the array size into account.
 414  */
 415 static __isl_give isl_printer *stmt_print_global_index(
 416         __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
 417 {
 418         int i;
 419         struct gpu_array_info *array = stmt->u.c.array;
 420         isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
 421
 422         if (gpu_array_is_scalar(array)) {
 423                 if (!array->read_only)
 424                         p = isl_printer_print_str(p, "*");
 425                 p = isl_printer_print_str(p, array->name);
 426                 return p;
 427         }
 428
 429         p = isl_printer_print_str(p, array->name);
 430         p = isl_printer_print_str(p, "[");
 431         for (i = 0; i + 1 < array->n_index; ++i)
 432                 p = isl_printer_print_str(p, "(");
 433         for (i = 0; i < array->n_index; ++i) {
 434                 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(stmt->u.c.index, i);
 435                 pa = isl_pw_aff_coalesce(pa);
 436                 pa = isl_pw_aff_gist_params(pa, isl_set_copy(stmt->u.c.domain));
 437                 if (i) {
 438                         isl_pw_aff *bound_i;
 439                         bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 440                         p = isl_printer_print_str(p, ") * (");
 441                         p = isl_printer_print_pw_aff(p, bound_i);
 442                         p = isl_printer_print_str(p, ") + ");
 443                         isl_pw_aff_free(bound_i);
 444                 }
 445                 p = isl_printer_print_pw_aff(p, pa);
 446                 isl_pw_aff_free(pa);
 447         }
 448         p = isl_printer_print_str(p, "]");
 449
 450         return p;
 451 }
 452
 453 /* Print a copy statement.
 454  *
 455  * A read copy statement is printed as
 456  *
 457  *      local = global;
 458  *
 459  * while a write copy statement is printed as
 460  *
 461  *      global = local;
 462  */
 463 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
 464         struct ppcg_kernel_stmt *stmt)
 465 {
 466         p = isl_printer_start_line(p);
 467         if (stmt->u.c.read) {
 468                 p = stmt_print_local_index(p, stmt);
 469                 p = isl_printer_print_str(p, " = ");
 470                 p = stmt_print_global_index(p, stmt);
 471         } else {
 472                 p = stmt_print_global_index(p, stmt);
 473                 p = isl_printer_print_str(p, " = ");
 474                 p = stmt_print_local_index(p, stmt);
 475         }
 476         p = isl_printer_print_str(p, ";");
 477         p = isl_printer_end_line(p);
 478
 479         return p;
 480 }
 481
 482 /* Print a sync statement.
 483  */
 484 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 485         struct ppcg_kernel_stmt *stmt)
 486 {
 487         p = isl_printer_start_line(p);
 488         p = isl_printer_print_str(p, "__syncthreads();");
 489         p = isl_printer_end_line(p);
 490
 491         return p;
 492 }
 493
 494 /* Print an access based on the information in "access".
 495  * If this an access to global memory, then the index expression
 496  * is linearized.
 497  *
 498  * If access->array is NULL, then we are
 499  * accessing an iterator in the original program.
 500  */
 501 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
 502         struct ppcg_kernel_access *access)
 503 {
 504         int i;
 505         unsigned n_index;
 506         struct gpu_array_info *array;
 507         isl_pw_aff_list *bound;
 508
 509         array = access->array;
 510         bound = array ? access->local_array->bound : NULL;
 511         if (!array)
 512                 p = isl_printer_print_str(p, "(");
 513         else {
 514                 if (access->type == ppcg_access_global &&
 515                     gpu_array_is_scalar(array) && !array->read_only)
 516                         p = isl_printer_print_str(p, "*");
 517                 p = isl_printer_print_str(p, access->local_name);
 518                 if (gpu_array_is_scalar(array))
 519                         return p;
 520                 p = isl_printer_print_str(p, "[");
 521         }
 522
 523         n_index = isl_ast_expr_list_n_ast_expr(access->index);
 524         if (access->type == ppcg_access_global)
 525                 for (i = 0; i + 1 < n_index; ++i)
 526                         p = isl_printer_print_str(p, "(");
 527
 528         for (i = 0; i < n_index; ++i) {
 529                 isl_ast_expr *index;
 530
 531                 index = isl_ast_expr_list_get_ast_expr(access->index, i);
 532                 if (array && i) {
 533                         if (access->type == ppcg_access_global) {
 534                                 isl_pw_aff *bound_i;
 535                                 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
 536                                 p = isl_printer_print_str(p, ") * (");
 537                                 p = isl_printer_print_pw_aff(p, bound_i);
 538                                 p = isl_printer_print_str(p, ") + ");
 539                                 isl_pw_aff_free(bound_i);
 540                         } else
 541                                 p = isl_printer_print_str(p, "][");
 542                 }
 543                 p = isl_printer_print_ast_expr(p, index);
 544                 isl_ast_expr_free(index);
 545         }
 546         if (!array)
 547                 p = isl_printer_print_str(p, ")");
 548         else
 549                 p = isl_printer_print_str(p, "]");
 550
 551         return p;
 552 }
 553
 554 struct cuda_access_print_info {
 555         int i;
 556         struct ppcg_kernel_stmt *stmt;
 557 };
 558
 559 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
 560  * with the pet printer. This means that whenever the pet printer prints a
 561  * pet access expression we have the corresponding cuda access available and can
 562  * print the modified access.
 563  */
 564 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
 565         struct pet_expr *expr, void *usr)
 566 {
 567         struct cuda_access_print_info *info =
 568                 (struct cuda_access_print_info *) usr;
 569
 570         p = print_access(p, &info->stmt->u.d.access[info->i]);
 571         info->i++;
 572
 573         return p;
 574 }
 575
 576 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
 577         struct ppcg_kernel_stmt *stmt)
 578 {
 579         struct cuda_access_print_info info;
 580
 581         info.i = 0;
 582         info.stmt = stmt;
 583
 584         p = isl_printer_start_line(p);
 585         p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
 586         p = isl_printer_print_str(p, ";");
 587         p = isl_printer_end_line(p);
 588
 589         return p;
 590 }
 591
 592 /* This function is called for each user statement in the AST,
 593  * i.e., for each kernel body statement, copy statement or sync statement.
 594  */
 595 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 596         __isl_keep isl_ast_node *node, void *user)
 597 {
 598         isl_id *id;
 599         struct ppcg_kernel_stmt *stmt;
 600
 601         id = isl_ast_node_get_annotation(node);
 602         stmt = isl_id_get_user(id);
 603         isl_id_free(id);
 604
 605         switch (stmt->type) {
 606         case ppcg_kernel_copy:
 607                 return print_copy(p, stmt);
 608         case ppcg_kernel_sync:
 609                 return print_sync(p, stmt);
 610         case ppcg_kernel_domain:
 611                 return print_stmt_body(p, stmt);
 612         }
 613
 614         return p;
 615 }
 616
 617 static int print_macro(enum isl_ast_op_type type, void *user)
 618 {
 619         isl_printer **p = user;
 620
 621         if (type == isl_ast_op_fdiv_q)
 622                 return 0;
 623
 624         *p = isl_ast_op_type_print_macro(type, *p);
 625
 626         return 0;
 627 }
 628
 629 /* Print the required macros for "node", including one for floord.
 630  * We always print a macro for floord as it may also appear in the statements.
 631  */
 632 static __isl_give isl_printer *print_macros(
 633         __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
 634 {
 635         p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
 636         if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
 637                 return isl_printer_free(p);
 638         return p;
 639 }
 640
 641 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 642         struct cuda_info *cuda)
 643 {
 644         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 645         isl_ast_print_options *print_options;
 646         isl_printer *p;
 647
 648         print_kernel_headers(prog, kernel, cuda);
 649         fprintf(cuda->kernel_c, "{\n");
 650         print_kernel_iterators(cuda->kernel_c, kernel);
 651         print_kernel_vars(cuda->kernel_c, kernel);
 652         fprintf(cuda->kernel_c, "\n");
 653
 654         print_options = isl_ast_print_options_alloc(ctx);
 655         print_options = isl_ast_print_options_set_print_user(print_options,
 656                                                     &print_kernel_stmt, NULL);
 657
 658         p = isl_printer_to_file(ctx, cuda->kernel_c);
 659         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 660         p = isl_printer_indent(p, 4);
 661         p = print_macros(kernel->tree, p);
 662         p = isl_ast_node_print(kernel->tree, p, print_options);
 663         isl_printer_free(p);
 664
 665         isl_ast_print_options_free(print_options);
 666
 667         fprintf(cuda->kernel_c, "}\n");
 668 }
 669
 670 struct print_host_user_data {
 671         struct cuda_info *cuda;
 672         struct gpu_prog *prog;
 673 };
 674
 675 /* Print the user statement of the host code to "p".
 676  *
 677  * In particular, print a block of statements that defines the grid
 678  * and the block and then launches the kernel.
 679  */
 680 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 681         __isl_keep isl_ast_node *node, void *user)
 682 {
 683         isl_id *id;
 684         struct ppcg_kernel *kernel;
 685         struct print_host_user_data *data;
 686
 687         id = isl_ast_node_get_annotation(node);
 688         kernel = isl_id_get_user(id);
 689         isl_id_free(id);
 690
 691         data = (struct print_host_user_data *) user;
 692
 693         p = isl_printer_start_line(p);
 694         p = isl_printer_print_str(p, "{");
 695         p = isl_printer_end_line(p);
 696         p = isl_printer_indent(p, 2);
 697
 698         p = isl_printer_start_line(p);
 699         p = isl_printer_print_str(p, "dim3 k");
 700         p = isl_printer_print_int(p, kernel->id);
 701         p = isl_printer_print_str(p, "_dimBlock");
 702         print_reverse_list(isl_printer_get_file(p),
 703                                 kernel->n_block, kernel->block_dim);
 704         p = isl_printer_print_str(p, ";");
 705         p = isl_printer_end_line(p);
 706
 707         p = print_grid(p, kernel);
 708
 709         p = isl_printer_start_line(p);
 710         p = isl_printer_print_str(p, "kernel");
 711         p = isl_printer_print_int(p, kernel->id);
 712         p = isl_printer_print_str(p, " <<<k");
 713         p = isl_printer_print_int(p, kernel->id);
 714         p = isl_printer_print_str(p, "_dimGrid, k");
 715         p = isl_printer_print_int(p, kernel->id);
 716         p = isl_printer_print_str(p, "_dimBlock>>> (");
 717         p = print_kernel_arguments(p, data->prog, kernel, 0);
 718         p = isl_printer_print_str(p, ");");
 719         p = isl_printer_end_line(p);
 720
 721         p = isl_printer_start_line(p);
 722         p = isl_printer_print_str(p, "cudaCheckKernel();");
 723         p = isl_printer_end_line(p);
 724
 725         p = isl_printer_indent(p, -2);
 726         p = isl_printer_start_line(p);
 727         p = isl_printer_print_str(p, "}");
 728         p = isl_printer_end_line(p);
 729
 730         p = isl_printer_start_line(p);
 731         p = isl_printer_end_line(p);
 732
 733         print_kernel(data->prog, kernel, data->cuda);
 734
 735         return p;
 736 }
 737
 738 static void print_host_code(FILE *out, struct gpu_prog *prog,
 739         __isl_keep isl_ast_node *tree, struct cuda_info *cuda)
 740 {
 741         isl_ast_print_options *print_options;
 742         isl_printer *p;
 743         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 744         struct print_host_user_data data = { cuda, prog };
 745
 746         print_options = isl_ast_print_options_alloc(ctx);
 747         print_options = isl_ast_print_options_set_print_user(print_options,
 748                                                 &print_host_user, &data);
 749
 750         p = isl_printer_to_file(ctx, out);
 751         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 752         p = print_macros(tree, p);
 753         p = isl_ast_node_print(tree, p, print_options);
 754         isl_printer_free(p);
 755
 756         isl_ast_print_options_free(print_options);
 757 }
 758
 759 static void copy_arrays_from_device(FILE *out, struct gpu_prog *prog)
 760 {
 761         int i;
 762         isl_union_set *write;
 763         write = isl_union_map_range(isl_union_map_copy(prog->write));
 764
 765         for (i = 0; i < prog->n_array; ++i) {
 766                 isl_space *dim;
 767                 isl_set *write_i;
 768                 int empty;
 769
 770                 dim = isl_space_copy(prog->array[i].dim);
 771                 write_i = isl_union_set_extract_set(write, dim);
 772                 empty = isl_set_fast_is_empty(write_i);
 773                 isl_set_free(write_i);
 774                 if (empty)
 775                         continue;
 776
 777                 fprintf(out, "cudaCheckReturn(cudaMemcpy(");
 778                 if (gpu_array_is_scalar(&prog->array[i]))
 779                         fprintf(out, "&%s, ", prog->array[i].name);
 780                 else
 781                         fprintf(out, "%s, ", prog->array[i].name);
 782                 fprintf(out, "dev_%s, ",  prog->array[i].name);
 783                 print_array_size(prog->ctx, out, &prog->array[i]);
 784                 fprintf(out, ", cudaMemcpyDeviceToHost));\n");
 785         }
 786
 787         isl_union_set_free(write);
 788         fprintf(out, "\n");
 789 }
 790
 791 static void free_device_arrays(FILE *out, struct gpu_prog *prog)
 792 {
 793         int i;
 794
 795         for (i = 0; i < prog->n_array; ++i) {
 796                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 797                         continue;
 798                 fprintf(out, "cudaCheckReturn(cudaFree(dev_%s));\n",
 799                         prog->array[i].name);
 800         }
 801 }
 802
 803 int generate_cuda(isl_ctx *ctx, struct pet_scop *scop,
 804         struct ppcg_options *options, const char *input)
 805 {
 806         struct cuda_info cuda;
 807         struct gpu_prog *prog;
 808         isl_ast_node *tree;
 809
 810         if (!scop)
 811                 return -1;
 812
 813         scop->context = add_context_from_str(scop->context, options->ctx);
 814
 815         prog = gpu_prog_alloc(ctx, scop);
 816
 817         tree = generate_gpu(ctx, prog, options);
 818
 819         cuda_open_files(&cuda, input);
 820
 821         fprintf(cuda.host_c, "{\n");
 822
 823         print_cuda_macros(cuda.host_c);
 824
 825         declare_device_arrays(cuda.host_c, prog);
 826         allocate_device_arrays(cuda.host_c, prog);
 827         copy_arrays_to_device(cuda.host_c, prog);
 828
 829         print_host_code(cuda.host_c, prog, tree, &cuda);
 830         isl_ast_node_free(tree);
 831
 832         copy_arrays_from_device(cuda.host_c, prog);
 833         free_device_arrays(cuda.host_c, prog);
 834
 835         fprintf(cuda.host_c, "}\n");
 836
 837         cuda_close_files(&cuda);
 838
 839         gpu_prog_free(prog);
 840
 841         return 0;
 842 }