cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the GNU LGPLv2.1 license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18
  19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  20 {
  21         const char *macros =
  22                 "#define cudaCheckReturn(ret) \\\n"
  23                 "  do { \\\n"
  24                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  25                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  26                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  27                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  28                 "      fflush(stderr); \\\n"
  29                 "    } \\\n"
  30                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  31                 "  } while(0)\n"
  32                 "#define cudaCheckKernel() \\\n"
  33                 "  do { \\\n"
  34                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  35                 "  } while(0)\n\n";
  36
  37         p = isl_printer_print_str(p, macros);
  38         return p;
  39 }
  40
  41 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  42         struct gpu_prog *prog)
  43 {
  44         int i;
  45
  46         for (i = 0; i < prog->n_array; ++i) {
  47                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  48                         continue;
  49                 p = isl_printer_start_line(p);
  50                 p = isl_printer_print_str(p, prog->array[i].type);
  51                 p = isl_printer_print_str(p, " *dev_");
  52                 p = isl_printer_print_str(p, prog->array[i].name);
  53                 p = isl_printer_print_str(p, ";");
  54                 p = isl_printer_end_line(p);
  55         }
  56         p = isl_printer_start_line(p);
  57         p = isl_printer_end_line(p);
  58         return p;
  59 }
  60
  61 static __isl_give isl_printer *allocate_device_arrays(
  62         __isl_take isl_printer *p, struct gpu_prog *prog)
  63 {
  64         int i;
  65
  66         for (i = 0; i < prog->n_array; ++i) {
  67                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  68                         continue;
  69                 p = isl_printer_start_line(p);
  70                 p = isl_printer_print_str(p,
  71                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
  72                 p = isl_printer_print_str(p, prog->array[i].name);
  73                 p = isl_printer_print_str(p, ", ");
  74                 p = gpu_array_info_print_size(p, &prog->array[i]);
  75                 p = isl_printer_print_str(p, "));");
  76                 p = isl_printer_end_line(p);
  77         }
  78         p = isl_printer_start_line(p);
  79         p = isl_printer_end_line(p);
  80         return p;
  81 }
  82
  83 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
  84         struct gpu_prog *prog)
  85 {
  86         int i;
  87
  88         for (i = 0; i < prog->n_array; ++i) {
  89                 isl_space *dim;
  90                 isl_set *read_i;
  91                 int empty;
  92
  93                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
  94                         continue;
  95
  96                 dim = isl_space_copy(prog->array[i].dim);
  97                 read_i = isl_union_set_extract_set(prog->copy_in, dim);
  98                 empty = isl_set_fast_is_empty(read_i);
  99                 isl_set_free(read_i);
 100                 if (empty)
 101                         continue;
 102
 103                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 104                 p = isl_printer_print_str(p, prog->array[i].name);
 105                 p = isl_printer_print_str(p, ", ");
 106
 107                 if (gpu_array_is_scalar(&prog->array[i]))
 108                         p = isl_printer_print_str(p, "&");
 109                 p = isl_printer_print_str(p, prog->array[i].name);
 110                 p = isl_printer_print_str(p, ", ");
 111
 112                 p = gpu_array_info_print_size(p, &prog->array[i]);
 113                 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 114                 p = isl_printer_end_line(p);
 115         }
 116         p = isl_printer_start_line(p);
 117         p = isl_printer_end_line(p);
 118         return p;
 119 }
 120
 121 static void print_reverse_list(FILE *out, int len, int *list)
 122 {
 123         int i;
 124
 125         if (len == 0)
 126                 return;
 127
 128         fprintf(out, "(");
 129         for (i = 0; i < len; ++i) {
 130                 if (i)
 131                         fprintf(out, ", ");
 132                 fprintf(out, "%d", list[len - 1 - i]);
 133         }
 134         fprintf(out, ")");
 135 }
 136
 137 /* Print the effective grid size as a list of the sizes in each
 138  * dimension, from innermost to outermost.
 139  */
 140 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 141         struct ppcg_kernel *kernel)
 142 {
 143         int i;
 144         int dim;
 145
 146         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 147         if (dim == 0)
 148                 return p;
 149
 150         p = isl_printer_print_str(p, "(");
 151         for (i = dim - 1; i >= 0; --i) {
 152                 isl_pw_aff *bound;
 153
 154                 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
 155                 p = isl_printer_print_pw_aff(p, bound);
 156                 isl_pw_aff_free(bound);
 157
 158                 if (i > 0)
 159                         p = isl_printer_print_str(p, ", ");
 160         }
 161
 162         p = isl_printer_print_str(p, ")");
 163
 164         return p;
 165 }
 166
 167 /* Print the grid definition.
 168  */
 169 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 170         struct ppcg_kernel *kernel)
 171 {
 172         p = isl_printer_start_line(p);
 173         p = isl_printer_print_str(p, "dim3 k");
 174         p = isl_printer_print_int(p, kernel->id);
 175         p = isl_printer_print_str(p, "_dimGrid");
 176         p = print_grid_size(p, kernel);
 177         p = isl_printer_print_str(p, ";");
 178         p = isl_printer_end_line(p);
 179
 180         return p;
 181 }
 182
 183 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 184  * then print a declaration (including the types of the arguments).
 185  *
 186  * The arguments are printed in the following order
 187  * - the arrays accessed by the kernel
 188  * - the parameters
 189  * - the host loop iterators
 190  */
 191 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 192         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 193 {
 194         int i, n;
 195         int first = 1;
 196         unsigned nparam;
 197         isl_space *space;
 198         const char *type;
 199
 200         for (i = 0; i < prog->n_array; ++i) {
 201                 isl_set *arr;
 202                 int empty;
 203
 204                 space = isl_space_copy(prog->array[i].dim);
 205                 arr = isl_union_set_extract_set(kernel->arrays, space);
 206                 empty = isl_set_fast_is_empty(arr);
 207                 isl_set_free(arr);
 208                 if (empty)
 209                         continue;
 210
 211                 if (!first)
 212                         p = isl_printer_print_str(p, ", ");
 213
 214                 if (types) {
 215                         p = isl_printer_print_str(p, prog->array[i].type);
 216                         p = isl_printer_print_str(p, " ");
 217                 }
 218
 219                 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
 220                         p = isl_printer_print_str(p, prog->array[i].name);
 221                 } else {
 222                         if (types)
 223                                 p = isl_printer_print_str(p, "*");
 224                         else
 225                                 p = isl_printer_print_str(p, "dev_");
 226                         p = isl_printer_print_str(p, prog->array[i].name);
 227                 }
 228
 229                 first = 0;
 230         }
 231
 232         space = isl_union_set_get_space(kernel->arrays);
 233         nparam = isl_space_dim(space, isl_dim_param);
 234         for (i = 0; i < nparam; ++i) {
 235                 const char *name;
 236
 237                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 238
 239                 if (!first)
 240                         p = isl_printer_print_str(p, ", ");
 241                 if (types)
 242                         p = isl_printer_print_str(p, "int ");
 243                 p = isl_printer_print_str(p, name);
 244
 245                 first = 0;
 246         }
 247         isl_space_free(space);
 248
 249         n = isl_space_dim(kernel->space, isl_dim_set);
 250         type = isl_options_get_ast_iterator_type(prog->ctx);
 251         for (i = 0; i < n; ++i) {
 252                 const char *name;
 253                 isl_id *id;
 254
 255                 if (!first)
 256                         p = isl_printer_print_str(p, ", ");
 257                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 258                 if (types) {
 259                         p = isl_printer_print_str(p, type);
 260                         p = isl_printer_print_str(p, " ");
 261                 }
 262                 p = isl_printer_print_str(p, name);
 263
 264                 first = 0;
 265         }
 266
 267         return p;
 268 }
 269
 270 /* Print the header of the given kernel.
 271  */
 272 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 273         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 274 {
 275         p = isl_printer_start_line(p);
 276         p = isl_printer_print_str(p, "__global__ void kernel");
 277         p = isl_printer_print_int(p, kernel->id);
 278         p = isl_printer_print_str(p, "(");
 279         p = print_kernel_arguments(p, prog, kernel, 1);
 280         p = isl_printer_print_str(p, ")");
 281
 282         return p;
 283 }
 284
 285 /* Print the header of the given kernel to both gen->cuda.kernel_h
 286  * and gen->cuda.kernel_c.
 287  */
 288 static void print_kernel_headers(struct gpu_prog *prog,
 289         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 290 {
 291         isl_printer *p;
 292
 293         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 294         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 295         p = print_kernel_header(p, prog, kernel);
 296         p = isl_printer_print_str(p, ";");
 297         p = isl_printer_end_line(p);
 298         isl_printer_free(p);
 299
 300         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 301         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 302         p = print_kernel_header(p, prog, kernel);
 303         p = isl_printer_end_line(p);
 304         isl_printer_free(p);
 305 }
 306
 307 static void print_indent(FILE *dst, int indent)
 308 {
 309         fprintf(dst, "%*s", indent, "");
 310 }
 311
 312 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 313 {
 314         int i;
 315         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 316         const char *type;
 317         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 318         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 319                                         "threadIdx.z" };
 320
 321         type = isl_options_get_ast_iterator_type(ctx);
 322
 323         if (kernel->n_grid > 0) {
 324                 print_indent(out, 4);
 325                 fprintf(out, "%s ", type);
 326                 for (i = 0; i < kernel->n_grid; ++i) {
 327                         if (i)
 328                                 fprintf(out, ", ");
 329                         fprintf(out, "b%d = %s",
 330                                 i, block_dims[kernel->n_grid - 1 - i]);
 331                 }
 332                 fprintf(out, ";\n");
 333         }
 334
 335         if (kernel->n_block > 0) {
 336                 print_indent(out, 4);
 337                 fprintf(out, "%s ", type);
 338                 for (i = 0; i < kernel->n_block; ++i) {
 339                         if (i)
 340                                 fprintf(out, ", ");
 341                         fprintf(out, "t%d = %s",
 342                                 i, thread_dims[kernel->n_block - 1 - i]);
 343                 }
 344                 fprintf(out, ";\n");
 345         }
 346 }
 347
 348 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 349         struct ppcg_kernel_var *var)
 350 {
 351         int j;
 352
 353         p = isl_printer_start_line(p);
 354         if (var->type == ppcg_access_shared)
 355                 p = isl_printer_print_str(p, "__shared__ ");
 356         p = isl_printer_print_str(p, var->array->type);
 357         p = isl_printer_print_str(p, " ");
 358         p = isl_printer_print_str(p,  var->name);
 359         for (j = 0; j < var->array->n_index; ++j) {
 360                 isl_val *v;
 361
 362                 p = isl_printer_print_str(p, "[");
 363                 v = isl_vec_get_element_val(var->size, j);
 364                 p = isl_printer_print_val(p, v);
 365                 isl_val_free(v);
 366                 p = isl_printer_print_str(p, "]");
 367         }
 368         p = isl_printer_print_str(p, ";");
 369         p = isl_printer_end_line(p);
 370
 371         return p;
 372 }
 373
 374 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 375         struct ppcg_kernel *kernel)
 376 {
 377         int i;
 378
 379         for (i = 0; i < kernel->n_var; ++i)
 380                 p = print_kernel_var(p, &kernel->var[i]);
 381
 382         return p;
 383 }
 384
 385 /* Print a sync statement.
 386  */
 387 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 388         struct ppcg_kernel_stmt *stmt)
 389 {
 390         p = isl_printer_start_line(p);
 391         p = isl_printer_print_str(p, "__syncthreads();");
 392         p = isl_printer_end_line(p);
 393
 394         return p;
 395 }
 396
 397 /* This function is called for each user statement in the AST,
 398  * i.e., for each kernel body statement, copy statement or sync statement.
 399  */
 400 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 401         __isl_take isl_ast_print_options *print_options,
 402         __isl_keep isl_ast_node *node, void *user)
 403 {
 404         isl_id *id;
 405         struct ppcg_kernel_stmt *stmt;
 406
 407         id = isl_ast_node_get_annotation(node);
 408         stmt = isl_id_get_user(id);
 409         isl_id_free(id);
 410
 411         isl_ast_print_options_free(print_options);
 412
 413         switch (stmt->type) {
 414         case ppcg_kernel_copy:
 415                 return ppcg_kernel_print_copy(p, stmt);
 416         case ppcg_kernel_sync:
 417                 return print_sync(p, stmt);
 418         case ppcg_kernel_domain:
 419                 return ppcg_kernel_print_domain(p, stmt);
 420         }
 421
 422         return p;
 423 }
 424
 425 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 426         struct cuda_info *cuda)
 427 {
 428         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 429         isl_ast_print_options *print_options;
 430         isl_printer *p;
 431
 432         print_kernel_headers(prog, kernel, cuda);
 433         fprintf(cuda->kernel_c, "{\n");
 434         print_kernel_iterators(cuda->kernel_c, kernel);
 435
 436         p = isl_printer_to_file(ctx, cuda->kernel_c);
 437         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 438         p = isl_printer_indent(p, 4);
 439
 440         p = print_kernel_vars(p, kernel);
 441         p = isl_printer_end_line(p);
 442         p = gpu_print_macros(p, kernel->tree);
 443
 444         print_options = isl_ast_print_options_alloc(ctx);
 445         print_options = isl_ast_print_options_set_print_user(print_options,
 446                                                     &print_kernel_stmt, NULL);
 447         p = isl_ast_node_print(kernel->tree, p, print_options);
 448         isl_printer_free(p);
 449
 450         fprintf(cuda->kernel_c, "}\n");
 451 }
 452
 453 struct print_host_user_data {
 454         struct cuda_info *cuda;
 455         struct gpu_prog *prog;
 456 };
 457
 458 /* Print the user statement of the host code to "p".
 459  *
 460  * In particular, print a block of statements that defines the grid
 461  * and the block and then launches the kernel.
 462  */
 463 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 464         __isl_take isl_ast_print_options *print_options,
 465         __isl_keep isl_ast_node *node, void *user)
 466 {
 467         isl_id *id;
 468         struct ppcg_kernel *kernel;
 469         struct print_host_user_data *data;
 470
 471         id = isl_ast_node_get_annotation(node);
 472         kernel = isl_id_get_user(id);
 473         isl_id_free(id);
 474
 475         data = (struct print_host_user_data *) user;
 476
 477         p = isl_printer_start_line(p);
 478         p = isl_printer_print_str(p, "{");
 479         p = isl_printer_end_line(p);
 480         p = isl_printer_indent(p, 2);
 481
 482         p = isl_printer_start_line(p);
 483         p = isl_printer_print_str(p, "dim3 k");
 484         p = isl_printer_print_int(p, kernel->id);
 485         p = isl_printer_print_str(p, "_dimBlock");
 486         print_reverse_list(isl_printer_get_file(p),
 487                                 kernel->n_block, kernel->block_dim);
 488         p = isl_printer_print_str(p, ";");
 489         p = isl_printer_end_line(p);
 490
 491         p = print_grid(p, kernel);
 492
 493         p = isl_printer_start_line(p);
 494         p = isl_printer_print_str(p, "kernel");
 495         p = isl_printer_print_int(p, kernel->id);
 496         p = isl_printer_print_str(p, " <<<k");
 497         p = isl_printer_print_int(p, kernel->id);
 498         p = isl_printer_print_str(p, "_dimGrid, k");
 499         p = isl_printer_print_int(p, kernel->id);
 500         p = isl_printer_print_str(p, "_dimBlock>>> (");
 501         p = print_kernel_arguments(p, data->prog, kernel, 0);
 502         p = isl_printer_print_str(p, ");");
 503         p = isl_printer_end_line(p);
 504
 505         p = isl_printer_start_line(p);
 506         p = isl_printer_print_str(p, "cudaCheckKernel();");
 507         p = isl_printer_end_line(p);
 508
 509         p = isl_printer_indent(p, -2);
 510         p = isl_printer_start_line(p);
 511         p = isl_printer_print_str(p, "}");
 512         p = isl_printer_end_line(p);
 513
 514         p = isl_printer_start_line(p);
 515         p = isl_printer_end_line(p);
 516
 517         print_kernel(data->prog, kernel, data->cuda);
 518
 519         isl_ast_print_options_free(print_options);
 520
 521         return p;
 522 }
 523
 524 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 525         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 526         struct cuda_info *cuda)
 527 {
 528         isl_ast_print_options *print_options;
 529         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 530         struct print_host_user_data data = { cuda, prog };
 531
 532         print_options = isl_ast_print_options_alloc(ctx);
 533         print_options = isl_ast_print_options_set_print_user(print_options,
 534                                                 &print_host_user, &data);
 535
 536         p = gpu_print_macros(p, tree);
 537         p = isl_ast_node_print(tree, p, print_options);
 538
 539         return p;
 540 }
 541
 542 /* For each array that needs to be copied out (based on prog->copy_out),
 543  * copy the contents back from the GPU to the host.
 544  *
 545  * If any element of a given array appears in prog->copy_out, then its
 546  * entire extent is in prog->copy_out.  The bounds on this extent have
 547  * been precomputed in extract_array_info and are used in
 548  * gpu_array_info_print_size.
 549  */
 550 static __isl_give isl_printer *copy_arrays_from_device(
 551         __isl_take isl_printer *p, struct gpu_prog *prog)
 552 {
 553         int i;
 554         isl_union_set *copy_out;
 555         copy_out = isl_union_set_copy(prog->copy_out);
 556
 557         for (i = 0; i < prog->n_array; ++i) {
 558                 isl_space *dim;
 559                 isl_set *copy_out_i;
 560                 int empty;
 561
 562                 dim = isl_space_copy(prog->array[i].dim);
 563                 copy_out_i = isl_union_set_extract_set(copy_out, dim);
 564                 empty = isl_set_fast_is_empty(copy_out_i);
 565                 isl_set_free(copy_out_i);
 566                 if (empty)
 567                         continue;
 568
 569                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 570                 if (gpu_array_is_scalar(&prog->array[i]))
 571                         p = isl_printer_print_str(p, "&");
 572                 p = isl_printer_print_str(p, prog->array[i].name);
 573                 p = isl_printer_print_str(p, ", dev_");
 574                 p = isl_printer_print_str(p, prog->array[i].name);
 575                 p = isl_printer_print_str(p, ", ");
 576                 p = gpu_array_info_print_size(p, &prog->array[i]);
 577                 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 578                 p = isl_printer_end_line(p);
 579         }
 580
 581         isl_union_set_free(copy_out);
 582         p = isl_printer_start_line(p);
 583         p = isl_printer_end_line(p);
 584         return p;
 585 }
 586
 587 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 588         struct gpu_prog *prog)
 589 {
 590         int i;
 591
 592         for (i = 0; i < prog->n_array; ++i) {
 593                 if (gpu_array_is_read_only_scalar(&prog->array[i]))
 594                         continue;
 595                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 596                 p = isl_printer_print_str(p, prog->array[i].name);
 597                 p = isl_printer_print_str(p, "));");
 598                 p = isl_printer_end_line(p);
 599         }
 600
 601         return p;
 602 }
 603
 604 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
 605         struct ppcg_options *options, const char *input)
 606 {
 607         struct cuda_info cuda;
 608         struct gpu_prog *prog;
 609         isl_ast_node *tree;
 610         isl_printer *p;
 611
 612         if (!scop)
 613                 return -1;
 614
 615         prog = gpu_prog_alloc(ctx, scop);
 616
 617         tree = generate_gpu(ctx, prog, options);
 618
 619         cuda.start = scop->start;
 620         cuda.end = scop->end;
 621         cuda_open_files(&cuda, input);
 622
 623         p = isl_printer_to_file(ctx, cuda.host_c);
 624         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 625         p = ppcg_print_exposed_declarations(p, scop);
 626         p = ppcg_start_block(p);
 627
 628         p = print_cuda_macros(p);
 629
 630         p = declare_device_arrays(p, prog);
 631         p = allocate_device_arrays(p, prog);
 632         p = copy_arrays_to_device(p, prog);
 633
 634         p = print_host_code(p, prog, tree, &cuda);
 635         isl_ast_node_free(tree);
 636
 637         p = copy_arrays_from_device(p, prog);
 638         p = free_device_arrays(p, prog);
 639
 640         p = ppcg_end_block(p);
 641         isl_printer_free(p);
 642
 643         cuda_close_files(&cuda);
 644
 645         gpu_prog_free(prog);
 646
 647         return 0;
 648 }