gpu backend: create single kernel for entire subtree without permutable bands
[ppcg.git] / cuda.c
blob3063f6df6a5e41dea528faec776e34b18f464f34
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
36 " } while(0)\n\n";
38 p = isl_printer_print_str(p, macros);
39 return p;
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 struct gpu_array_info *array)
47 int i;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, array->type);
51 p = isl_printer_print_str(p, " ");
52 if (!array->linearize && array->n_index > 1)
53 p = isl_printer_print_str(p, "(");
54 p = isl_printer_print_str(p, "*dev_");
55 p = isl_printer_print_str(p, array->name);
56 if (!array->linearize && array->n_index > 1) {
57 p = isl_printer_print_str(p, ")");
58 for (i = 1; i < array->n_index; i++) {
59 p = isl_printer_print_str(p, "[");
60 p = isl_printer_print_pw_aff(p, array->bound[i]);
61 p = isl_printer_print_str(p, "]");
64 p = isl_printer_print_str(p, ";");
65 p = isl_printer_end_line(p);
67 return p;
70 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
71 struct gpu_prog *prog)
73 int i;
75 for (i = 0; i < prog->n_array; ++i) {
76 if (!gpu_array_requires_device_allocation(&prog->array[i]))
77 continue;
79 p = declare_device_array(p, &prog->array[i]);
81 p = isl_printer_start_line(p);
82 p = isl_printer_end_line(p);
83 return p;
86 static __isl_give isl_printer *allocate_device_arrays(
87 __isl_take isl_printer *p, struct gpu_prog *prog)
89 int i;
91 for (i = 0; i < prog->n_array; ++i) {
92 if (!gpu_array_requires_device_allocation(&prog->array[i]))
93 continue;
94 p = isl_printer_start_line(p);
95 p = isl_printer_print_str(p,
96 "cudaCheckReturn(cudaMalloc((void **) &dev_");
97 p = isl_printer_print_str(p, prog->array[i].name);
98 p = isl_printer_print_str(p, ", ");
99 p = gpu_array_info_print_size(p, &prog->array[i]);
100 p = isl_printer_print_str(p, "));");
101 p = isl_printer_end_line(p);
103 p = isl_printer_start_line(p);
104 p = isl_printer_end_line(p);
105 return p;
108 /* Print code to "p" for copying "array" from the host to the device
109 * in its entirety. The bounds on the extent of "array" have
110 * been precomputed in extract_array_info and are used in
111 * gpu_array_info_print_size.
113 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
114 struct gpu_array_info *array)
116 p = isl_printer_start_line(p);
117 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
118 p = isl_printer_print_str(p, array->name);
119 p = isl_printer_print_str(p, ", ");
121 if (gpu_array_is_scalar(array))
122 p = isl_printer_print_str(p, "&");
123 p = isl_printer_print_str(p, array->name);
124 p = isl_printer_print_str(p, ", ");
126 p = gpu_array_info_print_size(p, array);
127 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
128 p = isl_printer_end_line(p);
130 return p;
133 /* Print code to "p" for copying "array" back from the device to the host
134 * in its entirety. The bounds on the extent of "array" have
135 * been precomputed in extract_array_info and are used in
136 * gpu_array_info_print_size.
138 static __isl_give isl_printer *copy_array_from_device(
139 __isl_take isl_printer *p, struct gpu_array_info *array)
141 p = isl_printer_start_line(p);
142 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
143 if (gpu_array_is_scalar(array))
144 p = isl_printer_print_str(p, "&");
145 p = isl_printer_print_str(p, array->name);
146 p = isl_printer_print_str(p, ", dev_");
147 p = isl_printer_print_str(p, array->name);
148 p = isl_printer_print_str(p, ", ");
149 p = gpu_array_info_print_size(p, array);
150 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
151 p = isl_printer_end_line(p);
153 return p;
156 static void print_reverse_list(FILE *out, int len, int *list)
158 int i;
160 if (len == 0)
161 return;
163 fprintf(out, "(");
164 for (i = 0; i < len; ++i) {
165 if (i)
166 fprintf(out, ", ");
167 fprintf(out, "%d", list[len - 1 - i]);
169 fprintf(out, ")");
172 /* Print the effective grid size as a list of the sizes in each
173 * dimension, from innermost to outermost.
175 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
176 struct ppcg_kernel *kernel)
178 int i;
179 int dim;
181 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
182 if (dim == 0)
183 return p;
185 p = isl_printer_print_str(p, "(");
186 for (i = dim - 1; i >= 0; --i) {
187 isl_pw_aff *bound;
189 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
190 p = isl_printer_print_pw_aff(p, bound);
191 isl_pw_aff_free(bound);
193 if (i > 0)
194 p = isl_printer_print_str(p, ", ");
197 p = isl_printer_print_str(p, ")");
199 return p;
202 /* Print the grid definition.
204 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
205 struct ppcg_kernel *kernel)
207 p = isl_printer_start_line(p);
208 p = isl_printer_print_str(p, "dim3 k");
209 p = isl_printer_print_int(p, kernel->id);
210 p = isl_printer_print_str(p, "_dimGrid");
211 p = print_grid_size(p, kernel);
212 p = isl_printer_print_str(p, ";");
213 p = isl_printer_end_line(p);
215 return p;
218 /* Print the arguments to a kernel declaration or call. If "types" is set,
219 * then print a declaration (including the types of the arguments).
221 * The arguments are printed in the following order
222 * - the arrays accessed by the kernel
223 * - the parameters
224 * - the host loop iterators
226 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
227 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
229 int i, n;
230 int first = 1;
231 unsigned nparam;
232 isl_space *space;
233 const char *type;
235 for (i = 0; i < prog->n_array; ++i) {
236 int required;
238 required = ppcg_kernel_requires_array_argument(kernel, i);
239 if (required < 0)
240 return isl_printer_free(p);
241 if (!required)
242 continue;
244 if (!first)
245 p = isl_printer_print_str(p, ", ");
247 if (types)
248 p = gpu_array_info_print_declaration_argument(p,
249 &prog->array[i], NULL);
250 else
251 p = gpu_array_info_print_call_argument(p,
252 &prog->array[i]);
254 first = 0;
257 space = isl_union_set_get_space(kernel->arrays);
258 nparam = isl_space_dim(space, isl_dim_param);
259 for (i = 0; i < nparam; ++i) {
260 const char *name;
262 name = isl_space_get_dim_name(space, isl_dim_param, i);
264 if (!first)
265 p = isl_printer_print_str(p, ", ");
266 if (types)
267 p = isl_printer_print_str(p, "int ");
268 p = isl_printer_print_str(p, name);
270 first = 0;
272 isl_space_free(space);
274 n = isl_space_dim(kernel->space, isl_dim_set);
275 type = isl_options_get_ast_iterator_type(prog->ctx);
276 for (i = 0; i < n; ++i) {
277 const char *name;
279 if (!first)
280 p = isl_printer_print_str(p, ", ");
281 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
282 if (types) {
283 p = isl_printer_print_str(p, type);
284 p = isl_printer_print_str(p, " ");
286 p = isl_printer_print_str(p, name);
288 first = 0;
291 return p;
294 /* Print the header of the given kernel.
296 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
297 struct gpu_prog *prog, struct ppcg_kernel *kernel)
299 p = isl_printer_start_line(p);
300 p = isl_printer_print_str(p, "__global__ void kernel");
301 p = isl_printer_print_int(p, kernel->id);
302 p = isl_printer_print_str(p, "(");
303 p = print_kernel_arguments(p, prog, kernel, 1);
304 p = isl_printer_print_str(p, ")");
306 return p;
309 /* Print the header of the given kernel to both gen->cuda.kernel_h
310 * and gen->cuda.kernel_c.
312 static void print_kernel_headers(struct gpu_prog *prog,
313 struct ppcg_kernel *kernel, struct cuda_info *cuda)
315 isl_printer *p;
317 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
318 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
319 p = print_kernel_header(p, prog, kernel);
320 p = isl_printer_print_str(p, ";");
321 p = isl_printer_end_line(p);
322 isl_printer_free(p);
324 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
325 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
326 p = print_kernel_header(p, prog, kernel);
327 p = isl_printer_end_line(p);
328 isl_printer_free(p);
331 static void print_indent(FILE *dst, int indent)
333 fprintf(dst, "%*s", indent, "");
336 /* Print a list of iterators of type "type" with names "ids" to "out".
337 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
338 * In particular, the last iterator is assigned the x identifier
339 * (the first in the list of cuda identifiers).
341 static void print_iterators(FILE *out, const char *type,
342 __isl_keep isl_id_list *ids, const char *cuda_dims[])
344 int i, n;
346 n = isl_id_list_n_id(ids);
347 if (n <= 0)
348 return;
349 print_indent(out, 4);
350 fprintf(out, "%s ", type);
351 for (i = 0; i < n; ++i) {
352 isl_id *id;
354 if (i)
355 fprintf(out, ", ");
356 id = isl_id_list_get_id(ids, i);
357 fprintf(out, "%s = %s", isl_id_get_name(id),
358 cuda_dims[n - 1 - i]);
359 isl_id_free(id);
361 fprintf(out, ";\n");
364 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
366 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
367 const char *type;
368 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
369 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
370 "threadIdx.z" };
372 type = isl_options_get_ast_iterator_type(ctx);
374 print_iterators(out, type, kernel->block_ids, block_dims);
375 print_iterators(out, type, kernel->thread_ids, thread_dims);
378 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
379 struct ppcg_kernel_var *var)
381 int j;
383 p = isl_printer_start_line(p);
384 if (var->type == ppcg_access_shared)
385 p = isl_printer_print_str(p, "__shared__ ");
386 p = isl_printer_print_str(p, var->array->type);
387 p = isl_printer_print_str(p, " ");
388 p = isl_printer_print_str(p, var->name);
389 for (j = 0; j < var->array->n_index; ++j) {
390 isl_val *v;
392 p = isl_printer_print_str(p, "[");
393 v = isl_vec_get_element_val(var->size, j);
394 p = isl_printer_print_val(p, v);
395 isl_val_free(v);
396 p = isl_printer_print_str(p, "]");
398 p = isl_printer_print_str(p, ";");
399 p = isl_printer_end_line(p);
401 return p;
404 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
405 struct ppcg_kernel *kernel)
407 int i;
409 for (i = 0; i < kernel->n_var; ++i)
410 p = print_kernel_var(p, &kernel->var[i]);
412 return p;
415 /* Print a sync statement.
417 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
418 struct ppcg_kernel_stmt *stmt)
420 p = isl_printer_start_line(p);
421 p = isl_printer_print_str(p, "__syncthreads();");
422 p = isl_printer_end_line(p);
424 return p;
427 /* This function is called for each user statement in the AST,
428 * i.e., for each kernel body statement, copy statement or sync statement.
430 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
431 __isl_take isl_ast_print_options *print_options,
432 __isl_keep isl_ast_node *node, void *user)
434 isl_id *id;
435 struct ppcg_kernel_stmt *stmt;
437 id = isl_ast_node_get_annotation(node);
438 stmt = isl_id_get_user(id);
439 isl_id_free(id);
441 isl_ast_print_options_free(print_options);
443 switch (stmt->type) {
444 case ppcg_kernel_copy:
445 return ppcg_kernel_print_copy(p, stmt);
446 case ppcg_kernel_sync:
447 return print_sync(p, stmt);
448 case ppcg_kernel_domain:
449 return ppcg_kernel_print_domain(p, stmt);
452 return p;
455 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
456 struct cuda_info *cuda)
458 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
459 isl_ast_print_options *print_options;
460 isl_printer *p;
462 print_kernel_headers(prog, kernel, cuda);
463 fprintf(cuda->kernel_c, "{\n");
464 print_kernel_iterators(cuda->kernel_c, kernel);
466 p = isl_printer_to_file(ctx, cuda->kernel_c);
467 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
468 p = isl_printer_indent(p, 4);
470 p = print_kernel_vars(p, kernel);
471 p = isl_printer_end_line(p);
472 p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
473 p = ppcg_print_macros(p, kernel->tree);
475 print_options = isl_ast_print_options_alloc(ctx);
476 print_options = isl_ast_print_options_set_print_user(print_options,
477 &print_kernel_stmt, NULL);
478 p = isl_ast_node_print(kernel->tree, p, print_options);
479 isl_printer_free(p);
481 fprintf(cuda->kernel_c, "}\n");
484 /* Print a statement for copying an array to or from the device.
485 * The statement identifier is called "to_device_<array name>" or
486 * "from_device_<array name>" and its user pointer points
487 * to the gpu_array_info of the array that needs to be copied.
489 * Extract the array from the identifier and call
490 * copy_array_to_device or copy_array_from_device.
492 static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
493 __isl_keep isl_ast_node *node, struct gpu_prog *prog)
495 isl_ast_expr *expr, *arg;
496 isl_id *id;
497 const char *name;
498 struct gpu_array_info *array;
500 expr = isl_ast_node_user_get_expr(node);
501 arg = isl_ast_expr_get_op_arg(expr, 0);
502 id = isl_ast_expr_get_id(arg);
503 name = isl_id_get_name(id);
504 array = isl_id_get_user(id);
505 isl_id_free(id);
506 isl_ast_expr_free(arg);
507 isl_ast_expr_free(expr);
509 if (!name)
510 array = NULL;
511 if (!array)
512 return isl_printer_free(p);
514 if (!prefixcmp(name, "to_device"))
515 return copy_array_to_device(p, array);
516 else
517 return copy_array_from_device(p, array);
520 struct print_host_user_data {
521 struct cuda_info *cuda;
522 struct gpu_prog *prog;
525 /* Print the user statement of the host code to "p".
527 * The host code may contain original user statements, kernel launches and
528 * statements that copy data to/from the device.
529 * The original user statements and the kernel launches have
530 * an associated annotation, while the data copy statements do not.
531 * The latter are handled by print_to_from_device.
532 * The annotation on the user statements is called "user".
534 * In case of a kernel launch, print a block of statements that
535 * defines the grid and the block and then launches the kernel.
537 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
538 __isl_take isl_ast_print_options *print_options,
539 __isl_keep isl_ast_node *node, void *user)
541 isl_id *id;
542 int is_user;
543 struct ppcg_kernel *kernel;
544 struct ppcg_kernel_stmt *stmt;
545 struct print_host_user_data *data;
547 isl_ast_print_options_free(print_options);
549 data = (struct print_host_user_data *) user;
551 id = isl_ast_node_get_annotation(node);
552 if (!id)
553 return print_to_from_device(p, node, data->prog);
555 is_user = !strcmp(isl_id_get_name(id), "user");
556 kernel = is_user ? NULL : isl_id_get_user(id);
557 stmt = is_user ? isl_id_get_user(id) : NULL;
558 isl_id_free(id);
560 if (is_user)
561 return ppcg_kernel_print_domain(p, stmt);
563 p = isl_printer_start_line(p);
564 p = isl_printer_print_str(p, "{");
565 p = isl_printer_end_line(p);
566 p = isl_printer_indent(p, 2);
568 p = isl_printer_start_line(p);
569 p = isl_printer_print_str(p, "dim3 k");
570 p = isl_printer_print_int(p, kernel->id);
571 p = isl_printer_print_str(p, "_dimBlock");
572 print_reverse_list(isl_printer_get_file(p),
573 kernel->n_block, kernel->block_dim);
574 p = isl_printer_print_str(p, ";");
575 p = isl_printer_end_line(p);
577 p = print_grid(p, kernel);
579 p = isl_printer_start_line(p);
580 p = isl_printer_print_str(p, "kernel");
581 p = isl_printer_print_int(p, kernel->id);
582 p = isl_printer_print_str(p, " <<<k");
583 p = isl_printer_print_int(p, kernel->id);
584 p = isl_printer_print_str(p, "_dimGrid, k");
585 p = isl_printer_print_int(p, kernel->id);
586 p = isl_printer_print_str(p, "_dimBlock>>> (");
587 p = print_kernel_arguments(p, data->prog, kernel, 0);
588 p = isl_printer_print_str(p, ");");
589 p = isl_printer_end_line(p);
591 p = isl_printer_start_line(p);
592 p = isl_printer_print_str(p, "cudaCheckKernel();");
593 p = isl_printer_end_line(p);
595 p = isl_printer_indent(p, -2);
596 p = isl_printer_start_line(p);
597 p = isl_printer_print_str(p, "}");
598 p = isl_printer_end_line(p);
600 p = isl_printer_start_line(p);
601 p = isl_printer_end_line(p);
603 print_kernel(data->prog, kernel, data->cuda);
605 return p;
608 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
609 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
610 struct cuda_info *cuda)
612 isl_ast_print_options *print_options;
613 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
614 struct print_host_user_data data = { cuda, prog };
616 print_options = isl_ast_print_options_alloc(ctx);
617 print_options = isl_ast_print_options_set_print_user(print_options,
618 &print_host_user, &data);
620 p = ppcg_print_macros(p, tree);
621 p = isl_ast_node_print(tree, p, print_options);
623 return p;
626 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
627 struct gpu_prog *prog)
629 int i;
631 for (i = 0; i < prog->n_array; ++i) {
632 if (!gpu_array_requires_device_allocation(&prog->array[i]))
633 continue;
634 p = isl_printer_start_line(p);
635 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
636 p = isl_printer_print_str(p, prog->array[i].name);
637 p = isl_printer_print_str(p, "));");
638 p = isl_printer_end_line(p);
641 return p;
644 /* Given a gpu_prog "prog" and the corresponding transformed AST
645 * "tree", print the entire CUDA code to "p".
646 * "types" collects the types for which a definition has already
647 * been printed.
649 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
650 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
651 struct gpu_types *types, void *user)
653 struct cuda_info *cuda = user;
654 isl_printer *kernel;
656 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
657 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
658 kernel = gpu_print_types(kernel, types, prog);
659 isl_printer_free(kernel);
661 if (!kernel)
662 return isl_printer_free(p);
664 p = ppcg_start_block(p);
666 p = print_cuda_macros(p);
668 p = gpu_print_local_declarations(p, prog);
669 p = declare_device_arrays(p, prog);
670 p = allocate_device_arrays(p, prog);
672 p = print_host_code(p, prog, tree, cuda);
674 p = free_device_arrays(p, prog);
676 p = ppcg_end_block(p);
678 return p;
681 /* Transform the code in the file called "input" by replacing
682 * all scops by corresponding CUDA code.
683 * The names of the output files are derived from "input".
685 * We let generate_gpu do all the hard work and then let it call
686 * us back for printing the AST in print_cuda.
688 * To prepare for this printing, we first open the output files
689 * and we close them after generate_gpu has finished.
691 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
692 const char *input)
694 struct cuda_info cuda;
695 int r;
697 cuda_open_files(&cuda, input);
699 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
701 cuda_close_files(&cuda);
703 return r;