2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
13 #include "cuda_common.h"
16 #include "gpu_print.h"
20 static __isl_give isl_printer
*print_cuda_macros(__isl_take isl_printer
*p
)
23 "#define cudaCheckReturn(ret) \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
33 "#define cudaCheckKernel() \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
38 p
= isl_printer_print_str(p
, macros
);
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer
*declare_device_array(__isl_take isl_printer
*p
,
45 struct gpu_array_info
*array
)
49 p
= isl_printer_start_line(p
);
50 p
= isl_printer_print_str(p
, array
->type
);
51 p
= isl_printer_print_str(p
, " ");
52 if (!array
->linearize
&& array
->n_index
> 1)
53 p
= isl_printer_print_str(p
, "(");
54 p
= isl_printer_print_str(p
, "*dev_");
55 p
= isl_printer_print_str(p
, array
->name
);
56 if (!array
->linearize
&& array
->n_index
> 1) {
57 p
= isl_printer_print_str(p
, ")");
58 for (i
= 1; i
< array
->n_index
; i
++) {
59 p
= isl_printer_print_str(p
, "[");
60 p
= isl_printer_print_pw_aff(p
, array
->bound
[i
]);
61 p
= isl_printer_print_str(p
, "]");
64 p
= isl_printer_print_str(p
, ";");
65 p
= isl_printer_end_line(p
);
70 static __isl_give isl_printer
*declare_device_arrays(__isl_take isl_printer
*p
,
71 struct gpu_prog
*prog
)
75 for (i
= 0; i
< prog
->n_array
; ++i
) {
76 if (!gpu_array_requires_device_allocation(&prog
->array
[i
]))
79 p
= declare_device_array(p
, &prog
->array
[i
]);
81 p
= isl_printer_start_line(p
);
82 p
= isl_printer_end_line(p
);
86 static __isl_give isl_printer
*allocate_device_arrays(
87 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
91 for (i
= 0; i
< prog
->n_array
; ++i
) {
92 if (!gpu_array_requires_device_allocation(&prog
->array
[i
]))
94 p
= isl_printer_start_line(p
);
95 p
= isl_printer_print_str(p
,
96 "cudaCheckReturn(cudaMalloc((void **) &dev_");
97 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
98 p
= isl_printer_print_str(p
, ", ");
99 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
100 p
= isl_printer_print_str(p
, "));");
101 p
= isl_printer_end_line(p
);
103 p
= isl_printer_start_line(p
);
104 p
= isl_printer_end_line(p
);
108 static __isl_give isl_printer
*free_device_arrays(__isl_take isl_printer
*p
,
109 struct gpu_prog
*prog
)
113 for (i
= 0; i
< prog
->n_array
; ++i
) {
114 if (!gpu_array_requires_device_allocation(&prog
->array
[i
]))
116 p
= isl_printer_start_line(p
);
117 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaFree(dev_");
118 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
119 p
= isl_printer_print_str(p
, "));");
120 p
= isl_printer_end_line(p
);
126 /* Print code to "p" for copying "array" from the host to the device
127 * in its entirety. The bounds on the extent of "array" have
128 * been precomputed in extract_array_info and are used in
129 * gpu_array_info_print_size.
131 static __isl_give isl_printer
*copy_array_to_device(__isl_take isl_printer
*p
,
132 struct gpu_array_info
*array
)
134 p
= isl_printer_start_line(p
);
135 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(dev_");
136 p
= isl_printer_print_str(p
, array
->name
);
137 p
= isl_printer_print_str(p
, ", ");
139 if (gpu_array_is_scalar(array
))
140 p
= isl_printer_print_str(p
, "&");
141 p
= isl_printer_print_str(p
, array
->name
);
142 p
= isl_printer_print_str(p
, ", ");
144 p
= gpu_array_info_print_size(p
, array
);
145 p
= isl_printer_print_str(p
, ", cudaMemcpyHostToDevice));");
146 p
= isl_printer_end_line(p
);
151 /* Print code to "p" for copying "array" back from the device to the host
152 * in its entirety. The bounds on the extent of "array" have
153 * been precomputed in extract_array_info and are used in
154 * gpu_array_info_print_size.
156 static __isl_give isl_printer
*copy_array_from_device(
157 __isl_take isl_printer
*p
, struct gpu_array_info
*array
)
159 p
= isl_printer_start_line(p
);
160 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(");
161 if (gpu_array_is_scalar(array
))
162 p
= isl_printer_print_str(p
, "&");
163 p
= isl_printer_print_str(p
, array
->name
);
164 p
= isl_printer_print_str(p
, ", dev_");
165 p
= isl_printer_print_str(p
, array
->name
);
166 p
= isl_printer_print_str(p
, ", ");
167 p
= gpu_array_info_print_size(p
, array
);
168 p
= isl_printer_print_str(p
, ", cudaMemcpyDeviceToHost));");
169 p
= isl_printer_end_line(p
);
174 static void print_reverse_list(FILE *out
, int len
, int *list
)
178 if (!out
|| len
== 0)
182 for (i
= 0; i
< len
; ++i
) {
185 fprintf(out
, "%d", list
[len
- 1 - i
]);
190 /* Print the effective grid size as a list of the sizes in each
191 * dimension, from innermost to outermost.
193 static __isl_give isl_printer
*print_grid_size(__isl_take isl_printer
*p
,
194 struct ppcg_kernel
*kernel
)
199 dim
= isl_multi_pw_aff_dim(kernel
->grid_size
, isl_dim_set
);
203 p
= isl_printer_print_str(p
, "(");
204 for (i
= dim
- 1; i
>= 0; --i
) {
207 bound
= isl_multi_pw_aff_get_pw_aff(kernel
->grid_size
, i
);
208 p
= isl_printer_print_pw_aff(p
, bound
);
209 isl_pw_aff_free(bound
);
212 p
= isl_printer_print_str(p
, ", ");
215 p
= isl_printer_print_str(p
, ")");
220 /* Print the grid definition.
222 static __isl_give isl_printer
*print_grid(__isl_take isl_printer
*p
,
223 struct ppcg_kernel
*kernel
)
225 p
= isl_printer_start_line(p
);
226 p
= isl_printer_print_str(p
, "dim3 k");
227 p
= isl_printer_print_int(p
, kernel
->id
);
228 p
= isl_printer_print_str(p
, "_dimGrid");
229 p
= print_grid_size(p
, kernel
);
230 p
= isl_printer_print_str(p
, ";");
231 p
= isl_printer_end_line(p
);
236 /* Print the arguments to a kernel declaration or call. If "types" is set,
237 * then print a declaration (including the types of the arguments).
239 * The arguments are printed in the following order
240 * - the arrays accessed by the kernel
242 * - the host loop iterators
244 static __isl_give isl_printer
*print_kernel_arguments(__isl_take isl_printer
*p
,
245 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
, int types
)
253 for (i
= 0; i
< prog
->n_array
; ++i
) {
256 required
= ppcg_kernel_requires_array_argument(kernel
, i
);
258 return isl_printer_free(p
);
263 p
= isl_printer_print_str(p
, ", ");
266 p
= gpu_array_info_print_declaration_argument(p
,
267 &prog
->array
[i
], NULL
);
269 p
= gpu_array_info_print_call_argument(p
,
275 space
= isl_union_set_get_space(kernel
->arrays
);
276 nparam
= isl_space_dim(space
, isl_dim_param
);
277 for (i
= 0; i
< nparam
; ++i
) {
280 name
= isl_space_get_dim_name(space
, isl_dim_param
, i
);
283 p
= isl_printer_print_str(p
, ", ");
285 p
= isl_printer_print_str(p
, "int ");
286 p
= isl_printer_print_str(p
, name
);
290 isl_space_free(space
);
292 n
= isl_space_dim(kernel
->space
, isl_dim_set
);
293 type
= isl_options_get_ast_iterator_type(prog
->ctx
);
294 for (i
= 0; i
< n
; ++i
) {
298 p
= isl_printer_print_str(p
, ", ");
299 name
= isl_space_get_dim_name(kernel
->space
, isl_dim_set
, i
);
301 p
= isl_printer_print_str(p
, type
);
302 p
= isl_printer_print_str(p
, " ");
304 p
= isl_printer_print_str(p
, name
);
312 /* Print the header of the given kernel.
314 static __isl_give isl_printer
*print_kernel_header(__isl_take isl_printer
*p
,
315 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
)
317 p
= isl_printer_start_line(p
);
318 p
= isl_printer_print_str(p
, "__global__ void kernel");
319 p
= isl_printer_print_int(p
, kernel
->id
);
320 p
= isl_printer_print_str(p
, "(");
321 p
= print_kernel_arguments(p
, prog
, kernel
, 1);
322 p
= isl_printer_print_str(p
, ")");
327 /* Print the header of the given kernel to both gen->cuda.kernel_h
328 * and gen->cuda.kernel_c.
330 static void print_kernel_headers(struct gpu_prog
*prog
,
331 struct ppcg_kernel
*kernel
, struct cuda_info
*cuda
)
335 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_h
);
336 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
337 p
= print_kernel_header(p
, prog
, kernel
);
338 p
= isl_printer_print_str(p
, ";");
339 p
= isl_printer_end_line(p
);
342 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_c
);
343 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
344 p
= print_kernel_header(p
, prog
, kernel
);
345 p
= isl_printer_end_line(p
);
349 static void print_indent(FILE *dst
, int indent
)
351 fprintf(dst
, "%*s", indent
, "");
354 /* Print a list of iterators of type "type" with names "ids" to "out".
355 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
356 * In particular, the last iterator is assigned the x identifier
357 * (the first in the list of cuda identifiers).
359 static void print_iterators(FILE *out
, const char *type
,
360 __isl_keep isl_id_list
*ids
, const char *cuda_dims
[])
364 n
= isl_id_list_n_id(ids
);
367 print_indent(out
, 4);
368 fprintf(out
, "%s ", type
);
369 for (i
= 0; i
< n
; ++i
) {
374 id
= isl_id_list_get_id(ids
, i
);
375 fprintf(out
, "%s = %s", isl_id_get_name(id
),
376 cuda_dims
[n
- 1 - i
]);
382 static void print_kernel_iterators(FILE *out
, struct ppcg_kernel
*kernel
)
384 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
386 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
387 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
390 type
= isl_options_get_ast_iterator_type(ctx
);
392 print_iterators(out
, type
, kernel
->block_ids
, block_dims
);
393 print_iterators(out
, type
, kernel
->thread_ids
, thread_dims
);
396 static __isl_give isl_printer
*print_kernel_var(__isl_take isl_printer
*p
,
397 struct ppcg_kernel_var
*var
)
401 p
= isl_printer_start_line(p
);
402 if (var
->type
== ppcg_access_shared
)
403 p
= isl_printer_print_str(p
, "__shared__ ");
404 p
= isl_printer_print_str(p
, var
->array
->type
);
405 p
= isl_printer_print_str(p
, " ");
406 p
= isl_printer_print_str(p
, var
->name
);
407 for (j
= 0; j
< var
->array
->n_index
; ++j
) {
410 p
= isl_printer_print_str(p
, "[");
411 v
= isl_vec_get_element_val(var
->size
, j
);
412 p
= isl_printer_print_val(p
, v
);
414 p
= isl_printer_print_str(p
, "]");
416 p
= isl_printer_print_str(p
, ";");
417 p
= isl_printer_end_line(p
);
422 static __isl_give isl_printer
*print_kernel_vars(__isl_take isl_printer
*p
,
423 struct ppcg_kernel
*kernel
)
427 for (i
= 0; i
< kernel
->n_var
; ++i
)
428 p
= print_kernel_var(p
, &kernel
->var
[i
]);
433 /* Print a sync statement.
435 static __isl_give isl_printer
*print_sync(__isl_take isl_printer
*p
,
436 struct ppcg_kernel_stmt
*stmt
)
438 p
= isl_printer_start_line(p
);
439 p
= isl_printer_print_str(p
, "__syncthreads();");
440 p
= isl_printer_end_line(p
);
445 /* This function is called for each user statement in the AST,
446 * i.e., for each kernel body statement, copy statement or sync statement.
448 static __isl_give isl_printer
*print_kernel_stmt(__isl_take isl_printer
*p
,
449 __isl_take isl_ast_print_options
*print_options
,
450 __isl_keep isl_ast_node
*node
, void *user
)
453 struct ppcg_kernel_stmt
*stmt
;
455 id
= isl_ast_node_get_annotation(node
);
456 stmt
= isl_id_get_user(id
);
459 isl_ast_print_options_free(print_options
);
461 switch (stmt
->type
) {
462 case ppcg_kernel_copy
:
463 return ppcg_kernel_print_copy(p
, stmt
);
464 case ppcg_kernel_sync
:
465 return print_sync(p
, stmt
);
466 case ppcg_kernel_domain
:
467 return ppcg_kernel_print_domain(p
, stmt
);
473 static void print_kernel(struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
,
474 struct cuda_info
*cuda
)
476 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
477 isl_ast_print_options
*print_options
;
480 print_kernel_headers(prog
, kernel
, cuda
);
481 fprintf(cuda
->kernel_c
, "{\n");
482 print_kernel_iterators(cuda
->kernel_c
, kernel
);
484 p
= isl_printer_to_file(ctx
, cuda
->kernel_c
);
485 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
486 p
= isl_printer_indent(p
, 4);
488 p
= print_kernel_vars(p
, kernel
);
489 p
= isl_printer_end_line(p
);
490 p
= ppcg_set_macro_names(p
);
491 p
= isl_ast_op_type_print_macro(isl_ast_op_fdiv_q
, p
);
492 p
= ppcg_print_macros(p
, kernel
->tree
);
494 print_options
= isl_ast_print_options_alloc(ctx
);
495 print_options
= isl_ast_print_options_set_print_user(print_options
,
496 &print_kernel_stmt
, NULL
);
497 p
= isl_ast_node_print(kernel
->tree
, p
, print_options
);
500 fprintf(cuda
->kernel_c
, "}\n");
503 /* Print code for initializing the device for execution of the transformed
504 * code. This includes declaring locally defined variables as well as
505 * declaring and allocating the required copies of arrays on the device.
507 static __isl_give isl_printer
*init_device(__isl_take isl_printer
*p
,
508 struct gpu_prog
*prog
)
510 p
= print_cuda_macros(p
);
512 p
= gpu_print_local_declarations(p
, prog
);
513 p
= declare_device_arrays(p
, prog
);
514 p
= allocate_device_arrays(p
, prog
);
519 /* Print code for clearing the device after execution of the transformed code.
520 * In particular, free the memory that was allocated on the device.
522 static __isl_give isl_printer
*clear_device(__isl_take isl_printer
*p
,
523 struct gpu_prog
*prog
)
525 p
= free_device_arrays(p
, prog
);
530 /* Print a statement for copying an array to or from the device,
531 * or for initializing or clearing the device.
532 * The statement identifier of a copying node is called
533 * "to_device_<array name>" or "from_device_<array name>" and
534 * its user pointer points to the gpu_array_info of the array
535 * that needs to be copied.
536 * The node for initializing the device is called "init_device".
537 * The node for clearing the device is called "clear_device".
539 * Extract the array (if any) from the identifier and call
540 * init_device, clear_device, copy_array_to_device or copy_array_from_device.
542 static __isl_give isl_printer
*print_device_node(__isl_take isl_printer
*p
,
543 __isl_keep isl_ast_node
*node
, struct gpu_prog
*prog
)
545 isl_ast_expr
*expr
, *arg
;
548 struct gpu_array_info
*array
;
550 expr
= isl_ast_node_user_get_expr(node
);
551 arg
= isl_ast_expr_get_op_arg(expr
, 0);
552 id
= isl_ast_expr_get_id(arg
);
553 name
= isl_id_get_name(id
);
554 array
= isl_id_get_user(id
);
556 isl_ast_expr_free(arg
);
557 isl_ast_expr_free(expr
);
560 return isl_printer_free(p
);
561 if (!strcmp(name
, "init_device"))
562 return init_device(p
, prog
);
563 if (!strcmp(name
, "clear_device"))
564 return clear_device(p
, prog
);
566 return isl_printer_free(p
);
568 if (!prefixcmp(name
, "to_device"))
569 return copy_array_to_device(p
, array
);
571 return copy_array_from_device(p
, array
);
574 struct print_host_user_data
{
575 struct cuda_info
*cuda
;
576 struct gpu_prog
*prog
;
579 /* Print the user statement of the host code to "p".
581 * The host code may contain original user statements, kernel launches,
582 * statements that copy data to/from the device and statements
583 * the initialize or clear the device.
584 * The original user statements and the kernel launches have
585 * an associated annotation, while the other statements do not.
586 * The latter are handled by print_device_node.
587 * The annotation on the user statements is called "user".
589 * In case of a kernel launch, print a block of statements that
590 * defines the grid and the block and then launches the kernel.
592 static __isl_give isl_printer
*print_host_user(__isl_take isl_printer
*p
,
593 __isl_take isl_ast_print_options
*print_options
,
594 __isl_keep isl_ast_node
*node
, void *user
)
598 struct ppcg_kernel
*kernel
;
599 struct ppcg_kernel_stmt
*stmt
;
600 struct print_host_user_data
*data
;
602 isl_ast_print_options_free(print_options
);
604 data
= (struct print_host_user_data
*) user
;
606 id
= isl_ast_node_get_annotation(node
);
608 return print_device_node(p
, node
, data
->prog
);
610 is_user
= !strcmp(isl_id_get_name(id
), "user");
611 kernel
= is_user
? NULL
: isl_id_get_user(id
);
612 stmt
= is_user
? isl_id_get_user(id
) : NULL
;
616 return ppcg_kernel_print_domain(p
, stmt
);
618 p
= isl_printer_start_line(p
);
619 p
= isl_printer_print_str(p
, "{");
620 p
= isl_printer_end_line(p
);
621 p
= isl_printer_indent(p
, 2);
623 p
= isl_printer_start_line(p
);
624 p
= isl_printer_print_str(p
, "dim3 k");
625 p
= isl_printer_print_int(p
, kernel
->id
);
626 p
= isl_printer_print_str(p
, "_dimBlock");
627 print_reverse_list(isl_printer_get_file(p
),
628 kernel
->n_block
, kernel
->block_dim
);
629 p
= isl_printer_print_str(p
, ";");
630 p
= isl_printer_end_line(p
);
632 p
= print_grid(p
, kernel
);
634 p
= isl_printer_start_line(p
);
635 p
= isl_printer_print_str(p
, "kernel");
636 p
= isl_printer_print_int(p
, kernel
->id
);
637 p
= isl_printer_print_str(p
, " <<<k");
638 p
= isl_printer_print_int(p
, kernel
->id
);
639 p
= isl_printer_print_str(p
, "_dimGrid, k");
640 p
= isl_printer_print_int(p
, kernel
->id
);
641 p
= isl_printer_print_str(p
, "_dimBlock>>> (");
642 p
= print_kernel_arguments(p
, data
->prog
, kernel
, 0);
643 p
= isl_printer_print_str(p
, ");");
644 p
= isl_printer_end_line(p
);
646 p
= isl_printer_start_line(p
);
647 p
= isl_printer_print_str(p
, "cudaCheckKernel();");
648 p
= isl_printer_end_line(p
);
650 p
= isl_printer_indent(p
, -2);
651 p
= isl_printer_start_line(p
);
652 p
= isl_printer_print_str(p
, "}");
653 p
= isl_printer_end_line(p
);
655 p
= isl_printer_start_line(p
);
656 p
= isl_printer_end_line(p
);
658 print_kernel(data
->prog
, kernel
, data
->cuda
);
663 static __isl_give isl_printer
*print_host_code(__isl_take isl_printer
*p
,
664 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
665 struct cuda_info
*cuda
)
667 isl_ast_print_options
*print_options
;
668 isl_ctx
*ctx
= isl_ast_node_get_ctx(tree
);
669 struct print_host_user_data data
= { cuda
, prog
};
671 print_options
= isl_ast_print_options_alloc(ctx
);
672 print_options
= isl_ast_print_options_set_print_user(print_options
,
673 &print_host_user
, &data
);
675 p
= ppcg_print_macros(p
, tree
);
676 p
= isl_ast_node_print(tree
, p
, print_options
);
681 /* Given a gpu_prog "prog" and the corresponding transformed AST
682 * "tree", print the entire CUDA code to "p".
683 * "types" collects the types for which a definition has already
686 static __isl_give isl_printer
*print_cuda(__isl_take isl_printer
*p
,
687 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
688 struct gpu_types
*types
, void *user
)
690 struct cuda_info
*cuda
= user
;
693 kernel
= isl_printer_to_file(isl_printer_get_ctx(p
), cuda
->kernel_c
);
694 kernel
= isl_printer_set_output_format(kernel
, ISL_FORMAT_C
);
695 kernel
= gpu_print_types(kernel
, types
, prog
);
696 isl_printer_free(kernel
);
699 return isl_printer_free(p
);
701 p
= print_host_code(p
, prog
, tree
, cuda
);
706 /* Transform the code in the file called "input" by replacing
707 * all scops by corresponding CUDA code.
708 * The names of the output files are derived from "input".
710 * We let generate_gpu do all the hard work and then let it call
711 * us back for printing the AST in print_cuda.
713 * To prepare for this printing, we first open the output files
714 * and we close them after generate_gpu has finished.
716 int generate_cuda(isl_ctx
*ctx
, struct ppcg_options
*options
,
719 struct cuda_info cuda
;
722 cuda_open_files(&cuda
, input
);
724 r
= generate_gpu(ctx
, input
, cuda
.host_c
, options
, &print_cuda
, &cuda
);
726 cuda_close_files(&cuda
);