2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
13 #include "cuda_common.h"
16 #include "gpu_print.h"
20 static __isl_give isl_printer
*print_cuda_macros(__isl_take isl_printer
*p
)
23 "#define cudaCheckReturn(ret) \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
33 "#define cudaCheckKernel() \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
38 p
= isl_printer_print_str(p
, macros
);
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer
*declare_device_array(__isl_take isl_printer
*p
,
45 struct gpu_array_info
*array
)
49 p
= isl_printer_start_line(p
);
50 p
= isl_printer_print_str(p
, array
->type
);
51 p
= isl_printer_print_str(p
, " ");
52 if (!array
->linearize
&& array
->n_index
> 1)
53 p
= isl_printer_print_str(p
, "(");
54 p
= isl_printer_print_str(p
, "*dev_");
55 p
= isl_printer_print_str(p
, array
->name
);
56 if (!array
->linearize
&& array
->n_index
> 1) {
57 p
= isl_printer_print_str(p
, ")");
58 for (i
= 1; i
< array
->n_index
; i
++) {
59 p
= isl_printer_print_str(p
, "[");
60 p
= isl_printer_print_pw_aff(p
, array
->bound
[i
]);
61 p
= isl_printer_print_str(p
, "]");
64 p
= isl_printer_print_str(p
, ";");
65 p
= isl_printer_end_line(p
);
70 static __isl_give isl_printer
*declare_device_arrays(__isl_take isl_printer
*p
,
71 struct gpu_prog
*prog
)
75 for (i
= 0; i
< prog
->n_array
; ++i
) {
76 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
78 if (!prog
->array
[i
].accessed
)
81 p
= declare_device_array(p
, &prog
->array
[i
]);
83 p
= isl_printer_start_line(p
);
84 p
= isl_printer_end_line(p
);
88 static __isl_give isl_printer
*allocate_device_arrays(
89 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
93 for (i
= 0; i
< prog
->n_array
; ++i
) {
94 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
96 if (!prog
->array
[i
].accessed
)
98 p
= isl_printer_start_line(p
);
99 p
= isl_printer_print_str(p
,
100 "cudaCheckReturn(cudaMalloc((void **) &dev_");
101 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
102 p
= isl_printer_print_str(p
, ", ");
103 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
104 p
= isl_printer_print_str(p
, "));");
105 p
= isl_printer_end_line(p
);
107 p
= isl_printer_start_line(p
);
108 p
= isl_printer_end_line(p
);
112 /* Print code to "p" for copying "array" from the host to the device
113 * in its entirety. The bounds on the extent of "array" have
114 * been precomputed in extract_array_info and are used in
115 * gpu_array_info_print_size.
117 static __isl_give isl_printer
*copy_array_to_device(__isl_take isl_printer
*p
,
118 struct gpu_array_info
*array
)
120 p
= isl_printer_start_line(p
);
121 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(dev_");
122 p
= isl_printer_print_str(p
, array
->name
);
123 p
= isl_printer_print_str(p
, ", ");
125 if (gpu_array_is_scalar(array
))
126 p
= isl_printer_print_str(p
, "&");
127 p
= isl_printer_print_str(p
, array
->name
);
128 p
= isl_printer_print_str(p
, ", ");
130 p
= gpu_array_info_print_size(p
, array
);
131 p
= isl_printer_print_str(p
, ", cudaMemcpyHostToDevice));");
132 p
= isl_printer_end_line(p
);
137 /* Print code to "p" for copying "array" back from the device to the host
138 * in its entirety. The bounds on the extent of "array" have
139 * been precomputed in extract_array_info and are used in
140 * gpu_array_info_print_size.
142 static __isl_give isl_printer
*copy_array_from_device(
143 __isl_take isl_printer
*p
, struct gpu_array_info
*array
)
145 p
= isl_printer_start_line(p
);
146 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(");
147 if (gpu_array_is_scalar(array
))
148 p
= isl_printer_print_str(p
, "&");
149 p
= isl_printer_print_str(p
, array
->name
);
150 p
= isl_printer_print_str(p
, ", dev_");
151 p
= isl_printer_print_str(p
, array
->name
);
152 p
= isl_printer_print_str(p
, ", ");
153 p
= gpu_array_info_print_size(p
, array
);
154 p
= isl_printer_print_str(p
, ", cudaMemcpyDeviceToHost));");
155 p
= isl_printer_end_line(p
);
160 static void print_reverse_list(FILE *out
, int len
, int *list
)
168 for (i
= 0; i
< len
; ++i
) {
171 fprintf(out
, "%d", list
[len
- 1 - i
]);
176 /* Print the effective grid size as a list of the sizes in each
177 * dimension, from innermost to outermost.
179 static __isl_give isl_printer
*print_grid_size(__isl_take isl_printer
*p
,
180 struct ppcg_kernel
*kernel
)
185 dim
= isl_multi_pw_aff_dim(kernel
->grid_size
, isl_dim_set
);
189 p
= isl_printer_print_str(p
, "(");
190 for (i
= dim
- 1; i
>= 0; --i
) {
193 bound
= isl_multi_pw_aff_get_pw_aff(kernel
->grid_size
, i
);
194 p
= isl_printer_print_pw_aff(p
, bound
);
195 isl_pw_aff_free(bound
);
198 p
= isl_printer_print_str(p
, ", ");
201 p
= isl_printer_print_str(p
, ")");
206 /* Print the grid definition.
208 static __isl_give isl_printer
*print_grid(__isl_take isl_printer
*p
,
209 struct ppcg_kernel
*kernel
)
211 p
= isl_printer_start_line(p
);
212 p
= isl_printer_print_str(p
, "dim3 k");
213 p
= isl_printer_print_int(p
, kernel
->id
);
214 p
= isl_printer_print_str(p
, "_dimGrid");
215 p
= print_grid_size(p
, kernel
);
216 p
= isl_printer_print_str(p
, ";");
217 p
= isl_printer_end_line(p
);
222 /* Print the arguments to a kernel declaration or call. If "types" is set,
223 * then print a declaration (including the types of the arguments).
225 * The arguments are printed in the following order
226 * - the arrays accessed by the kernel
228 * - the host loop iterators
230 static __isl_give isl_printer
*print_kernel_arguments(__isl_take isl_printer
*p
,
231 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
, int types
)
239 for (i
= 0; i
< prog
->n_array
; ++i
) {
243 space
= isl_space_copy(prog
->array
[i
].space
);
244 arr
= isl_union_set_extract_set(kernel
->arrays
, space
);
245 empty
= isl_set_plain_is_empty(arr
);
251 p
= isl_printer_print_str(p
, ", ");
254 p
= gpu_array_info_print_declaration_argument(p
,
255 &prog
->array
[i
], NULL
);
257 p
= gpu_array_info_print_call_argument(p
,
263 space
= isl_union_set_get_space(kernel
->arrays
);
264 nparam
= isl_space_dim(space
, isl_dim_param
);
265 for (i
= 0; i
< nparam
; ++i
) {
268 name
= isl_space_get_dim_name(space
, isl_dim_param
, i
);
271 p
= isl_printer_print_str(p
, ", ");
273 p
= isl_printer_print_str(p
, "int ");
274 p
= isl_printer_print_str(p
, name
);
278 isl_space_free(space
);
280 n
= isl_space_dim(kernel
->space
, isl_dim_set
);
281 type
= isl_options_get_ast_iterator_type(prog
->ctx
);
282 for (i
= 0; i
< n
; ++i
) {
286 p
= isl_printer_print_str(p
, ", ");
287 name
= isl_space_get_dim_name(kernel
->space
, isl_dim_set
, i
);
289 p
= isl_printer_print_str(p
, type
);
290 p
= isl_printer_print_str(p
, " ");
292 p
= isl_printer_print_str(p
, name
);
300 /* Print the header of the given kernel.
302 static __isl_give isl_printer
*print_kernel_header(__isl_take isl_printer
*p
,
303 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
)
305 p
= isl_printer_start_line(p
);
306 p
= isl_printer_print_str(p
, "__global__ void kernel");
307 p
= isl_printer_print_int(p
, kernel
->id
);
308 p
= isl_printer_print_str(p
, "(");
309 p
= print_kernel_arguments(p
, prog
, kernel
, 1);
310 p
= isl_printer_print_str(p
, ")");
315 /* Print the header of the given kernel to both gen->cuda.kernel_h
316 * and gen->cuda.kernel_c.
318 static void print_kernel_headers(struct gpu_prog
*prog
,
319 struct ppcg_kernel
*kernel
, struct cuda_info
*cuda
)
323 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_h
);
324 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
325 p
= print_kernel_header(p
, prog
, kernel
);
326 p
= isl_printer_print_str(p
, ";");
327 p
= isl_printer_end_line(p
);
330 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_c
);
331 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
332 p
= print_kernel_header(p
, prog
, kernel
);
333 p
= isl_printer_end_line(p
);
337 static void print_indent(FILE *dst
, int indent
)
339 fprintf(dst
, "%*s", indent
, "");
342 /* Print a list of iterators of type "type" with names "ids" to "out".
343 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
344 * In particular, the last iterator is assigned the x identifier
345 * (the first in the list of cuda identifiers).
347 static void print_iterators(FILE *out
, const char *type
,
348 __isl_keep isl_id_list
*ids
, const char *cuda_dims
[])
352 n
= isl_id_list_n_id(ids
);
355 print_indent(out
, 4);
356 fprintf(out
, "%s ", type
);
357 for (i
= 0; i
< n
; ++i
) {
362 id
= isl_id_list_get_id(ids
, i
);
363 fprintf(out
, "%s = %s", isl_id_get_name(id
),
364 cuda_dims
[n
- 1 - i
]);
370 static void print_kernel_iterators(FILE *out
, struct ppcg_kernel
*kernel
)
372 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
374 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
375 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
378 type
= isl_options_get_ast_iterator_type(ctx
);
380 print_iterators(out
, type
, kernel
->block_ids
, block_dims
);
381 print_iterators(out
, type
, kernel
->thread_ids
, thread_dims
);
384 static __isl_give isl_printer
*print_kernel_var(__isl_take isl_printer
*p
,
385 struct ppcg_kernel_var
*var
)
389 p
= isl_printer_start_line(p
);
390 if (var
->type
== ppcg_access_shared
)
391 p
= isl_printer_print_str(p
, "__shared__ ");
392 p
= isl_printer_print_str(p
, var
->array
->type
);
393 p
= isl_printer_print_str(p
, " ");
394 p
= isl_printer_print_str(p
, var
->name
);
395 for (j
= 0; j
< var
->array
->n_index
; ++j
) {
398 p
= isl_printer_print_str(p
, "[");
399 v
= isl_vec_get_element_val(var
->size
, j
);
400 p
= isl_printer_print_val(p
, v
);
402 p
= isl_printer_print_str(p
, "]");
404 p
= isl_printer_print_str(p
, ";");
405 p
= isl_printer_end_line(p
);
410 static __isl_give isl_printer
*print_kernel_vars(__isl_take isl_printer
*p
,
411 struct ppcg_kernel
*kernel
)
415 for (i
= 0; i
< kernel
->n_var
; ++i
)
416 p
= print_kernel_var(p
, &kernel
->var
[i
]);
421 /* Print a sync statement.
423 static __isl_give isl_printer
*print_sync(__isl_take isl_printer
*p
,
424 struct ppcg_kernel_stmt
*stmt
)
426 p
= isl_printer_start_line(p
);
427 p
= isl_printer_print_str(p
, "__syncthreads();");
428 p
= isl_printer_end_line(p
);
433 /* This function is called for each user statement in the AST,
434 * i.e., for each kernel body statement, copy statement or sync statement.
436 static __isl_give isl_printer
*print_kernel_stmt(__isl_take isl_printer
*p
,
437 __isl_take isl_ast_print_options
*print_options
,
438 __isl_keep isl_ast_node
*node
, void *user
)
441 struct ppcg_kernel_stmt
*stmt
;
443 id
= isl_ast_node_get_annotation(node
);
444 stmt
= isl_id_get_user(id
);
447 isl_ast_print_options_free(print_options
);
449 switch (stmt
->type
) {
450 case ppcg_kernel_copy
:
451 return ppcg_kernel_print_copy(p
, stmt
);
452 case ppcg_kernel_sync
:
453 return print_sync(p
, stmt
);
454 case ppcg_kernel_domain
:
455 return ppcg_kernel_print_domain(p
, stmt
);
461 static void print_kernel(struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
,
462 struct cuda_info
*cuda
)
464 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
465 isl_ast_print_options
*print_options
;
468 print_kernel_headers(prog
, kernel
, cuda
);
469 fprintf(cuda
->kernel_c
, "{\n");
470 print_kernel_iterators(cuda
->kernel_c
, kernel
);
472 p
= isl_printer_to_file(ctx
, cuda
->kernel_c
);
473 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
474 p
= isl_printer_indent(p
, 4);
476 p
= print_kernel_vars(p
, kernel
);
477 p
= isl_printer_end_line(p
);
478 p
= gpu_print_macros(p
, kernel
->tree
);
480 print_options
= isl_ast_print_options_alloc(ctx
);
481 print_options
= isl_ast_print_options_set_print_user(print_options
,
482 &print_kernel_stmt
, NULL
);
483 p
= isl_ast_node_print(kernel
->tree
, p
, print_options
);
486 fprintf(cuda
->kernel_c
, "}\n");
489 /* Print a statement for copying an array to or from the device.
490 * The statement identifier is called "to_device_<array name>" or
491 * "from_device_<array name>" and its user pointer points
492 * to the gpu_array_info of the array that needs to be copied.
494 * Extract the array from the identifier and call
495 * copy_array_to_device or copy_array_from_device.
497 static __isl_give isl_printer
*print_to_from_device(__isl_take isl_printer
*p
,
498 __isl_keep isl_ast_node
*node
, struct gpu_prog
*prog
)
500 isl_ast_expr
*expr
, *arg
;
503 struct gpu_array_info
*array
;
505 expr
= isl_ast_node_user_get_expr(node
);
506 arg
= isl_ast_expr_get_op_arg(expr
, 0);
507 id
= isl_ast_expr_get_id(arg
);
508 name
= isl_id_get_name(id
);
509 array
= isl_id_get_user(id
);
511 isl_ast_expr_free(arg
);
512 isl_ast_expr_free(expr
);
517 return isl_printer_free(p
);
519 if (!prefixcmp(name
, "to_device"))
520 return copy_array_to_device(p
, array
);
522 return copy_array_from_device(p
, array
);
525 struct print_host_user_data
{
526 struct cuda_info
*cuda
;
527 struct gpu_prog
*prog
;
530 /* Print the user statement of the host code to "p".
532 * The host code only contains kernel launches and statements
533 * that copy data to/from the device.
534 * The kernel launches have an associated annotation, while
535 * the data copy statements do not.
536 * The latter are handled by print_to_from_device.
538 * In case of a kernel launch, print a block of statements that
539 * defines the grid and the block and then launches the kernel.
541 static __isl_give isl_printer
*print_host_user(__isl_take isl_printer
*p
,
542 __isl_take isl_ast_print_options
*print_options
,
543 __isl_keep isl_ast_node
*node
, void *user
)
546 struct ppcg_kernel
*kernel
;
547 struct print_host_user_data
*data
;
549 isl_ast_print_options_free(print_options
);
551 data
= (struct print_host_user_data
*) user
;
553 id
= isl_ast_node_get_annotation(node
);
555 return print_to_from_device(p
, node
, data
->prog
);
557 kernel
= isl_id_get_user(id
);
560 p
= isl_printer_start_line(p
);
561 p
= isl_printer_print_str(p
, "{");
562 p
= isl_printer_end_line(p
);
563 p
= isl_printer_indent(p
, 2);
565 p
= isl_printer_start_line(p
);
566 p
= isl_printer_print_str(p
, "dim3 k");
567 p
= isl_printer_print_int(p
, kernel
->id
);
568 p
= isl_printer_print_str(p
, "_dimBlock");
569 print_reverse_list(isl_printer_get_file(p
),
570 kernel
->n_block
, kernel
->block_dim
);
571 p
= isl_printer_print_str(p
, ";");
572 p
= isl_printer_end_line(p
);
574 p
= print_grid(p
, kernel
);
576 p
= isl_printer_start_line(p
);
577 p
= isl_printer_print_str(p
, "kernel");
578 p
= isl_printer_print_int(p
, kernel
->id
);
579 p
= isl_printer_print_str(p
, " <<<k");
580 p
= isl_printer_print_int(p
, kernel
->id
);
581 p
= isl_printer_print_str(p
, "_dimGrid, k");
582 p
= isl_printer_print_int(p
, kernel
->id
);
583 p
= isl_printer_print_str(p
, "_dimBlock>>> (");
584 p
= print_kernel_arguments(p
, data
->prog
, kernel
, 0);
585 p
= isl_printer_print_str(p
, ");");
586 p
= isl_printer_end_line(p
);
588 p
= isl_printer_start_line(p
);
589 p
= isl_printer_print_str(p
, "cudaCheckKernel();");
590 p
= isl_printer_end_line(p
);
592 p
= isl_printer_indent(p
, -2);
593 p
= isl_printer_start_line(p
);
594 p
= isl_printer_print_str(p
, "}");
595 p
= isl_printer_end_line(p
);
597 p
= isl_printer_start_line(p
);
598 p
= isl_printer_end_line(p
);
600 print_kernel(data
->prog
, kernel
, data
->cuda
);
605 static __isl_give isl_printer
*print_host_code(__isl_take isl_printer
*p
,
606 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
607 struct cuda_info
*cuda
)
609 isl_ast_print_options
*print_options
;
610 isl_ctx
*ctx
= isl_ast_node_get_ctx(tree
);
611 struct print_host_user_data data
= { cuda
, prog
};
613 print_options
= isl_ast_print_options_alloc(ctx
);
614 print_options
= isl_ast_print_options_set_print_user(print_options
,
615 &print_host_user
, &data
);
617 p
= gpu_print_macros(p
, tree
);
618 p
= isl_ast_node_print(tree
, p
, print_options
);
623 static __isl_give isl_printer
*free_device_arrays(__isl_take isl_printer
*p
,
624 struct gpu_prog
*prog
)
628 for (i
= 0; i
< prog
->n_array
; ++i
) {
629 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
631 if (!prog
->array
[i
].accessed
)
633 p
= isl_printer_start_line(p
);
634 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaFree(dev_");
635 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
636 p
= isl_printer_print_str(p
, "));");
637 p
= isl_printer_end_line(p
);
643 /* Given a gpu_prog "prog" and the corresponding transformed AST
644 * "tree", print the entire CUDA code to "p".
645 * "types" collects the types for which a definition has already
648 static __isl_give isl_printer
*print_cuda(__isl_take isl_printer
*p
,
649 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
650 struct gpu_types
*types
, void *user
)
652 struct cuda_info
*cuda
= user
;
655 kernel
= isl_printer_to_file(isl_printer_get_ctx(p
), cuda
->kernel_c
);
656 kernel
= isl_printer_set_output_format(kernel
, ISL_FORMAT_C
);
657 kernel
= gpu_print_types(kernel
, types
, prog
);
658 isl_printer_free(kernel
);
661 return isl_printer_free(p
);
663 p
= ppcg_start_block(p
);
665 p
= print_cuda_macros(p
);
667 p
= gpu_print_local_declarations(p
, prog
);
668 p
= declare_device_arrays(p
, prog
);
669 p
= allocate_device_arrays(p
, prog
);
671 p
= print_host_code(p
, prog
, tree
, cuda
);
673 p
= free_device_arrays(p
, prog
);
675 p
= ppcg_end_block(p
);
680 /* Transform the code in the file called "input" by replacing
681 * all scops by corresponding CUDA code.
682 * The names of the output files are derived from "input".
684 * We let generate_gpu do all the hard work and then let it call
685 * us back for printing the AST in print_cuda.
687 * To prepare for this printing, we first open the output files
688 * and we close them after generate_gpu has finished.
690 int generate_cuda(isl_ctx
*ctx
, struct ppcg_options
*options
,
693 struct cuda_info cuda
;
696 cuda_open_files(&cuda
, input
);
698 r
= generate_gpu(ctx
, input
, cuda
.host_c
, options
, &print_cuda
, &cuda
);
700 cuda_close_files(&cuda
);