2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
13 #include "cuda_common.h"
16 #include "gpu_print.h"
19 static __isl_give isl_printer
*print_cuda_macros(__isl_take isl_printer
*p
)
22 "#define cudaCheckReturn(ret) \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 "#define cudaCheckKernel() \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
37 p
= isl_printer_print_str(p
, macros
);
41 /* Print a declaration for the device array corresponding to "array" on "p".
43 static __isl_give isl_printer
*declare_device_array(__isl_take isl_printer
*p
,
44 struct gpu_array_info
*array
)
48 p
= isl_printer_start_line(p
);
49 p
= isl_printer_print_str(p
, array
->type
);
50 p
= isl_printer_print_str(p
, " ");
51 if (!array
->linearize
&& array
->n_index
> 1)
52 p
= isl_printer_print_str(p
, "(");
53 p
= isl_printer_print_str(p
, "*dev_");
54 p
= isl_printer_print_str(p
, array
->name
);
55 if (!array
->linearize
&& array
->n_index
> 1) {
56 p
= isl_printer_print_str(p
, ")");
57 for (i
= 1; i
< array
->n_index
; i
++) {
58 p
= isl_printer_print_str(p
, "[");
59 p
= isl_printer_print_pw_aff(p
, array
->bound
[i
]);
60 p
= isl_printer_print_str(p
, "]");
63 p
= isl_printer_print_str(p
, ";");
64 p
= isl_printer_end_line(p
);
69 static __isl_give isl_printer
*declare_device_arrays(__isl_take isl_printer
*p
,
70 struct gpu_prog
*prog
)
74 for (i
= 0; i
< prog
->n_array
; ++i
) {
75 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
77 if (!prog
->array
[i
].accessed
)
80 p
= declare_device_array(p
, &prog
->array
[i
]);
82 p
= isl_printer_start_line(p
);
83 p
= isl_printer_end_line(p
);
87 static __isl_give isl_printer
*allocate_device_arrays(
88 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
92 for (i
= 0; i
< prog
->n_array
; ++i
) {
93 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
95 if (!prog
->array
[i
].accessed
)
97 p
= isl_printer_start_line(p
);
98 p
= isl_printer_print_str(p
,
99 "cudaCheckReturn(cudaMalloc((void **) &dev_");
100 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
101 p
= isl_printer_print_str(p
, ", ");
102 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
103 p
= isl_printer_print_str(p
, "));");
104 p
= isl_printer_end_line(p
);
106 p
= isl_printer_start_line(p
);
107 p
= isl_printer_end_line(p
);
111 /* Print code to "p" for copying "array" from the host to the device
112 * in its entirety. The bounds on the extent of "array" have
113 * been precomputed in extract_array_info and are used in
114 * gpu_array_info_print_size.
116 static __isl_give isl_printer
*copy_array_to_device(__isl_take isl_printer
*p
,
117 struct gpu_array_info
*array
)
119 p
= isl_printer_start_line(p
);
120 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(dev_");
121 p
= isl_printer_print_str(p
, array
->name
);
122 p
= isl_printer_print_str(p
, ", ");
124 if (gpu_array_is_scalar(array
))
125 p
= isl_printer_print_str(p
, "&");
126 p
= isl_printer_print_str(p
, array
->name
);
127 p
= isl_printer_print_str(p
, ", ");
129 p
= gpu_array_info_print_size(p
, array
);
130 p
= isl_printer_print_str(p
, ", cudaMemcpyHostToDevice));");
131 p
= isl_printer_end_line(p
);
136 /* Print code to "p" for copying "array" back from the device to the host
137 * in its entirety. The bounds on the extent of "array" have
138 * been precomputed in extract_array_info and are used in
139 * gpu_array_info_print_size.
141 static __isl_give isl_printer
*copy_array_from_device(
142 __isl_take isl_printer
*p
, struct gpu_array_info
*array
)
144 p
= isl_printer_start_line(p
);
145 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(");
146 if (gpu_array_is_scalar(array
))
147 p
= isl_printer_print_str(p
, "&");
148 p
= isl_printer_print_str(p
, array
->name
);
149 p
= isl_printer_print_str(p
, ", dev_");
150 p
= isl_printer_print_str(p
, array
->name
);
151 p
= isl_printer_print_str(p
, ", ");
152 p
= gpu_array_info_print_size(p
, array
);
153 p
= isl_printer_print_str(p
, ", cudaMemcpyDeviceToHost));");
154 p
= isl_printer_end_line(p
);
159 static __isl_give isl_printer
*copy_arrays_to_device(__isl_take isl_printer
*p
,
160 struct gpu_prog
*prog
)
164 for (i
= 0; i
< prog
->n_array
; ++i
) {
169 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
172 space
= isl_space_copy(prog
->array
[i
].space
);
173 read_i
= isl_union_set_extract_set(prog
->copy_in
, space
);
174 empty
= isl_set_plain_is_empty(read_i
);
175 isl_set_free(read_i
);
179 p
= copy_array_to_device(p
, &prog
->array
[i
]);
181 p
= isl_printer_start_line(p
);
182 p
= isl_printer_end_line(p
);
186 static void print_reverse_list(FILE *out
, int len
, int *list
)
194 for (i
= 0; i
< len
; ++i
) {
197 fprintf(out
, "%d", list
[len
- 1 - i
]);
202 /* Print the effective grid size as a list of the sizes in each
203 * dimension, from innermost to outermost.
205 static __isl_give isl_printer
*print_grid_size(__isl_take isl_printer
*p
,
206 struct ppcg_kernel
*kernel
)
211 dim
= isl_multi_pw_aff_dim(kernel
->grid_size
, isl_dim_set
);
215 p
= isl_printer_print_str(p
, "(");
216 for (i
= dim
- 1; i
>= 0; --i
) {
219 bound
= isl_multi_pw_aff_get_pw_aff(kernel
->grid_size
, i
);
220 p
= isl_printer_print_pw_aff(p
, bound
);
221 isl_pw_aff_free(bound
);
224 p
= isl_printer_print_str(p
, ", ");
227 p
= isl_printer_print_str(p
, ")");
232 /* Print the grid definition.
234 static __isl_give isl_printer
*print_grid(__isl_take isl_printer
*p
,
235 struct ppcg_kernel
*kernel
)
237 p
= isl_printer_start_line(p
);
238 p
= isl_printer_print_str(p
, "dim3 k");
239 p
= isl_printer_print_int(p
, kernel
->id
);
240 p
= isl_printer_print_str(p
, "_dimGrid");
241 p
= print_grid_size(p
, kernel
);
242 p
= isl_printer_print_str(p
, ";");
243 p
= isl_printer_end_line(p
);
248 /* Print the arguments to a kernel declaration or call. If "types" is set,
249 * then print a declaration (including the types of the arguments).
251 * The arguments are printed in the following order
252 * - the arrays accessed by the kernel
254 * - the host loop iterators
256 static __isl_give isl_printer
*print_kernel_arguments(__isl_take isl_printer
*p
,
257 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
, int types
)
265 for (i
= 0; i
< prog
->n_array
; ++i
) {
269 space
= isl_space_copy(prog
->array
[i
].space
);
270 arr
= isl_union_set_extract_set(kernel
->arrays
, space
);
271 empty
= isl_set_plain_is_empty(arr
);
277 p
= isl_printer_print_str(p
, ", ");
280 p
= gpu_array_info_print_declaration_argument(p
,
281 &prog
->array
[i
], NULL
);
283 p
= gpu_array_info_print_call_argument(p
,
289 space
= isl_union_set_get_space(kernel
->arrays
);
290 nparam
= isl_space_dim(space
, isl_dim_param
);
291 for (i
= 0; i
< nparam
; ++i
) {
294 name
= isl_space_get_dim_name(space
, isl_dim_param
, i
);
297 p
= isl_printer_print_str(p
, ", ");
299 p
= isl_printer_print_str(p
, "int ");
300 p
= isl_printer_print_str(p
, name
);
304 isl_space_free(space
);
306 n
= isl_space_dim(kernel
->space
, isl_dim_set
);
307 type
= isl_options_get_ast_iterator_type(prog
->ctx
);
308 for (i
= 0; i
< n
; ++i
) {
312 p
= isl_printer_print_str(p
, ", ");
313 name
= isl_space_get_dim_name(kernel
->space
, isl_dim_set
, i
);
315 p
= isl_printer_print_str(p
, type
);
316 p
= isl_printer_print_str(p
, " ");
318 p
= isl_printer_print_str(p
, name
);
326 /* Print the header of the given kernel.
328 static __isl_give isl_printer
*print_kernel_header(__isl_take isl_printer
*p
,
329 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
)
331 p
= isl_printer_start_line(p
);
332 p
= isl_printer_print_str(p
, "__global__ void kernel");
333 p
= isl_printer_print_int(p
, kernel
->id
);
334 p
= isl_printer_print_str(p
, "(");
335 p
= print_kernel_arguments(p
, prog
, kernel
, 1);
336 p
= isl_printer_print_str(p
, ")");
341 /* Print the header of the given kernel to both gen->cuda.kernel_h
342 * and gen->cuda.kernel_c.
344 static void print_kernel_headers(struct gpu_prog
*prog
,
345 struct ppcg_kernel
*kernel
, struct cuda_info
*cuda
)
349 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_h
);
350 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
351 p
= print_kernel_header(p
, prog
, kernel
);
352 p
= isl_printer_print_str(p
, ";");
353 p
= isl_printer_end_line(p
);
356 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_c
);
357 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
358 p
= print_kernel_header(p
, prog
, kernel
);
359 p
= isl_printer_end_line(p
);
363 static void print_indent(FILE *dst
, int indent
)
365 fprintf(dst
, "%*s", indent
, "");
368 /* Print a list of iterators of type "type" with names "ids" to "out".
369 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
370 * In particular, the last iterator is assigned the x identifier
371 * (the first in the list of cuda identifiers).
373 static void print_iterators(FILE *out
, const char *type
,
374 __isl_keep isl_id_list
*ids
, const char *cuda_dims
[])
378 n
= isl_id_list_n_id(ids
);
381 print_indent(out
, 4);
382 fprintf(out
, "%s ", type
);
383 for (i
= 0; i
< n
; ++i
) {
388 id
= isl_id_list_get_id(ids
, i
);
389 fprintf(out
, "%s = %s", isl_id_get_name(id
),
390 cuda_dims
[n
- 1 - i
]);
396 static void print_kernel_iterators(FILE *out
, struct ppcg_kernel
*kernel
)
398 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
400 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
401 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
404 type
= isl_options_get_ast_iterator_type(ctx
);
406 print_iterators(out
, type
, kernel
->block_ids
, block_dims
);
407 print_iterators(out
, type
, kernel
->thread_ids
, thread_dims
);
410 static __isl_give isl_printer
*print_kernel_var(__isl_take isl_printer
*p
,
411 struct ppcg_kernel_var
*var
)
415 p
= isl_printer_start_line(p
);
416 if (var
->type
== ppcg_access_shared
)
417 p
= isl_printer_print_str(p
, "__shared__ ");
418 p
= isl_printer_print_str(p
, var
->array
->type
);
419 p
= isl_printer_print_str(p
, " ");
420 p
= isl_printer_print_str(p
, var
->name
);
421 for (j
= 0; j
< var
->array
->n_index
; ++j
) {
424 p
= isl_printer_print_str(p
, "[");
425 v
= isl_vec_get_element_val(var
->size
, j
);
426 p
= isl_printer_print_val(p
, v
);
428 p
= isl_printer_print_str(p
, "]");
430 p
= isl_printer_print_str(p
, ";");
431 p
= isl_printer_end_line(p
);
436 static __isl_give isl_printer
*print_kernel_vars(__isl_take isl_printer
*p
,
437 struct ppcg_kernel
*kernel
)
441 for (i
= 0; i
< kernel
->n_var
; ++i
)
442 p
= print_kernel_var(p
, &kernel
->var
[i
]);
447 /* Print a sync statement.
449 static __isl_give isl_printer
*print_sync(__isl_take isl_printer
*p
,
450 struct ppcg_kernel_stmt
*stmt
)
452 p
= isl_printer_start_line(p
);
453 p
= isl_printer_print_str(p
, "__syncthreads();");
454 p
= isl_printer_end_line(p
);
459 /* This function is called for each user statement in the AST,
460 * i.e., for each kernel body statement, copy statement or sync statement.
462 static __isl_give isl_printer
*print_kernel_stmt(__isl_take isl_printer
*p
,
463 __isl_take isl_ast_print_options
*print_options
,
464 __isl_keep isl_ast_node
*node
, void *user
)
467 struct ppcg_kernel_stmt
*stmt
;
469 id
= isl_ast_node_get_annotation(node
);
470 stmt
= isl_id_get_user(id
);
473 isl_ast_print_options_free(print_options
);
475 switch (stmt
->type
) {
476 case ppcg_kernel_copy
:
477 return ppcg_kernel_print_copy(p
, stmt
);
478 case ppcg_kernel_sync
:
479 return print_sync(p
, stmt
);
480 case ppcg_kernel_domain
:
481 return ppcg_kernel_print_domain(p
, stmt
);
487 static void print_kernel(struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
,
488 struct cuda_info
*cuda
)
490 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
491 isl_ast_print_options
*print_options
;
494 print_kernel_headers(prog
, kernel
, cuda
);
495 fprintf(cuda
->kernel_c
, "{\n");
496 print_kernel_iterators(cuda
->kernel_c
, kernel
);
498 p
= isl_printer_to_file(ctx
, cuda
->kernel_c
);
499 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
500 p
= isl_printer_indent(p
, 4);
502 p
= print_kernel_vars(p
, kernel
);
503 p
= isl_printer_end_line(p
);
504 p
= gpu_print_macros(p
, kernel
->tree
);
506 print_options
= isl_ast_print_options_alloc(ctx
);
507 print_options
= isl_ast_print_options_set_print_user(print_options
,
508 &print_kernel_stmt
, NULL
);
509 p
= isl_ast_node_print(kernel
->tree
, p
, print_options
);
512 fprintf(cuda
->kernel_c
, "}\n");
515 struct print_host_user_data
{
516 struct cuda_info
*cuda
;
517 struct gpu_prog
*prog
;
520 /* Print the user statement of the host code to "p".
522 * In particular, print a block of statements that defines the grid
523 * and the block and then launches the kernel.
525 static __isl_give isl_printer
*print_host_user(__isl_take isl_printer
*p
,
526 __isl_take isl_ast_print_options
*print_options
,
527 __isl_keep isl_ast_node
*node
, void *user
)
530 struct ppcg_kernel
*kernel
;
531 struct print_host_user_data
*data
;
533 id
= isl_ast_node_get_annotation(node
);
534 kernel
= isl_id_get_user(id
);
537 data
= (struct print_host_user_data
*) user
;
539 p
= isl_printer_start_line(p
);
540 p
= isl_printer_print_str(p
, "{");
541 p
= isl_printer_end_line(p
);
542 p
= isl_printer_indent(p
, 2);
544 p
= isl_printer_start_line(p
);
545 p
= isl_printer_print_str(p
, "dim3 k");
546 p
= isl_printer_print_int(p
, kernel
->id
);
547 p
= isl_printer_print_str(p
, "_dimBlock");
548 print_reverse_list(isl_printer_get_file(p
),
549 kernel
->n_block
, kernel
->block_dim
);
550 p
= isl_printer_print_str(p
, ";");
551 p
= isl_printer_end_line(p
);
553 p
= print_grid(p
, kernel
);
555 p
= isl_printer_start_line(p
);
556 p
= isl_printer_print_str(p
, "kernel");
557 p
= isl_printer_print_int(p
, kernel
->id
);
558 p
= isl_printer_print_str(p
, " <<<k");
559 p
= isl_printer_print_int(p
, kernel
->id
);
560 p
= isl_printer_print_str(p
, "_dimGrid, k");
561 p
= isl_printer_print_int(p
, kernel
->id
);
562 p
= isl_printer_print_str(p
, "_dimBlock>>> (");
563 p
= print_kernel_arguments(p
, data
->prog
, kernel
, 0);
564 p
= isl_printer_print_str(p
, ");");
565 p
= isl_printer_end_line(p
);
567 p
= isl_printer_start_line(p
);
568 p
= isl_printer_print_str(p
, "cudaCheckKernel();");
569 p
= isl_printer_end_line(p
);
571 p
= isl_printer_indent(p
, -2);
572 p
= isl_printer_start_line(p
);
573 p
= isl_printer_print_str(p
, "}");
574 p
= isl_printer_end_line(p
);
576 p
= isl_printer_start_line(p
);
577 p
= isl_printer_end_line(p
);
579 print_kernel(data
->prog
, kernel
, data
->cuda
);
581 isl_ast_print_options_free(print_options
);
586 static __isl_give isl_printer
*print_host_code(__isl_take isl_printer
*p
,
587 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
588 struct cuda_info
*cuda
)
590 isl_ast_print_options
*print_options
;
591 isl_ctx
*ctx
= isl_ast_node_get_ctx(tree
);
592 struct print_host_user_data data
= { cuda
, prog
};
594 print_options
= isl_ast_print_options_alloc(ctx
);
595 print_options
= isl_ast_print_options_set_print_user(print_options
,
596 &print_host_user
, &data
);
598 p
= gpu_print_macros(p
, tree
);
599 p
= isl_ast_node_print(tree
, p
, print_options
);
604 /* For each array that needs to be copied out (based on prog->copy_out),
605 * copy the contents back from the GPU to the host.
607 * If any element of a given array appears in prog->copy_out, then its
608 * entire extent is in prog->copy_out. The bounds on this extent have
609 * been precomputed in extract_array_info and are used in
610 * gpu_array_info_print_size.
612 static __isl_give isl_printer
*copy_arrays_from_device(
613 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
616 isl_union_set
*copy_out
;
617 copy_out
= isl_union_set_copy(prog
->copy_out
);
619 for (i
= 0; i
< prog
->n_array
; ++i
) {
624 space
= isl_space_copy(prog
->array
[i
].space
);
625 copy_out_i
= isl_union_set_extract_set(copy_out
, space
);
626 empty
= isl_set_plain_is_empty(copy_out_i
);
627 isl_set_free(copy_out_i
);
631 p
= copy_array_from_device(p
, &prog
->array
[i
]);
634 isl_union_set_free(copy_out
);
635 p
= isl_printer_start_line(p
);
636 p
= isl_printer_end_line(p
);
640 static __isl_give isl_printer
*free_device_arrays(__isl_take isl_printer
*p
,
641 struct gpu_prog
*prog
)
645 for (i
= 0; i
< prog
->n_array
; ++i
) {
646 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
648 if (!prog
->array
[i
].accessed
)
650 p
= isl_printer_start_line(p
);
651 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaFree(dev_");
652 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
653 p
= isl_printer_print_str(p
, "));");
654 p
= isl_printer_end_line(p
);
660 /* Given a gpu_prog "prog" and the corresponding transformed AST
661 * "tree", print the entire CUDA code to "p".
662 * "types" collects the types for which a definition has already
665 static __isl_give isl_printer
*print_cuda(__isl_take isl_printer
*p
,
666 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
667 struct gpu_types
*types
, void *user
)
669 struct cuda_info
*cuda
= user
;
672 kernel
= isl_printer_to_file(isl_printer_get_ctx(p
), cuda
->kernel_c
);
673 kernel
= isl_printer_set_output_format(kernel
, ISL_FORMAT_C
);
674 kernel
= gpu_print_types(kernel
, types
, prog
);
675 isl_printer_free(kernel
);
678 return isl_printer_free(p
);
680 p
= ppcg_start_block(p
);
682 p
= print_cuda_macros(p
);
684 p
= declare_device_arrays(p
, prog
);
685 p
= allocate_device_arrays(p
, prog
);
686 p
= copy_arrays_to_device(p
, prog
);
688 p
= print_host_code(p
, prog
, tree
, cuda
);
690 p
= copy_arrays_from_device(p
, prog
);
691 p
= free_device_arrays(p
, prog
);
693 p
= ppcg_end_block(p
);
698 /* Transform the code in the file called "input" by replacing
699 * all scops by corresponding CUDA code.
700 * The names of the output files are derived from "input".
702 * We let generate_gpu do all the hard work and then let it call
703 * us back for printing the AST in print_cuda.
705 * To prepare for this printing, we first open the output files
706 * and we close them after generate_gpu has finished.
708 int generate_cuda(isl_ctx
*ctx
, struct ppcg_options
*options
,
711 struct cuda_info cuda
;
714 cuda_open_files(&cuda
, input
);
716 r
= generate_gpu(ctx
, input
, cuda
.host_c
, options
, &print_cuda
, &cuda
);
718 cuda_close_files(&cuda
);