2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
13 #include "cuda_common.h"
16 #include "gpu_print.h"
19 static __isl_give isl_printer
*print_cuda_macros(__isl_take isl_printer
*p
)
22 "#define cudaCheckReturn(ret) \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 "#define cudaCheckKernel() \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
37 p
= isl_printer_print_str(p
, macros
);
41 static __isl_give isl_printer
*declare_device_arrays(__isl_take isl_printer
*p
,
42 struct gpu_prog
*prog
)
46 for (i
= 0; i
< prog
->n_array
; ++i
) {
47 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
49 p
= isl_printer_start_line(p
);
50 p
= isl_printer_print_str(p
, prog
->array
[i
].type
);
51 p
= isl_printer_print_str(p
, " *dev_");
52 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
53 p
= isl_printer_print_str(p
, ";");
54 p
= isl_printer_end_line(p
);
56 p
= isl_printer_start_line(p
);
57 p
= isl_printer_end_line(p
);
61 static __isl_give isl_printer
*allocate_device_arrays(
62 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
66 for (i
= 0; i
< prog
->n_array
; ++i
) {
67 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
69 p
= isl_printer_start_line(p
);
70 p
= isl_printer_print_str(p
,
71 "cudaCheckReturn(cudaMalloc((void **) &dev_");
72 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
73 p
= isl_printer_print_str(p
, ", ");
74 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
75 p
= isl_printer_print_str(p
, "));");
76 p
= isl_printer_end_line(p
);
78 p
= isl_printer_start_line(p
);
79 p
= isl_printer_end_line(p
);
83 static __isl_give isl_printer
*copy_arrays_to_device(__isl_take isl_printer
*p
,
84 struct gpu_prog
*prog
)
88 for (i
= 0; i
< prog
->n_array
; ++i
) {
93 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
96 dim
= isl_space_copy(prog
->array
[i
].space
);
97 read_i
= isl_union_set_extract_set(prog
->copy_in
, dim
);
98 empty
= isl_set_fast_is_empty(read_i
);
103 p
= isl_printer_start_line(p
);
104 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(dev_");
105 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
106 p
= isl_printer_print_str(p
, ", ");
108 if (gpu_array_is_scalar(&prog
->array
[i
]))
109 p
= isl_printer_print_str(p
, "&");
110 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
111 p
= isl_printer_print_str(p
, ", ");
113 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
114 p
= isl_printer_print_str(p
, ", cudaMemcpyHostToDevice));");
115 p
= isl_printer_end_line(p
);
117 p
= isl_printer_start_line(p
);
118 p
= isl_printer_end_line(p
);
122 static void print_reverse_list(FILE *out
, int len
, int *list
)
130 for (i
= 0; i
< len
; ++i
) {
133 fprintf(out
, "%d", list
[len
- 1 - i
]);
138 /* Print the effective grid size as a list of the sizes in each
139 * dimension, from innermost to outermost.
141 static __isl_give isl_printer
*print_grid_size(__isl_take isl_printer
*p
,
142 struct ppcg_kernel
*kernel
)
147 dim
= isl_multi_pw_aff_dim(kernel
->grid_size
, isl_dim_set
);
151 p
= isl_printer_print_str(p
, "(");
152 for (i
= dim
- 1; i
>= 0; --i
) {
155 bound
= isl_multi_pw_aff_get_pw_aff(kernel
->grid_size
, i
);
156 p
= isl_printer_print_pw_aff(p
, bound
);
157 isl_pw_aff_free(bound
);
160 p
= isl_printer_print_str(p
, ", ");
163 p
= isl_printer_print_str(p
, ")");
168 /* Print the grid definition.
170 static __isl_give isl_printer
*print_grid(__isl_take isl_printer
*p
,
171 struct ppcg_kernel
*kernel
)
173 p
= isl_printer_start_line(p
);
174 p
= isl_printer_print_str(p
, "dim3 k");
175 p
= isl_printer_print_int(p
, kernel
->id
);
176 p
= isl_printer_print_str(p
, "_dimGrid");
177 p
= print_grid_size(p
, kernel
);
178 p
= isl_printer_print_str(p
, ";");
179 p
= isl_printer_end_line(p
);
184 /* Print the arguments to a kernel declaration or call. If "types" is set,
185 * then print a declaration (including the types of the arguments).
187 * The arguments are printed in the following order
188 * - the arrays accessed by the kernel
190 * - the host loop iterators
192 static __isl_give isl_printer
*print_kernel_arguments(__isl_take isl_printer
*p
,
193 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
, int types
)
201 for (i
= 0; i
< prog
->n_array
; ++i
) {
205 space
= isl_space_copy(prog
->array
[i
].space
);
206 arr
= isl_union_set_extract_set(kernel
->arrays
, space
);
207 empty
= isl_set_fast_is_empty(arr
);
213 p
= isl_printer_print_str(p
, ", ");
216 p
= isl_printer_print_str(p
, prog
->array
[i
].type
);
217 p
= isl_printer_print_str(p
, " ");
220 if (gpu_array_is_read_only_scalar(&prog
->array
[i
])) {
221 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
224 p
= isl_printer_print_str(p
, "*");
226 p
= isl_printer_print_str(p
, "dev_");
227 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
233 space
= isl_union_set_get_space(kernel
->arrays
);
234 nparam
= isl_space_dim(space
, isl_dim_param
);
235 for (i
= 0; i
< nparam
; ++i
) {
238 name
= isl_space_get_dim_name(space
, isl_dim_param
, i
);
241 p
= isl_printer_print_str(p
, ", ");
243 p
= isl_printer_print_str(p
, "int ");
244 p
= isl_printer_print_str(p
, name
);
248 isl_space_free(space
);
250 n
= isl_space_dim(kernel
->space
, isl_dim_set
);
251 type
= isl_options_get_ast_iterator_type(prog
->ctx
);
252 for (i
= 0; i
< n
; ++i
) {
257 p
= isl_printer_print_str(p
, ", ");
258 name
= isl_space_get_dim_name(kernel
->space
, isl_dim_set
, i
);
260 p
= isl_printer_print_str(p
, type
);
261 p
= isl_printer_print_str(p
, " ");
263 p
= isl_printer_print_str(p
, name
);
271 /* Print the header of the given kernel.
273 static __isl_give isl_printer
*print_kernel_header(__isl_take isl_printer
*p
,
274 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
)
276 p
= isl_printer_start_line(p
);
277 p
= isl_printer_print_str(p
, "__global__ void kernel");
278 p
= isl_printer_print_int(p
, kernel
->id
);
279 p
= isl_printer_print_str(p
, "(");
280 p
= print_kernel_arguments(p
, prog
, kernel
, 1);
281 p
= isl_printer_print_str(p
, ")");
286 /* Print the header of the given kernel to both gen->cuda.kernel_h
287 * and gen->cuda.kernel_c.
289 static void print_kernel_headers(struct gpu_prog
*prog
,
290 struct ppcg_kernel
*kernel
, struct cuda_info
*cuda
)
294 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_h
);
295 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
296 p
= print_kernel_header(p
, prog
, kernel
);
297 p
= isl_printer_print_str(p
, ";");
298 p
= isl_printer_end_line(p
);
301 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_c
);
302 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
303 p
= print_kernel_header(p
, prog
, kernel
);
304 p
= isl_printer_end_line(p
);
308 static void print_indent(FILE *dst
, int indent
)
310 fprintf(dst
, "%*s", indent
, "");
313 static void print_kernel_iterators(FILE *out
, struct ppcg_kernel
*kernel
)
316 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
318 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
319 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
322 type
= isl_options_get_ast_iterator_type(ctx
);
324 n_grid
= isl_multi_pw_aff_dim(kernel
->grid_size
, isl_dim_set
);
326 print_indent(out
, 4);
327 fprintf(out
, "%s ", type
);
328 for (i
= 0; i
< n_grid
; ++i
) {
331 fprintf(out
, "b%d = %s",
332 i
, block_dims
[n_grid
- 1 - i
]);
337 if (kernel
->n_block
> 0) {
338 print_indent(out
, 4);
339 fprintf(out
, "%s ", type
);
340 for (i
= 0; i
< kernel
->n_block
; ++i
) {
343 fprintf(out
, "t%d = %s",
344 i
, thread_dims
[kernel
->n_block
- 1 - i
]);
350 static __isl_give isl_printer
*print_kernel_var(__isl_take isl_printer
*p
,
351 struct ppcg_kernel_var
*var
)
355 p
= isl_printer_start_line(p
);
356 if (var
->type
== ppcg_access_shared
)
357 p
= isl_printer_print_str(p
, "__shared__ ");
358 p
= isl_printer_print_str(p
, var
->array
->type
);
359 p
= isl_printer_print_str(p
, " ");
360 p
= isl_printer_print_str(p
, var
->name
);
361 for (j
= 0; j
< var
->array
->n_index
; ++j
) {
364 p
= isl_printer_print_str(p
, "[");
365 v
= isl_vec_get_element_val(var
->size
, j
);
366 p
= isl_printer_print_val(p
, v
);
368 p
= isl_printer_print_str(p
, "]");
370 p
= isl_printer_print_str(p
, ";");
371 p
= isl_printer_end_line(p
);
376 static __isl_give isl_printer
*print_kernel_vars(__isl_take isl_printer
*p
,
377 struct ppcg_kernel
*kernel
)
381 for (i
= 0; i
< kernel
->n_var
; ++i
)
382 p
= print_kernel_var(p
, &kernel
->var
[i
]);
387 /* Print a sync statement.
389 static __isl_give isl_printer
*print_sync(__isl_take isl_printer
*p
,
390 struct ppcg_kernel_stmt
*stmt
)
392 p
= isl_printer_start_line(p
);
393 p
= isl_printer_print_str(p
, "__syncthreads();");
394 p
= isl_printer_end_line(p
);
399 /* This function is called for each user statement in the AST,
400 * i.e., for each kernel body statement, copy statement or sync statement.
402 static __isl_give isl_printer
*print_kernel_stmt(__isl_take isl_printer
*p
,
403 __isl_take isl_ast_print_options
*print_options
,
404 __isl_keep isl_ast_node
*node
, void *user
)
407 struct ppcg_kernel_stmt
*stmt
;
409 id
= isl_ast_node_get_annotation(node
);
410 stmt
= isl_id_get_user(id
);
413 isl_ast_print_options_free(print_options
);
415 switch (stmt
->type
) {
416 case ppcg_kernel_copy
:
417 return ppcg_kernel_print_copy(p
, stmt
);
418 case ppcg_kernel_sync
:
419 return print_sync(p
, stmt
);
420 case ppcg_kernel_domain
:
421 return ppcg_kernel_print_domain(p
, stmt
);
427 static void print_kernel(struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
,
428 struct cuda_info
*cuda
)
430 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
431 isl_ast_print_options
*print_options
;
434 print_kernel_headers(prog
, kernel
, cuda
);
435 fprintf(cuda
->kernel_c
, "{\n");
436 print_kernel_iterators(cuda
->kernel_c
, kernel
);
438 p
= isl_printer_to_file(ctx
, cuda
->kernel_c
);
439 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
440 p
= isl_printer_indent(p
, 4);
442 p
= print_kernel_vars(p
, kernel
);
443 p
= isl_printer_end_line(p
);
444 p
= gpu_print_macros(p
, kernel
->tree
);
446 print_options
= isl_ast_print_options_alloc(ctx
);
447 print_options
= isl_ast_print_options_set_print_user(print_options
,
448 &print_kernel_stmt
, NULL
);
449 p
= isl_ast_node_print(kernel
->tree
, p
, print_options
);
452 fprintf(cuda
->kernel_c
, "}\n");
455 struct print_host_user_data
{
456 struct cuda_info
*cuda
;
457 struct gpu_prog
*prog
;
460 /* Print the user statement of the host code to "p".
462 * In particular, print a block of statements that defines the grid
463 * and the block and then launches the kernel.
465 static __isl_give isl_printer
*print_host_user(__isl_take isl_printer
*p
,
466 __isl_take isl_ast_print_options
*print_options
,
467 __isl_keep isl_ast_node
*node
, void *user
)
470 struct ppcg_kernel
*kernel
;
471 struct print_host_user_data
*data
;
473 id
= isl_ast_node_get_annotation(node
);
474 kernel
= isl_id_get_user(id
);
477 data
= (struct print_host_user_data
*) user
;
479 p
= isl_printer_start_line(p
);
480 p
= isl_printer_print_str(p
, "{");
481 p
= isl_printer_end_line(p
);
482 p
= isl_printer_indent(p
, 2);
484 p
= isl_printer_start_line(p
);
485 p
= isl_printer_print_str(p
, "dim3 k");
486 p
= isl_printer_print_int(p
, kernel
->id
);
487 p
= isl_printer_print_str(p
, "_dimBlock");
488 print_reverse_list(isl_printer_get_file(p
),
489 kernel
->n_block
, kernel
->block_dim
);
490 p
= isl_printer_print_str(p
, ";");
491 p
= isl_printer_end_line(p
);
493 p
= print_grid(p
, kernel
);
495 p
= isl_printer_start_line(p
);
496 p
= isl_printer_print_str(p
, "kernel");
497 p
= isl_printer_print_int(p
, kernel
->id
);
498 p
= isl_printer_print_str(p
, " <<<k");
499 p
= isl_printer_print_int(p
, kernel
->id
);
500 p
= isl_printer_print_str(p
, "_dimGrid, k");
501 p
= isl_printer_print_int(p
, kernel
->id
);
502 p
= isl_printer_print_str(p
, "_dimBlock>>> (");
503 p
= print_kernel_arguments(p
, data
->prog
, kernel
, 0);
504 p
= isl_printer_print_str(p
, ");");
505 p
= isl_printer_end_line(p
);
507 p
= isl_printer_start_line(p
);
508 p
= isl_printer_print_str(p
, "cudaCheckKernel();");
509 p
= isl_printer_end_line(p
);
511 p
= isl_printer_indent(p
, -2);
512 p
= isl_printer_start_line(p
);
513 p
= isl_printer_print_str(p
, "}");
514 p
= isl_printer_end_line(p
);
516 p
= isl_printer_start_line(p
);
517 p
= isl_printer_end_line(p
);
519 print_kernel(data
->prog
, kernel
, data
->cuda
);
521 isl_ast_print_options_free(print_options
);
526 static __isl_give isl_printer
*print_host_code(__isl_take isl_printer
*p
,
527 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
528 struct cuda_info
*cuda
)
530 isl_ast_print_options
*print_options
;
531 isl_ctx
*ctx
= isl_ast_node_get_ctx(tree
);
532 struct print_host_user_data data
= { cuda
, prog
};
534 print_options
= isl_ast_print_options_alloc(ctx
);
535 print_options
= isl_ast_print_options_set_print_user(print_options
,
536 &print_host_user
, &data
);
538 p
= gpu_print_macros(p
, tree
);
539 p
= isl_ast_node_print(tree
, p
, print_options
);
544 /* For each array that needs to be copied out (based on prog->copy_out),
545 * copy the contents back from the GPU to the host.
547 * If any element of a given array appears in prog->copy_out, then its
548 * entire extent is in prog->copy_out. The bounds on this extent have
549 * been precomputed in extract_array_info and are used in
550 * gpu_array_info_print_size.
552 static __isl_give isl_printer
*copy_arrays_from_device(
553 __isl_take isl_printer
*p
, struct gpu_prog
*prog
)
556 isl_union_set
*copy_out
;
557 copy_out
= isl_union_set_copy(prog
->copy_out
);
559 for (i
= 0; i
< prog
->n_array
; ++i
) {
564 dim
= isl_space_copy(prog
->array
[i
].space
);
565 copy_out_i
= isl_union_set_extract_set(copy_out
, dim
);
566 empty
= isl_set_fast_is_empty(copy_out_i
);
567 isl_set_free(copy_out_i
);
571 p
= isl_printer_start_line(p
);
572 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaMemcpy(");
573 if (gpu_array_is_scalar(&prog
->array
[i
]))
574 p
= isl_printer_print_str(p
, "&");
575 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
576 p
= isl_printer_print_str(p
, ", dev_");
577 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
578 p
= isl_printer_print_str(p
, ", ");
579 p
= gpu_array_info_print_size(p
, &prog
->array
[i
]);
580 p
= isl_printer_print_str(p
, ", cudaMemcpyDeviceToHost));");
581 p
= isl_printer_end_line(p
);
584 isl_union_set_free(copy_out
);
585 p
= isl_printer_start_line(p
);
586 p
= isl_printer_end_line(p
);
590 static __isl_give isl_printer
*free_device_arrays(__isl_take isl_printer
*p
,
591 struct gpu_prog
*prog
)
595 for (i
= 0; i
< prog
->n_array
; ++i
) {
596 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
598 p
= isl_printer_start_line(p
);
599 p
= isl_printer_print_str(p
, "cudaCheckReturn(cudaFree(dev_");
600 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
601 p
= isl_printer_print_str(p
, "));");
602 p
= isl_printer_end_line(p
);
608 /* Given a gpu_prog "prog" and the corresponding transformed AST
609 * "tree", print the entire CUDA code to "p".
611 static __isl_give isl_printer
*print_cuda(__isl_take isl_printer
*p
,
612 struct gpu_prog
*prog
, __isl_keep isl_ast_node
*tree
,
615 struct cuda_info
*cuda
= user
;
617 p
= ppcg_start_block(p
);
619 p
= print_cuda_macros(p
);
621 p
= declare_device_arrays(p
, prog
);
622 p
= allocate_device_arrays(p
, prog
);
623 p
= copy_arrays_to_device(p
, prog
);
625 p
= print_host_code(p
, prog
, tree
, cuda
);
627 p
= copy_arrays_from_device(p
, prog
);
628 p
= free_device_arrays(p
, prog
);
630 p
= ppcg_end_block(p
);
635 /* Transform the code in the file called "input" by replacing
636 * all scops by corresponding CUDA code.
637 * The names of the output files are derived from "input".
639 * We let generate_gpu do all the hard work and then let it call
640 * us back for printing the AST in print_cuda.
642 * To prepare for this printing, we first open the output files
643 * and we close them after generate_gpu has finished.
645 int generate_cuda(isl_ctx
*ctx
, struct ppcg_options
*options
,
648 struct cuda_info cuda
;
651 cuda_open_files(&cuda
, input
);
653 r
= generate_gpu(ctx
, input
, cuda
.host_c
, options
, &print_cuda
, &cuda
);
655 cuda_close_files(&cuda
);