2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
13 #include "cuda_common.h"
16 #include "pet_printer.h"
19 void print_cuda_macros(FILE *file
)
22 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
23 "#define cudaCheckKernel()"
24 " assert(cudaGetLastError() == cudaSuccess)\n\n";
28 static void print_array_size(isl_ctx
*ctx
, FILE *out
,
29 struct gpu_array_info
*array
)
34 prn
= isl_printer_to_file(ctx
, out
);
35 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
36 for (i
= 0; i
< array
->n_index
; ++i
) {
37 prn
= isl_printer_print_str(prn
, "(");
38 prn
= isl_printer_print_pw_aff(prn
, array
->bound
[i
]);
39 prn
= isl_printer_print_str(prn
, ") * ");
41 prn
= isl_printer_print_str(prn
, "sizeof(");
42 prn
= isl_printer_print_str(prn
, array
->type
);
43 prn
= isl_printer_print_str(prn
, ")");
44 isl_printer_free(prn
);
47 static void declare_device_arrays(FILE *out
, struct gpu_prog
*prog
)
51 for (i
= 0; i
< prog
->n_array
; ++i
) {
52 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
54 fprintf(out
, "%s *dev_%s;\n",
55 prog
->array
[i
].type
, prog
->array
[i
].name
);
60 static void allocate_device_arrays(FILE *out
, struct gpu_prog
*prog
)
64 for (i
= 0; i
< prog
->n_array
; ++i
) {
65 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
68 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
70 print_array_size(prog
->ctx
, out
, &prog
->array
[i
]);
71 fprintf(out
, "));\n");
76 static void copy_arrays_to_device(FILE *out
, struct gpu_prog
*prog
)
80 for (i
= 0; i
< prog
->n_array
; ++i
) {
85 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
88 dim
= isl_space_copy(prog
->array
[i
].dim
);
89 read_i
= isl_union_set_extract_set(prog
->copy_in
, dim
);
90 empty
= isl_set_fast_is_empty(read_i
);
95 fprintf(out
, "cudaCheckReturn(cudaMemcpy(dev_%s,",
98 if (gpu_array_is_scalar(&prog
->array
[i
]))
99 fprintf(out
, " &%s, ", prog
->array
[i
].name
);
101 fprintf(out
, " %s, ", prog
->array
[i
].name
);
103 print_array_size(prog
->ctx
, out
, &prog
->array
[i
]);
104 fprintf(out
, ", cudaMemcpyHostToDevice));\n");
109 static void print_reverse_list(FILE *out
, int len
, int *list
)
117 for (i
= 0; i
< len
; ++i
) {
120 fprintf(out
, "%d", list
[len
- 1 - i
]);
125 /* Print the effective grid size as a list of the sizes in each
126 * dimension, from innermost to outermost.
128 * The grid size specified by the user or set by default
129 * in read_grid_sizes() and applied in tile_schedule(),
130 * may be too large for the given code in the sense that
131 * it may contain blocks that don't need to execute anything.
132 * We therefore don't print this grid size, but instead the
133 * smallest grid size that ensures that all blocks that actually
134 * execute code are included in the grid.
136 * For each block dimension, we compute the maximal value of the block id
139 static __isl_give isl_printer
*print_grid_size(__isl_take isl_printer
*p
,
140 struct ppcg_kernel
*kernel
)
145 dim
= isl_set_dim(kernel
->grid
, isl_dim_set
);
149 p
= isl_printer_print_str(p
, "(");
150 for (i
= dim
- 1; i
>= 0; --i
) {
155 bound
= isl_set_dim_max(isl_set_copy(kernel
->grid
), i
);
156 bound
= isl_pw_aff_coalesce(bound
);
157 bound
= isl_pw_aff_gist(bound
, isl_set_copy(kernel
->context
));
159 space
= isl_pw_aff_get_domain_space(bound
);
160 one
= isl_aff_zero_on_domain(isl_local_space_from_space(space
));
161 one
= isl_aff_add_constant_si(one
, 1);
162 bound
= isl_pw_aff_add(bound
, isl_pw_aff_from_aff(one
));
163 p
= isl_printer_print_pw_aff(p
, bound
);
164 isl_pw_aff_free(bound
);
167 p
= isl_printer_print_str(p
, ", ");
170 p
= isl_printer_print_str(p
, ")");
175 /* Print the grid definition.
177 static __isl_give isl_printer
*print_grid(__isl_take isl_printer
*p
,
178 struct ppcg_kernel
*kernel
)
180 p
= isl_printer_start_line(p
);
181 p
= isl_printer_print_str(p
, "dim3 k");
182 p
= isl_printer_print_int(p
, kernel
->id
);
183 p
= isl_printer_print_str(p
, "_dimGrid");
184 p
= print_grid_size(p
, kernel
);
185 p
= isl_printer_print_str(p
, ";");
186 p
= isl_printer_end_line(p
);
191 /* Print the arguments to a kernel declaration or call. If "types" is set,
192 * then print a declaration (including the types of the arguments).
194 * The arguments are printed in the following order
195 * - the arrays accessed by the kernel
197 * - the host loop iterators
199 static __isl_give isl_printer
*print_kernel_arguments(__isl_take isl_printer
*p
,
200 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
, int types
)
208 for (i
= 0; i
< prog
->n_array
; ++i
) {
212 space
= isl_space_copy(prog
->array
[i
].dim
);
213 arr
= isl_union_set_extract_set(kernel
->arrays
, space
);
214 empty
= isl_set_fast_is_empty(arr
);
220 p
= isl_printer_print_str(p
, ", ");
223 p
= isl_printer_print_str(p
, prog
->array
[i
].type
);
224 p
= isl_printer_print_str(p
, " ");
227 if (gpu_array_is_read_only_scalar(&prog
->array
[i
])) {
228 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
231 p
= isl_printer_print_str(p
, "*");
233 p
= isl_printer_print_str(p
, "dev_");
234 p
= isl_printer_print_str(p
, prog
->array
[i
].name
);
240 space
= isl_union_set_get_space(kernel
->arrays
);
241 nparam
= isl_space_dim(space
, isl_dim_param
);
242 for (i
= 0; i
< nparam
; ++i
) {
245 name
= isl_space_get_dim_name(space
, isl_dim_param
, i
);
248 p
= isl_printer_print_str(p
, ", ");
250 p
= isl_printer_print_str(p
, "int ");
251 p
= isl_printer_print_str(p
, name
);
255 isl_space_free(space
);
257 n
= isl_space_dim(kernel
->space
, isl_dim_set
);
258 type
= isl_options_get_ast_iterator_type(prog
->ctx
);
259 for (i
= 0; i
< n
; ++i
) {
264 p
= isl_printer_print_str(p
, ", ");
265 name
= isl_space_get_dim_name(kernel
->space
, isl_dim_set
, i
);
267 p
= isl_printer_print_str(p
, type
);
268 p
= isl_printer_print_str(p
, " ");
270 p
= isl_printer_print_str(p
, name
);
278 /* Print the header of the given kernel.
280 static __isl_give isl_printer
*print_kernel_header(__isl_take isl_printer
*p
,
281 struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
)
283 p
= isl_printer_start_line(p
);
284 p
= isl_printer_print_str(p
, "__global__ void kernel");
285 p
= isl_printer_print_int(p
, kernel
->id
);
286 p
= isl_printer_print_str(p
, "(");
287 p
= print_kernel_arguments(p
, prog
, kernel
, 1);
288 p
= isl_printer_print_str(p
, ")");
293 /* Print the header of the given kernel to both gen->cuda.kernel_h
294 * and gen->cuda.kernel_c.
296 static void print_kernel_headers(struct gpu_prog
*prog
,
297 struct ppcg_kernel
*kernel
, struct cuda_info
*cuda
)
301 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_h
);
302 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
303 p
= print_kernel_header(p
, prog
, kernel
);
304 p
= isl_printer_print_str(p
, ";");
305 p
= isl_printer_end_line(p
);
308 p
= isl_printer_to_file(prog
->ctx
, cuda
->kernel_c
);
309 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
310 p
= print_kernel_header(p
, prog
, kernel
);
311 p
= isl_printer_end_line(p
);
315 static void print_indent(FILE *dst
, int indent
)
317 fprintf(dst
, "%*s", indent
, "");
320 static void print_kernel_iterators(FILE *out
, struct ppcg_kernel
*kernel
)
323 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
324 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
327 if (kernel
->n_grid
> 0) {
328 print_indent(out
, 4);
329 fprintf(out
, "int ");
330 for (i
= 0; i
< kernel
->n_grid
; ++i
) {
333 fprintf(out
, "b%d = %s",
334 i
, block_dims
[kernel
->n_grid
- 1 - i
]);
339 if (kernel
->n_block
> 0) {
340 print_indent(out
, 4);
341 fprintf(out
, "int ");
342 for (i
= 0; i
< kernel
->n_block
; ++i
) {
345 fprintf(out
, "t%d = %s",
346 i
, thread_dims
[kernel
->n_block
- 1 - i
]);
352 static void print_kernel_var(FILE *out
, struct ppcg_kernel_var
*var
)
357 print_indent(out
, 4);
358 if (var
->type
== ppcg_access_shared
)
359 fprintf(out
, "__shared__ ");
360 fprintf(out
, "%s %s", var
->array
->type
, var
->name
);
362 for (j
= 0; j
< var
->array
->n_index
; ++j
) {
364 isl_vec_get_element(var
->size
, j
, &v
);
365 isl_int_print(out
, v
, 0);
372 static void print_kernel_vars(FILE *out
, struct ppcg_kernel
*kernel
)
376 for (i
= 0; i
< kernel
->n_var
; ++i
)
377 print_kernel_var(out
, &kernel
->var
[i
]);
380 /* Print an access to the element in the private/shared memory copy
381 * described by "stmt". The index of the copy is recorded in
384 static __isl_give isl_printer
*stmt_print_local_index(__isl_take isl_printer
*p
,
385 struct ppcg_kernel_stmt
*stmt
)
389 struct gpu_array_info
*array
= stmt
->u
.c
.array
;
391 name
= isl_pw_multi_aff_get_tuple_name(stmt
->u
.c
.local_index
,
393 p
= isl_printer_print_str(p
, name
);
394 for (i
= 0; i
< array
->n_index
; ++i
) {
396 pa
= isl_pw_multi_aff_get_pw_aff(stmt
->u
.c
.local_index
, i
);
398 p
= isl_printer_print_str(p
, "[");
399 p
= isl_printer_print_pw_aff(p
, pa
);
400 p
= isl_printer_print_str(p
, "]");
408 /* Print an access to the element in the global memory copy
409 * described by "stmt". The index of the copy is recorded in
412 * The copy in global memory has been linearized, so we need to take
413 * the array size into account.
415 static __isl_give isl_printer
*stmt_print_global_index(
416 __isl_take isl_printer
*p
, struct ppcg_kernel_stmt
*stmt
)
419 struct gpu_array_info
*array
= stmt
->u
.c
.array
;
420 isl_pw_aff_list
*bound
= stmt
->u
.c
.local_array
->bound
;
422 if (gpu_array_is_scalar(array
)) {
423 if (!array
->read_only
)
424 p
= isl_printer_print_str(p
, "*");
425 p
= isl_printer_print_str(p
, array
->name
);
429 p
= isl_printer_print_str(p
, array
->name
);
430 p
= isl_printer_print_str(p
, "[");
431 for (i
= 0; i
+ 1 < array
->n_index
; ++i
)
432 p
= isl_printer_print_str(p
, "(");
433 for (i
= 0; i
< array
->n_index
; ++i
) {
434 isl_pw_aff
*pa
= isl_pw_multi_aff_get_pw_aff(stmt
->u
.c
.index
, i
);
435 pa
= isl_pw_aff_coalesce(pa
);
436 pa
= isl_pw_aff_gist_params(pa
, isl_set_copy(stmt
->u
.c
.domain
));
439 bound_i
= isl_pw_aff_list_get_pw_aff(bound
, i
);
440 p
= isl_printer_print_str(p
, ") * (");
441 p
= isl_printer_print_pw_aff(p
, bound_i
);
442 p
= isl_printer_print_str(p
, ") + ");
443 isl_pw_aff_free(bound_i
);
445 p
= isl_printer_print_pw_aff(p
, pa
);
448 p
= isl_printer_print_str(p
, "]");
453 /* Print a copy statement.
455 * A read copy statement is printed as
459 * while a write copy statement is printed as
463 static __isl_give isl_printer
*print_copy(__isl_take isl_printer
*p
,
464 struct ppcg_kernel_stmt
*stmt
)
466 p
= isl_printer_start_line(p
);
467 if (stmt
->u
.c
.read
) {
468 p
= stmt_print_local_index(p
, stmt
);
469 p
= isl_printer_print_str(p
, " = ");
470 p
= stmt_print_global_index(p
, stmt
);
472 p
= stmt_print_global_index(p
, stmt
);
473 p
= isl_printer_print_str(p
, " = ");
474 p
= stmt_print_local_index(p
, stmt
);
476 p
= isl_printer_print_str(p
, ";");
477 p
= isl_printer_end_line(p
);
482 /* Print a sync statement.
484 static __isl_give isl_printer
*print_sync(__isl_take isl_printer
*p
,
485 struct ppcg_kernel_stmt
*stmt
)
487 p
= isl_printer_start_line(p
);
488 p
= isl_printer_print_str(p
, "__syncthreads();");
489 p
= isl_printer_end_line(p
);
494 /* Print an access based on the information in "access".
495 * If this an access to global memory, then the index expression
498 * If access->array is NULL, then we are
499 * accessing an iterator in the original program.
501 static __isl_give isl_printer
*print_access(__isl_take isl_printer
*p
,
502 struct ppcg_kernel_access
*access
)
506 struct gpu_array_info
*array
;
507 isl_pw_aff_list
*bound
;
509 array
= access
->array
;
510 bound
= array
? access
->local_array
->bound
: NULL
;
512 p
= isl_printer_print_str(p
, "(");
514 if (access
->type
== ppcg_access_global
&&
515 gpu_array_is_scalar(array
) && !array
->read_only
)
516 p
= isl_printer_print_str(p
, "*");
517 p
= isl_printer_print_str(p
, access
->local_name
);
518 if (gpu_array_is_scalar(array
))
520 p
= isl_printer_print_str(p
, "[");
523 n_index
= isl_ast_expr_list_n_ast_expr(access
->index
);
524 if (access
->type
== ppcg_access_global
)
525 for (i
= 0; i
+ 1 < n_index
; ++i
)
526 p
= isl_printer_print_str(p
, "(");
528 for (i
= 0; i
< n_index
; ++i
) {
531 index
= isl_ast_expr_list_get_ast_expr(access
->index
, i
);
533 if (access
->type
== ppcg_access_global
) {
535 bound_i
= isl_pw_aff_list_get_pw_aff(bound
, i
);
536 p
= isl_printer_print_str(p
, ") * (");
537 p
= isl_printer_print_pw_aff(p
, bound_i
);
538 p
= isl_printer_print_str(p
, ") + ");
539 isl_pw_aff_free(bound_i
);
541 p
= isl_printer_print_str(p
, "][");
543 p
= isl_printer_print_ast_expr(p
, index
);
544 isl_ast_expr_free(index
);
547 p
= isl_printer_print_str(p
, ")");
549 p
= isl_printer_print_str(p
, "]");
554 struct cuda_access_print_info
{
556 struct ppcg_kernel_stmt
*stmt
;
559 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
560 * with the pet printer. This means that whenever the pet printer prints a
561 * pet access expression we have the corresponding cuda access available and can
562 * print the modified access.
564 static __isl_give isl_printer
*print_cuda_access(__isl_take isl_printer
*p
,
565 struct pet_expr
*expr
, void *usr
)
567 struct cuda_access_print_info
*info
=
568 (struct cuda_access_print_info
*) usr
;
570 p
= print_access(p
, &info
->stmt
->u
.d
.access
[info
->i
]);
576 static __isl_give isl_printer
*print_stmt_body(__isl_take isl_printer
*p
,
577 struct ppcg_kernel_stmt
*stmt
)
579 struct cuda_access_print_info info
;
584 p
= isl_printer_start_line(p
);
585 p
= print_pet_expr(p
, stmt
->u
.d
.stmt
->body
, &print_cuda_access
, &info
);
586 p
= isl_printer_print_str(p
, ";");
587 p
= isl_printer_end_line(p
);
592 /* This function is called for each user statement in the AST,
593 * i.e., for each kernel body statement, copy statement or sync statement.
595 static __isl_give isl_printer
*print_kernel_stmt(__isl_take isl_printer
*p
,
596 __isl_keep isl_ast_node
*node
, void *user
)
599 struct ppcg_kernel_stmt
*stmt
;
601 id
= isl_ast_node_get_annotation(node
);
602 stmt
= isl_id_get_user(id
);
605 switch (stmt
->type
) {
606 case ppcg_kernel_copy
:
607 return print_copy(p
, stmt
);
608 case ppcg_kernel_sync
:
609 return print_sync(p
, stmt
);
610 case ppcg_kernel_domain
:
611 return print_stmt_body(p
, stmt
);
617 static int print_macro(enum isl_ast_op_type type
, void *user
)
619 isl_printer
**p
= user
;
621 if (type
== isl_ast_op_fdiv_q
)
624 *p
= isl_ast_op_type_print_macro(type
, *p
);
629 /* Print the required macros for "node", including one for floord.
630 * We always print a macro for floord as it may also appear in the statements.
632 static __isl_give isl_printer
*print_macros(
633 __isl_keep isl_ast_node
*node
, __isl_take isl_printer
*p
)
635 p
= isl_ast_op_type_print_macro(isl_ast_op_fdiv_q
, p
);
636 if (isl_ast_node_foreach_ast_op_type(node
, &print_macro
, &p
) < 0)
637 return isl_printer_free(p
);
641 static void print_kernel(struct gpu_prog
*prog
, struct ppcg_kernel
*kernel
,
642 struct cuda_info
*cuda
)
644 isl_ctx
*ctx
= isl_ast_node_get_ctx(kernel
->tree
);
645 isl_ast_print_options
*print_options
;
648 print_kernel_headers(prog
, kernel
, cuda
);
649 fprintf(cuda
->kernel_c
, "{\n");
650 print_kernel_iterators(cuda
->kernel_c
, kernel
);
651 print_kernel_vars(cuda
->kernel_c
, kernel
);
652 fprintf(cuda
->kernel_c
, "\n");
654 print_options
= isl_ast_print_options_alloc(ctx
);
655 print_options
= isl_ast_print_options_set_print_user(print_options
,
656 &print_kernel_stmt
, NULL
);
658 p
= isl_printer_to_file(ctx
, cuda
->kernel_c
);
659 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
660 p
= isl_printer_indent(p
, 4);
661 p
= print_macros(kernel
->tree
, p
);
662 p
= isl_ast_node_print(kernel
->tree
, p
, print_options
);
665 isl_ast_print_options_free(print_options
);
667 fprintf(cuda
->kernel_c
, "}\n");
670 struct print_host_user_data
{
671 struct cuda_info
*cuda
;
672 struct gpu_prog
*prog
;
675 /* Print the user statement of the host code to "p".
677 * In particular, print a block of statements that defines the grid
678 * and the block and then launches the kernel.
680 static __isl_give isl_printer
*print_host_user(__isl_take isl_printer
*p
,
681 __isl_keep isl_ast_node
*node
, void *user
)
684 struct ppcg_kernel
*kernel
;
685 struct print_host_user_data
*data
;
687 id
= isl_ast_node_get_annotation(node
);
688 kernel
= isl_id_get_user(id
);
691 data
= (struct print_host_user_data
*) user
;
693 p
= isl_printer_start_line(p
);
694 p
= isl_printer_print_str(p
, "{");
695 p
= isl_printer_end_line(p
);
696 p
= isl_printer_indent(p
, 2);
698 p
= isl_printer_start_line(p
);
699 p
= isl_printer_print_str(p
, "dim3 k");
700 p
= isl_printer_print_int(p
, kernel
->id
);
701 p
= isl_printer_print_str(p
, "_dimBlock");
702 print_reverse_list(isl_printer_get_file(p
),
703 kernel
->n_block
, kernel
->block_dim
);
704 p
= isl_printer_print_str(p
, ";");
705 p
= isl_printer_end_line(p
);
707 p
= print_grid(p
, kernel
);
709 p
= isl_printer_start_line(p
);
710 p
= isl_printer_print_str(p
, "kernel");
711 p
= isl_printer_print_int(p
, kernel
->id
);
712 p
= isl_printer_print_str(p
, " <<<k");
713 p
= isl_printer_print_int(p
, kernel
->id
);
714 p
= isl_printer_print_str(p
, "_dimGrid, k");
715 p
= isl_printer_print_int(p
, kernel
->id
);
716 p
= isl_printer_print_str(p
, "_dimBlock>>> (");
717 p
= print_kernel_arguments(p
, data
->prog
, kernel
, 0);
718 p
= isl_printer_print_str(p
, ");");
719 p
= isl_printer_end_line(p
);
721 p
= isl_printer_start_line(p
);
722 p
= isl_printer_print_str(p
, "cudaCheckKernel();");
723 p
= isl_printer_end_line(p
);
725 p
= isl_printer_indent(p
, -2);
726 p
= isl_printer_start_line(p
);
727 p
= isl_printer_print_str(p
, "}");
728 p
= isl_printer_end_line(p
);
730 p
= isl_printer_start_line(p
);
731 p
= isl_printer_end_line(p
);
733 print_kernel(data
->prog
, kernel
, data
->cuda
);
738 static void print_host_code(FILE *out
, struct gpu_prog
*prog
,
739 __isl_keep isl_ast_node
*tree
, struct cuda_info
*cuda
)
741 isl_ast_print_options
*print_options
;
743 isl_ctx
*ctx
= isl_ast_node_get_ctx(tree
);
744 struct print_host_user_data data
= { cuda
, prog
};
746 print_options
= isl_ast_print_options_alloc(ctx
);
747 print_options
= isl_ast_print_options_set_print_user(print_options
,
748 &print_host_user
, &data
);
750 p
= isl_printer_to_file(ctx
, out
);
751 p
= isl_printer_set_output_format(p
, ISL_FORMAT_C
);
752 p
= print_macros(tree
, p
);
753 p
= isl_ast_node_print(tree
, p
, print_options
);
756 isl_ast_print_options_free(print_options
);
759 static void copy_arrays_from_device(FILE *out
, struct gpu_prog
*prog
)
762 isl_union_set
*write
;
763 write
= isl_union_map_range(isl_union_map_copy(prog
->write
));
765 for (i
= 0; i
< prog
->n_array
; ++i
) {
770 dim
= isl_space_copy(prog
->array
[i
].dim
);
771 write_i
= isl_union_set_extract_set(write
, dim
);
772 empty
= isl_set_fast_is_empty(write_i
);
773 isl_set_free(write_i
);
777 fprintf(out
, "cudaCheckReturn(cudaMemcpy(");
778 if (gpu_array_is_scalar(&prog
->array
[i
]))
779 fprintf(out
, "&%s, ", prog
->array
[i
].name
);
781 fprintf(out
, "%s, ", prog
->array
[i
].name
);
782 fprintf(out
, "dev_%s, ", prog
->array
[i
].name
);
783 print_array_size(prog
->ctx
, out
, &prog
->array
[i
]);
784 fprintf(out
, ", cudaMemcpyDeviceToHost));\n");
787 isl_union_set_free(write
);
791 static void free_device_arrays(FILE *out
, struct gpu_prog
*prog
)
795 for (i
= 0; i
< prog
->n_array
; ++i
) {
796 if (gpu_array_is_read_only_scalar(&prog
->array
[i
]))
798 fprintf(out
, "cudaCheckReturn(cudaFree(dev_%s));\n",
799 prog
->array
[i
].name
);
803 int generate_cuda(isl_ctx
*ctx
, struct pet_scop
*scop
,
804 struct ppcg_options
*options
, const char *input
)
806 struct cuda_info cuda
;
807 struct gpu_prog
*prog
;
813 scop
->context
= add_context_from_str(scop
->context
, options
->ctx
);
815 prog
= gpu_prog_alloc(ctx
, scop
);
817 tree
= generate_gpu(ctx
, prog
, options
);
819 cuda_open_files(&cuda
, input
);
821 fprintf(cuda
.host_c
, "{\n");
823 print_cuda_macros(cuda
.host_c
);
825 declare_device_arrays(cuda
.host_c
, prog
);
826 allocate_device_arrays(cuda
.host_c
, prog
);
827 copy_arrays_to_device(cuda
.host_c
, prog
);
829 print_host_code(cuda
.host_c
, prog
, tree
, &cuda
);
830 isl_ast_node_free(tree
);
832 copy_arrays_from_device(cuda
.host_c
, prog
);
833 free_device_arrays(cuda
.host_c
, prog
);
835 fprintf(cuda
.host_c
, "}\n");
837 cuda_close_files(&cuda
);