gpu.c: compute_schedule: take domain from ppcg_scop
[ppcg.git] / cuda.c
blobc1239c9ca47af543b11e4ad2351b355cb0965f0a
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "pet_printer.h"
17 #include "schedule.h"
19 void print_cuda_macros(FILE *file)
21 const char *macros =
22 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
23 "#define cudaCheckKernel()"
24 " assert(cudaGetLastError() == cudaSuccess)\n\n";
25 fputs(macros, file);
28 static void print_array_size(isl_ctx *ctx, FILE *out,
29 struct gpu_array_info *array)
31 int i;
32 isl_printer *prn;
34 prn = isl_printer_to_file(ctx, out);
35 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
36 for (i = 0; i < array->n_index; ++i) {
37 prn = isl_printer_print_str(prn, "(");
38 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
39 prn = isl_printer_print_str(prn, ") * ");
41 prn = isl_printer_print_str(prn, "sizeof(");
42 prn = isl_printer_print_str(prn, array->type);
43 prn = isl_printer_print_str(prn, ")");
44 isl_printer_free(prn);
47 static void declare_device_arrays(FILE *out, struct gpu_prog *prog)
49 int i;
51 for (i = 0; i < prog->n_array; ++i) {
52 if (gpu_array_is_read_only_scalar(&prog->array[i]))
53 continue;
54 fprintf(out, "%s *dev_%s;\n",
55 prog->array[i].type, prog->array[i].name);
57 fprintf(out, "\n");
60 static void allocate_device_arrays(FILE *out, struct gpu_prog *prog)
62 int i;
64 for (i = 0; i < prog->n_array; ++i) {
65 if (gpu_array_is_read_only_scalar(&prog->array[i]))
66 continue;
67 fprintf(out,
68 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
69 prog->array[i].name);
70 print_array_size(prog->ctx, out, &prog->array[i]);
71 fprintf(out, "));\n");
73 fprintf(out, "\n");
76 static void copy_arrays_to_device(FILE *out, struct gpu_prog *prog)
78 int i;
80 for (i = 0; i < prog->n_array; ++i) {
81 isl_space *dim;
82 isl_set *read_i;
83 int empty;
85 if (gpu_array_is_read_only_scalar(&prog->array[i]))
86 continue;
88 dim = isl_space_copy(prog->array[i].dim);
89 read_i = isl_union_set_extract_set(prog->copy_in, dim);
90 empty = isl_set_fast_is_empty(read_i);
91 isl_set_free(read_i);
92 if (empty)
93 continue;
95 fprintf(out, "cudaCheckReturn(cudaMemcpy(dev_%s,",
96 prog->array[i].name);
98 if (gpu_array_is_scalar(&prog->array[i]))
99 fprintf(out, " &%s, ", prog->array[i].name);
100 else
101 fprintf(out, " %s, ", prog->array[i].name);
103 print_array_size(prog->ctx, out, &prog->array[i]);
104 fprintf(out, ", cudaMemcpyHostToDevice));\n");
106 fprintf(out, "\n");
109 static void print_reverse_list(FILE *out, int len, int *list)
111 int i;
113 if (len == 0)
114 return;
116 fprintf(out, "(");
117 for (i = 0; i < len; ++i) {
118 if (i)
119 fprintf(out, ", ");
120 fprintf(out, "%d", list[len - 1 - i]);
122 fprintf(out, ")");
125 /* Print the effective grid size as a list of the sizes in each
126 * dimension, from innermost to outermost.
128 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
129 struct ppcg_kernel *kernel)
131 int i;
132 int dim;
134 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
135 if (dim == 0)
136 return p;
138 p = isl_printer_print_str(p, "(");
139 for (i = dim - 1; i >= 0; --i) {
140 isl_pw_aff *bound;
142 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
143 p = isl_printer_print_pw_aff(p, bound);
144 isl_pw_aff_free(bound);
146 if (i > 0)
147 p = isl_printer_print_str(p, ", ");
150 p = isl_printer_print_str(p, ")");
152 return p;
155 /* Print the grid definition.
157 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
158 struct ppcg_kernel *kernel)
160 p = isl_printer_start_line(p);
161 p = isl_printer_print_str(p, "dim3 k");
162 p = isl_printer_print_int(p, kernel->id);
163 p = isl_printer_print_str(p, "_dimGrid");
164 p = print_grid_size(p, kernel);
165 p = isl_printer_print_str(p, ";");
166 p = isl_printer_end_line(p);
168 return p;
171 /* Print the arguments to a kernel declaration or call. If "types" is set,
172 * then print a declaration (including the types of the arguments).
174 * The arguments are printed in the following order
175 * - the arrays accessed by the kernel
176 * - the parameters
177 * - the host loop iterators
179 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
180 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
182 int i, n;
183 int first = 1;
184 unsigned nparam;
185 isl_space *space;
186 const char *type;
188 for (i = 0; i < prog->n_array; ++i) {
189 isl_set *arr;
190 int empty;
192 space = isl_space_copy(prog->array[i].dim);
193 arr = isl_union_set_extract_set(kernel->arrays, space);
194 empty = isl_set_fast_is_empty(arr);
195 isl_set_free(arr);
196 if (empty)
197 continue;
199 if (!first)
200 p = isl_printer_print_str(p, ", ");
202 if (types) {
203 p = isl_printer_print_str(p, prog->array[i].type);
204 p = isl_printer_print_str(p, " ");
207 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
208 p = isl_printer_print_str(p, prog->array[i].name);
209 } else {
210 if (types)
211 p = isl_printer_print_str(p, "*");
212 else
213 p = isl_printer_print_str(p, "dev_");
214 p = isl_printer_print_str(p, prog->array[i].name);
217 first = 0;
220 space = isl_union_set_get_space(kernel->arrays);
221 nparam = isl_space_dim(space, isl_dim_param);
222 for (i = 0; i < nparam; ++i) {
223 const char *name;
225 name = isl_space_get_dim_name(space, isl_dim_param, i);
227 if (!first)
228 p = isl_printer_print_str(p, ", ");
229 if (types)
230 p = isl_printer_print_str(p, "int ");
231 p = isl_printer_print_str(p, name);
233 first = 0;
235 isl_space_free(space);
237 n = isl_space_dim(kernel->space, isl_dim_set);
238 type = isl_options_get_ast_iterator_type(prog->ctx);
239 for (i = 0; i < n; ++i) {
240 const char *name;
241 isl_id *id;
243 if (!first)
244 p = isl_printer_print_str(p, ", ");
245 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
246 if (types) {
247 p = isl_printer_print_str(p, type);
248 p = isl_printer_print_str(p, " ");
250 p = isl_printer_print_str(p, name);
252 first = 0;
255 return p;
258 /* Print the header of the given kernel.
260 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
261 struct gpu_prog *prog, struct ppcg_kernel *kernel)
263 p = isl_printer_start_line(p);
264 p = isl_printer_print_str(p, "__global__ void kernel");
265 p = isl_printer_print_int(p, kernel->id);
266 p = isl_printer_print_str(p, "(");
267 p = print_kernel_arguments(p, prog, kernel, 1);
268 p = isl_printer_print_str(p, ")");
270 return p;
273 /* Print the header of the given kernel to both gen->cuda.kernel_h
274 * and gen->cuda.kernel_c.
276 static void print_kernel_headers(struct gpu_prog *prog,
277 struct ppcg_kernel *kernel, struct cuda_info *cuda)
279 isl_printer *p;
281 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
282 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
283 p = print_kernel_header(p, prog, kernel);
284 p = isl_printer_print_str(p, ";");
285 p = isl_printer_end_line(p);
286 isl_printer_free(p);
288 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
289 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
290 p = print_kernel_header(p, prog, kernel);
291 p = isl_printer_end_line(p);
292 isl_printer_free(p);
295 static void print_indent(FILE *dst, int indent)
297 fprintf(dst, "%*s", indent, "");
300 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
302 int i;
303 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
304 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
305 "threadIdx.z" };
307 if (kernel->n_grid > 0) {
308 print_indent(out, 4);
309 fprintf(out, "int ");
310 for (i = 0; i < kernel->n_grid; ++i) {
311 if (i)
312 fprintf(out, ", ");
313 fprintf(out, "b%d = %s",
314 i, block_dims[kernel->n_grid - 1 - i]);
316 fprintf(out, ";\n");
319 if (kernel->n_block > 0) {
320 print_indent(out, 4);
321 fprintf(out, "int ");
322 for (i = 0; i < kernel->n_block; ++i) {
323 if (i)
324 fprintf(out, ", ");
325 fprintf(out, "t%d = %s",
326 i, thread_dims[kernel->n_block - 1 - i]);
328 fprintf(out, ";\n");
332 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
334 int j;
335 isl_int v;
337 print_indent(out, 4);
338 if (var->type == ppcg_access_shared)
339 fprintf(out, "__shared__ ");
340 fprintf(out, "%s %s", var->array->type, var->name);
341 isl_int_init(v);
342 for (j = 0; j < var->array->n_index; ++j) {
343 fprintf(out, "[");
344 isl_vec_get_element(var->size, j, &v);
345 isl_int_print(out, v, 0);
346 fprintf(out, "]");
348 isl_int_clear(v);
349 fprintf(out, ";\n");
352 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
354 int i;
356 for (i = 0; i < kernel->n_var; ++i)
357 print_kernel_var(out, &kernel->var[i]);
360 /* Print an access to the element in the private/shared memory copy
361 * described by "stmt". The index of the copy is recorded in
362 * stmt->local_index as a "call" to the array.
364 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
365 struct ppcg_kernel_stmt *stmt)
367 int i;
368 isl_ast_expr *expr;
369 struct gpu_array_info *array = stmt->u.c.array;
371 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 0);
372 p = isl_printer_print_ast_expr(p, expr);
373 isl_ast_expr_free(expr);
375 for (i = 0; i < array->n_index; ++i) {
376 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 1 + i);
378 p = isl_printer_print_str(p, "[");
379 p = isl_printer_print_ast_expr(p, expr);
380 p = isl_printer_print_str(p, "]");
382 isl_ast_expr_free(expr);
385 return p;
388 /* Print an access to the element in the global memory copy
389 * described by "stmt". The index of the copy is recorded in
390 * stmt->index as a "call" to the array.
392 * The copy in global memory has been linearized, so we need to take
393 * the array size into account.
395 static __isl_give isl_printer *stmt_print_global_index(
396 __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
398 int i;
399 struct gpu_array_info *array = stmt->u.c.array;
400 isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
402 if (gpu_array_is_scalar(array)) {
403 if (!array->read_only)
404 p = isl_printer_print_str(p, "*");
405 p = isl_printer_print_str(p, array->name);
406 return p;
409 p = isl_printer_print_str(p, array->name);
410 p = isl_printer_print_str(p, "[");
411 for (i = 0; i + 1 < array->n_index; ++i)
412 p = isl_printer_print_str(p, "(");
413 for (i = 0; i < array->n_index; ++i) {
414 isl_ast_expr *expr;
415 expr = isl_ast_expr_get_op_arg(stmt->u.c.index, 1 + i);
416 if (i) {
417 isl_pw_aff *bound_i;
418 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
419 p = isl_printer_print_str(p, ") * (");
420 p = isl_printer_print_pw_aff(p, bound_i);
421 p = isl_printer_print_str(p, ") + ");
422 isl_pw_aff_free(bound_i);
424 p = isl_printer_print_ast_expr(p, expr);
425 isl_ast_expr_free(expr);
427 p = isl_printer_print_str(p, "]");
429 return p;
432 /* Print a copy statement.
434 * A read copy statement is printed as
436 * local = global;
438 * while a write copy statement is printed as
440 * global = local;
442 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
443 struct ppcg_kernel_stmt *stmt)
445 p = isl_printer_start_line(p);
446 if (stmt->u.c.read) {
447 p = stmt_print_local_index(p, stmt);
448 p = isl_printer_print_str(p, " = ");
449 p = stmt_print_global_index(p, stmt);
450 } else {
451 p = stmt_print_global_index(p, stmt);
452 p = isl_printer_print_str(p, " = ");
453 p = stmt_print_local_index(p, stmt);
455 p = isl_printer_print_str(p, ";");
456 p = isl_printer_end_line(p);
458 return p;
461 /* Print a sync statement.
463 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
464 struct ppcg_kernel_stmt *stmt)
466 p = isl_printer_start_line(p);
467 p = isl_printer_print_str(p, "__syncthreads();");
468 p = isl_printer_end_line(p);
470 return p;
473 /* Print an access based on the information in "access".
474 * If this an access to global memory, then the index expression
475 * is linearized.
477 * If access->array is NULL, then we are
478 * accessing an iterator in the original program.
480 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
481 struct ppcg_kernel_access *access)
483 int i;
484 unsigned n_index;
485 struct gpu_array_info *array;
486 isl_pw_aff_list *bound;
488 array = access->array;
489 bound = array ? access->local_array->bound : NULL;
490 if (!array)
491 p = isl_printer_print_str(p, "(");
492 else {
493 if (access->type == ppcg_access_global &&
494 gpu_array_is_scalar(array) && !array->read_only)
495 p = isl_printer_print_str(p, "*");
496 p = isl_printer_print_str(p, access->local_name);
497 if (gpu_array_is_scalar(array))
498 return p;
499 p = isl_printer_print_str(p, "[");
502 n_index = isl_ast_expr_list_n_ast_expr(access->index);
503 if (access->type == ppcg_access_global)
504 for (i = 0; i + 1 < n_index; ++i)
505 p = isl_printer_print_str(p, "(");
507 for (i = 0; i < n_index; ++i) {
508 isl_ast_expr *index;
510 index = isl_ast_expr_list_get_ast_expr(access->index, i);
511 if (array && i) {
512 if (access->type == ppcg_access_global) {
513 isl_pw_aff *bound_i;
514 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
515 p = isl_printer_print_str(p, ") * (");
516 p = isl_printer_print_pw_aff(p, bound_i);
517 p = isl_printer_print_str(p, ") + ");
518 isl_pw_aff_free(bound_i);
519 } else
520 p = isl_printer_print_str(p, "][");
522 p = isl_printer_print_ast_expr(p, index);
523 isl_ast_expr_free(index);
525 if (!array)
526 p = isl_printer_print_str(p, ")");
527 else
528 p = isl_printer_print_str(p, "]");
530 return p;
533 struct cuda_access_print_info {
534 int i;
535 struct ppcg_kernel_stmt *stmt;
538 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
539 * with the pet printer. This means that whenever the pet printer prints a
540 * pet access expression we have the corresponding cuda access available and can
541 * print the modified access.
543 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
544 struct pet_expr *expr, void *usr)
546 struct cuda_access_print_info *info =
547 (struct cuda_access_print_info *) usr;
549 p = print_access(p, &info->stmt->u.d.access[info->i]);
550 info->i++;
552 return p;
555 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
556 struct ppcg_kernel_stmt *stmt)
558 struct cuda_access_print_info info;
560 info.i = 0;
561 info.stmt = stmt;
563 p = isl_printer_start_line(p);
564 p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
565 p = isl_printer_print_str(p, ";");
566 p = isl_printer_end_line(p);
568 return p;
571 /* This function is called for each user statement in the AST,
572 * i.e., for each kernel body statement, copy statement or sync statement.
574 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
575 __isl_keep isl_ast_node *node, void *user)
577 isl_id *id;
578 struct ppcg_kernel_stmt *stmt;
580 id = isl_ast_node_get_annotation(node);
581 stmt = isl_id_get_user(id);
582 isl_id_free(id);
584 switch (stmt->type) {
585 case ppcg_kernel_copy:
586 return print_copy(p, stmt);
587 case ppcg_kernel_sync:
588 return print_sync(p, stmt);
589 case ppcg_kernel_domain:
590 return print_stmt_body(p, stmt);
593 return p;
596 static int print_macro(enum isl_ast_op_type type, void *user)
598 isl_printer **p = user;
600 if (type == isl_ast_op_fdiv_q)
601 return 0;
603 *p = isl_ast_op_type_print_macro(type, *p);
605 return 0;
608 /* Print the required macros for "node", including one for floord.
609 * We always print a macro for floord as it may also appear in the statements.
611 static __isl_give isl_printer *print_macros(
612 __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
614 p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
615 if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
616 return isl_printer_free(p);
617 return p;
620 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
621 struct cuda_info *cuda)
623 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
624 isl_ast_print_options *print_options;
625 isl_printer *p;
627 print_kernel_headers(prog, kernel, cuda);
628 fprintf(cuda->kernel_c, "{\n");
629 print_kernel_iterators(cuda->kernel_c, kernel);
630 print_kernel_vars(cuda->kernel_c, kernel);
631 fprintf(cuda->kernel_c, "\n");
633 print_options = isl_ast_print_options_alloc(ctx);
634 print_options = isl_ast_print_options_set_print_user(print_options,
635 &print_kernel_stmt, NULL);
637 p = isl_printer_to_file(ctx, cuda->kernel_c);
638 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
639 p = isl_printer_indent(p, 4);
640 p = print_macros(kernel->tree, p);
641 p = isl_ast_node_print(kernel->tree, p, print_options);
642 isl_printer_free(p);
644 isl_ast_print_options_free(print_options);
646 fprintf(cuda->kernel_c, "}\n");
649 struct print_host_user_data {
650 struct cuda_info *cuda;
651 struct gpu_prog *prog;
654 /* Print the user statement of the host code to "p".
656 * In particular, print a block of statements that defines the grid
657 * and the block and then launches the kernel.
659 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
660 __isl_keep isl_ast_node *node, void *user)
662 isl_id *id;
663 struct ppcg_kernel *kernel;
664 struct print_host_user_data *data;
666 id = isl_ast_node_get_annotation(node);
667 kernel = isl_id_get_user(id);
668 isl_id_free(id);
670 data = (struct print_host_user_data *) user;
672 p = isl_printer_start_line(p);
673 p = isl_printer_print_str(p, "{");
674 p = isl_printer_end_line(p);
675 p = isl_printer_indent(p, 2);
677 p = isl_printer_start_line(p);
678 p = isl_printer_print_str(p, "dim3 k");
679 p = isl_printer_print_int(p, kernel->id);
680 p = isl_printer_print_str(p, "_dimBlock");
681 print_reverse_list(isl_printer_get_file(p),
682 kernel->n_block, kernel->block_dim);
683 p = isl_printer_print_str(p, ";");
684 p = isl_printer_end_line(p);
686 p = print_grid(p, kernel);
688 p = isl_printer_start_line(p);
689 p = isl_printer_print_str(p, "kernel");
690 p = isl_printer_print_int(p, kernel->id);
691 p = isl_printer_print_str(p, " <<<k");
692 p = isl_printer_print_int(p, kernel->id);
693 p = isl_printer_print_str(p, "_dimGrid, k");
694 p = isl_printer_print_int(p, kernel->id);
695 p = isl_printer_print_str(p, "_dimBlock>>> (");
696 p = print_kernel_arguments(p, data->prog, kernel, 0);
697 p = isl_printer_print_str(p, ");");
698 p = isl_printer_end_line(p);
700 p = isl_printer_start_line(p);
701 p = isl_printer_print_str(p, "cudaCheckKernel();");
702 p = isl_printer_end_line(p);
704 p = isl_printer_indent(p, -2);
705 p = isl_printer_start_line(p);
706 p = isl_printer_print_str(p, "}");
707 p = isl_printer_end_line(p);
709 p = isl_printer_start_line(p);
710 p = isl_printer_end_line(p);
712 print_kernel(data->prog, kernel, data->cuda);
714 return p;
717 static void print_host_code(FILE *out, struct gpu_prog *prog,
718 __isl_keep isl_ast_node *tree, struct cuda_info *cuda)
720 isl_ast_print_options *print_options;
721 isl_printer *p;
722 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
723 struct print_host_user_data data = { cuda, prog };
725 print_options = isl_ast_print_options_alloc(ctx);
726 print_options = isl_ast_print_options_set_print_user(print_options,
727 &print_host_user, &data);
729 p = isl_printer_to_file(ctx, out);
730 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
731 p = print_macros(tree, p);
732 p = isl_ast_node_print(tree, p, print_options);
733 isl_printer_free(p);
735 isl_ast_print_options_free(print_options);
738 static void copy_arrays_from_device(FILE *out, struct gpu_prog *prog)
740 int i;
741 isl_union_set *write;
742 write = isl_union_map_range(isl_union_map_copy(prog->write));
744 for (i = 0; i < prog->n_array; ++i) {
745 isl_space *dim;
746 isl_set *write_i;
747 int empty;
749 dim = isl_space_copy(prog->array[i].dim);
750 write_i = isl_union_set_extract_set(write, dim);
751 empty = isl_set_fast_is_empty(write_i);
752 isl_set_free(write_i);
753 if (empty)
754 continue;
756 fprintf(out, "cudaCheckReturn(cudaMemcpy(");
757 if (gpu_array_is_scalar(&prog->array[i]))
758 fprintf(out, "&%s, ", prog->array[i].name);
759 else
760 fprintf(out, "%s, ", prog->array[i].name);
761 fprintf(out, "dev_%s, ", prog->array[i].name);
762 print_array_size(prog->ctx, out, &prog->array[i]);
763 fprintf(out, ", cudaMemcpyDeviceToHost));\n");
766 isl_union_set_free(write);
767 fprintf(out, "\n");
770 static void free_device_arrays(FILE *out, struct gpu_prog *prog)
772 int i;
774 for (i = 0; i < prog->n_array; ++i) {
775 if (gpu_array_is_read_only_scalar(&prog->array[i]))
776 continue;
777 fprintf(out, "cudaCheckReturn(cudaFree(dev_%s));\n",
778 prog->array[i].name);
782 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
783 struct ppcg_options *options, const char *input)
785 struct cuda_info cuda;
786 struct gpu_prog *prog;
787 isl_ast_node *tree;
789 if (!scop)
790 return -1;
792 scop->context = add_context_from_str(scop->context, options->ctx);
794 prog = gpu_prog_alloc(ctx, scop);
796 tree = generate_gpu(ctx, prog, options);
798 cuda_open_files(&cuda, input);
800 fprintf(cuda.host_c, "{\n");
802 print_cuda_macros(cuda.host_c);
804 declare_device_arrays(cuda.host_c, prog);
805 allocate_device_arrays(cuda.host_c, prog);
806 copy_arrays_to_device(cuda.host_c, prog);
808 print_host_code(cuda.host_c, prog, tree, &cuda);
809 isl_ast_node_free(tree);
811 copy_arrays_from_device(cuda.host_c, prog);
812 free_device_arrays(cuda.host_c, prog);
814 fprintf(cuda.host_c, "}\n");
816 cuda_close_files(&cuda);
818 gpu_prog_free(prog);
820 return 0;