separate out CUDA printing
[ppcg.git] / cuda.c
bloba983d655ad9870956cab27ecd4f36c196ae4da57
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "pet_printer.h"
17 #include "schedule.h"
19 void print_cuda_macros(FILE *file)
21 const char *macros =
22 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
23 "#define cudaCheckKernel()"
24 " assert(cudaGetLastError() == cudaSuccess)\n\n";
25 fputs(macros, file);
28 static void print_array_size(isl_ctx *ctx, FILE *out,
29 struct gpu_array_info *array)
31 int i;
32 isl_printer *prn;
34 prn = isl_printer_to_file(ctx, out);
35 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
36 for (i = 0; i < array->n_index; ++i) {
37 prn = isl_printer_print_str(prn, "(");
38 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
39 prn = isl_printer_print_str(prn, ") * ");
41 prn = isl_printer_print_str(prn, "sizeof(");
42 prn = isl_printer_print_str(prn, array->type);
43 prn = isl_printer_print_str(prn, ")");
44 isl_printer_free(prn);
47 static void declare_device_arrays(FILE *out, struct gpu_prog *prog)
49 int i;
51 for (i = 0; i < prog->n_array; ++i) {
52 if (gpu_array_is_read_only_scalar(&prog->array[i]))
53 continue;
54 fprintf(out, "%s *dev_%s;\n",
55 prog->array[i].type, prog->array[i].name);
57 fprintf(out, "\n");
60 static void allocate_device_arrays(FILE *out, struct gpu_prog *prog)
62 int i;
64 for (i = 0; i < prog->n_array; ++i) {
65 if (gpu_array_is_read_only_scalar(&prog->array[i]))
66 continue;
67 fprintf(out,
68 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
69 prog->array[i].name);
70 print_array_size(prog->ctx, out, &prog->array[i]);
71 fprintf(out, "));\n");
73 fprintf(out, "\n");
76 static void copy_arrays_to_device(FILE *out, struct gpu_prog *prog)
78 int i;
80 for (i = 0; i < prog->n_array; ++i) {
81 isl_space *dim;
82 isl_set *read_i;
83 int empty;
85 if (gpu_array_is_read_only_scalar(&prog->array[i]))
86 continue;
88 dim = isl_space_copy(prog->array[i].dim);
89 read_i = isl_union_set_extract_set(prog->copy_in, dim);
90 empty = isl_set_fast_is_empty(read_i);
91 isl_set_free(read_i);
92 if (empty)
93 continue;
95 fprintf(out, "cudaCheckReturn(cudaMemcpy(dev_%s,",
96 prog->array[i].name);
98 if (gpu_array_is_scalar(&prog->array[i]))
99 fprintf(out, " &%s, ", prog->array[i].name);
100 else
101 fprintf(out, " %s, ", prog->array[i].name);
103 print_array_size(prog->ctx, out, &prog->array[i]);
104 fprintf(out, ", cudaMemcpyHostToDevice));\n");
106 fprintf(out, "\n");
109 static void print_reverse_list(FILE *out, int len, int *list)
111 int i;
113 if (len == 0)
114 return;
116 fprintf(out, "(");
117 for (i = 0; i < len; ++i) {
118 if (i)
119 fprintf(out, ", ");
120 fprintf(out, "%d", list[len - 1 - i]);
122 fprintf(out, ")");
125 /* Print the effective grid size as a list of the sizes in each
126 * dimension, from innermost to outermost.
128 * The grid size specified by the user or set by default
129 * in read_grid_sizes() and applied in tile_schedule(),
130 * may be too large for the given code in the sense that
131 * it may contain blocks that don't need to execute anything.
132 * We therefore don't print this grid size, but instead the
133 * smallest grid size that ensures that all blocks that actually
134 * execute code are included in the grid.
136 * For each block dimension, we compute the maximal value of the block id
137 * and add one.
139 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
140 struct ppcg_kernel *kernel)
142 int i;
143 int dim;
145 dim = isl_set_dim(kernel->grid, isl_dim_set);
146 if (dim == 0)
147 return p;
149 p = isl_printer_print_str(p, "(");
150 for (i = dim - 1; i >= 0; --i) {
151 isl_space *space;
152 isl_aff *one;
153 isl_pw_aff *bound ;
155 bound = isl_set_dim_max(isl_set_copy(kernel->grid), i);
156 bound = isl_pw_aff_coalesce(bound);
157 bound = isl_pw_aff_gist(bound, isl_set_copy(kernel->context));
159 space = isl_pw_aff_get_domain_space(bound);
160 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
161 one = isl_aff_add_constant_si(one, 1);
162 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
163 p = isl_printer_print_pw_aff(p, bound);
164 isl_pw_aff_free(bound);
166 if (i > 0)
167 p = isl_printer_print_str(p, ", ");
170 p = isl_printer_print_str(p, ")");
172 return p;
175 /* Print the grid definition.
177 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
178 struct ppcg_kernel *kernel)
180 p = isl_printer_start_line(p);
181 p = isl_printer_print_str(p, "dim3 k");
182 p = isl_printer_print_int(p, kernel->id);
183 p = isl_printer_print_str(p, "_dimGrid");
184 p = print_grid_size(p, kernel);
185 p = isl_printer_print_str(p, ";");
186 p = isl_printer_end_line(p);
188 return p;
191 /* Print the arguments to a kernel declaration or call. If "types" is set,
192 * then print a declaration (including the types of the arguments).
194 * The arguments are printed in the following order
195 * - the arrays accessed by the kernel
196 * - the parameters
197 * - the host loop iterators
199 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
200 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
202 int i, n;
203 int first = 1;
204 unsigned nparam;
205 isl_space *space;
206 const char *type;
208 for (i = 0; i < prog->n_array; ++i) {
209 isl_set *arr;
210 int empty;
212 space = isl_space_copy(prog->array[i].dim);
213 arr = isl_union_set_extract_set(kernel->arrays, space);
214 empty = isl_set_fast_is_empty(arr);
215 isl_set_free(arr);
216 if (empty)
217 continue;
219 if (!first)
220 p = isl_printer_print_str(p, ", ");
222 if (types) {
223 p = isl_printer_print_str(p, prog->array[i].type);
224 p = isl_printer_print_str(p, " ");
227 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
228 p = isl_printer_print_str(p, prog->array[i].name);
229 } else {
230 if (types)
231 p = isl_printer_print_str(p, "*");
232 else
233 p = isl_printer_print_str(p, "dev_");
234 p = isl_printer_print_str(p, prog->array[i].name);
237 first = 0;
240 space = isl_union_set_get_space(kernel->arrays);
241 nparam = isl_space_dim(space, isl_dim_param);
242 for (i = 0; i < nparam; ++i) {
243 const char *name;
245 name = isl_space_get_dim_name(space, isl_dim_param, i);
247 if (!first)
248 p = isl_printer_print_str(p, ", ");
249 if (types)
250 p = isl_printer_print_str(p, "int ");
251 p = isl_printer_print_str(p, name);
253 first = 0;
255 isl_space_free(space);
257 n = isl_space_dim(kernel->space, isl_dim_set);
258 type = isl_options_get_ast_iterator_type(prog->ctx);
259 for (i = 0; i < n; ++i) {
260 const char *name;
261 isl_id *id;
263 if (!first)
264 p = isl_printer_print_str(p, ", ");
265 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
266 if (types) {
267 p = isl_printer_print_str(p, type);
268 p = isl_printer_print_str(p, " ");
270 p = isl_printer_print_str(p, name);
272 first = 0;
275 return p;
278 /* Print the header of the given kernel.
280 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
281 struct gpu_prog *prog, struct ppcg_kernel *kernel)
283 p = isl_printer_start_line(p);
284 p = isl_printer_print_str(p, "__global__ void kernel");
285 p = isl_printer_print_int(p, kernel->id);
286 p = isl_printer_print_str(p, "(");
287 p = print_kernel_arguments(p, prog, kernel, 1);
288 p = isl_printer_print_str(p, ")");
290 return p;
293 /* Print the header of the given kernel to both gen->cuda.kernel_h
294 * and gen->cuda.kernel_c.
296 static void print_kernel_headers(struct gpu_prog *prog,
297 struct ppcg_kernel *kernel, struct cuda_info *cuda)
299 isl_printer *p;
301 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
302 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
303 p = print_kernel_header(p, prog, kernel);
304 p = isl_printer_print_str(p, ";");
305 p = isl_printer_end_line(p);
306 isl_printer_free(p);
308 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
309 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
310 p = print_kernel_header(p, prog, kernel);
311 p = isl_printer_end_line(p);
312 isl_printer_free(p);
315 static void print_indent(FILE *dst, int indent)
317 fprintf(dst, "%*s", indent, "");
320 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
322 int i;
323 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
324 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
325 "threadIdx.z" };
327 if (kernel->n_grid > 0) {
328 print_indent(out, 4);
329 fprintf(out, "int ");
330 for (i = 0; i < kernel->n_grid; ++i) {
331 if (i)
332 fprintf(out, ", ");
333 fprintf(out, "b%d = %s",
334 i, block_dims[kernel->n_grid - 1 - i]);
336 fprintf(out, ";\n");
339 if (kernel->n_block > 0) {
340 print_indent(out, 4);
341 fprintf(out, "int ");
342 for (i = 0; i < kernel->n_block; ++i) {
343 if (i)
344 fprintf(out, ", ");
345 fprintf(out, "t%d = %s",
346 i, thread_dims[kernel->n_block - 1 - i]);
348 fprintf(out, ";\n");
352 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
354 int j;
355 isl_int v;
357 print_indent(out, 4);
358 if (var->type == ppcg_access_shared)
359 fprintf(out, "__shared__ ");
360 fprintf(out, "%s %s", var->array->type, var->name);
361 isl_int_init(v);
362 for (j = 0; j < var->array->n_index; ++j) {
363 fprintf(out, "[");
364 isl_vec_get_element(var->size, j, &v);
365 isl_int_print(out, v, 0);
366 fprintf(out, "]");
368 isl_int_clear(v);
369 fprintf(out, ";\n");
372 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
374 int i;
376 for (i = 0; i < kernel->n_var; ++i)
377 print_kernel_var(out, &kernel->var[i]);
380 /* Print an access to the element in the private/shared memory copy
381 * described by "stmt". The index of the copy is recorded in
382 * stmt->local_index.
384 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
385 struct ppcg_kernel_stmt *stmt)
387 int i;
388 const char *name;
389 struct gpu_array_info *array = stmt->u.c.array;
391 name = isl_pw_multi_aff_get_tuple_name(stmt->u.c.local_index,
392 isl_dim_out);
393 p = isl_printer_print_str(p, name);
394 for (i = 0; i < array->n_index; ++i) {
395 isl_pw_aff *pa;
396 pa = isl_pw_multi_aff_get_pw_aff(stmt->u.c.local_index, i);
398 p = isl_printer_print_str(p, "[");
399 p = isl_printer_print_pw_aff(p, pa);
400 p = isl_printer_print_str(p, "]");
402 isl_pw_aff_free(pa);
405 return p;
408 /* Print an access to the element in the global memory copy
409 * described by "stmt". The index of the copy is recorded in
410 * stmt->index.
412 * The copy in global memory has been linearized, so we need to take
413 * the array size into account.
415 static __isl_give isl_printer *stmt_print_global_index(
416 __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
418 int i;
419 struct gpu_array_info *array = stmt->u.c.array;
420 isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
422 if (gpu_array_is_scalar(array)) {
423 if (!array->read_only)
424 p = isl_printer_print_str(p, "*");
425 p = isl_printer_print_str(p, array->name);
426 return p;
429 p = isl_printer_print_str(p, array->name);
430 p = isl_printer_print_str(p, "[");
431 for (i = 0; i + 1 < array->n_index; ++i)
432 p = isl_printer_print_str(p, "(");
433 for (i = 0; i < array->n_index; ++i) {
434 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(stmt->u.c.index, i);
435 pa = isl_pw_aff_coalesce(pa);
436 pa = isl_pw_aff_gist_params(pa, isl_set_copy(stmt->u.c.domain));
437 if (i) {
438 isl_pw_aff *bound_i;
439 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
440 p = isl_printer_print_str(p, ") * (");
441 p = isl_printer_print_pw_aff(p, bound_i);
442 p = isl_printer_print_str(p, ") + ");
443 isl_pw_aff_free(bound_i);
445 p = isl_printer_print_pw_aff(p, pa);
446 isl_pw_aff_free(pa);
448 p = isl_printer_print_str(p, "]");
450 return p;
453 /* Print a copy statement.
455 * A read copy statement is printed as
457 * local = global;
459 * while a write copy statement is printed as
461 * global = local;
463 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
464 struct ppcg_kernel_stmt *stmt)
466 p = isl_printer_start_line(p);
467 if (stmt->u.c.read) {
468 p = stmt_print_local_index(p, stmt);
469 p = isl_printer_print_str(p, " = ");
470 p = stmt_print_global_index(p, stmt);
471 } else {
472 p = stmt_print_global_index(p, stmt);
473 p = isl_printer_print_str(p, " = ");
474 p = stmt_print_local_index(p, stmt);
476 p = isl_printer_print_str(p, ";");
477 p = isl_printer_end_line(p);
479 return p;
482 /* Print a sync statement.
484 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
485 struct ppcg_kernel_stmt *stmt)
487 p = isl_printer_start_line(p);
488 p = isl_printer_print_str(p, "__syncthreads();");
489 p = isl_printer_end_line(p);
491 return p;
494 /* Print an access based on the information in "access".
495 * If this an access to global memory, then the index expression
496 * is linearized.
498 * If access->array is NULL, then we are
499 * accessing an iterator in the original program.
501 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
502 struct ppcg_kernel_access *access)
504 int i;
505 unsigned n_index;
506 struct gpu_array_info *array;
507 isl_pw_aff_list *bound;
509 array = access->array;
510 bound = array ? access->local_array->bound : NULL;
511 if (!array)
512 p = isl_printer_print_str(p, "(");
513 else {
514 if (access->type == ppcg_access_global &&
515 gpu_array_is_scalar(array) && !array->read_only)
516 p = isl_printer_print_str(p, "*");
517 p = isl_printer_print_str(p, access->local_name);
518 if (gpu_array_is_scalar(array))
519 return p;
520 p = isl_printer_print_str(p, "[");
523 n_index = isl_ast_expr_list_n_ast_expr(access->index);
524 if (access->type == ppcg_access_global)
525 for (i = 0; i + 1 < n_index; ++i)
526 p = isl_printer_print_str(p, "(");
528 for (i = 0; i < n_index; ++i) {
529 isl_ast_expr *index;
531 index = isl_ast_expr_list_get_ast_expr(access->index, i);
532 if (array && i) {
533 if (access->type == ppcg_access_global) {
534 isl_pw_aff *bound_i;
535 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
536 p = isl_printer_print_str(p, ") * (");
537 p = isl_printer_print_pw_aff(p, bound_i);
538 p = isl_printer_print_str(p, ") + ");
539 isl_pw_aff_free(bound_i);
540 } else
541 p = isl_printer_print_str(p, "][");
543 p = isl_printer_print_ast_expr(p, index);
544 isl_ast_expr_free(index);
546 if (!array)
547 p = isl_printer_print_str(p, ")");
548 else
549 p = isl_printer_print_str(p, "]");
551 return p;
554 struct cuda_access_print_info {
555 int i;
556 struct ppcg_kernel_stmt *stmt;
559 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
560 * with the pet printer. This means that whenever the pet printer prints a
561 * pet access expression we have the corresponding cuda access available and can
562 * print the modified access.
564 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
565 struct pet_expr *expr, void *usr)
567 struct cuda_access_print_info *info =
568 (struct cuda_access_print_info *) usr;
570 p = print_access(p, &info->stmt->u.d.access[info->i]);
571 info->i++;
573 return p;
576 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
577 struct ppcg_kernel_stmt *stmt)
579 struct cuda_access_print_info info;
581 info.i = 0;
582 info.stmt = stmt;
584 p = isl_printer_start_line(p);
585 p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
586 p = isl_printer_print_str(p, ";");
587 p = isl_printer_end_line(p);
589 return p;
592 /* This function is called for each user statement in the AST,
593 * i.e., for each kernel body statement, copy statement or sync statement.
595 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
596 __isl_keep isl_ast_node *node, void *user)
598 isl_id *id;
599 struct ppcg_kernel_stmt *stmt;
601 id = isl_ast_node_get_annotation(node);
602 stmt = isl_id_get_user(id);
603 isl_id_free(id);
605 switch (stmt->type) {
606 case ppcg_kernel_copy:
607 return print_copy(p, stmt);
608 case ppcg_kernel_sync:
609 return print_sync(p, stmt);
610 case ppcg_kernel_domain:
611 return print_stmt_body(p, stmt);
614 return p;
617 static int print_macro(enum isl_ast_op_type type, void *user)
619 isl_printer **p = user;
621 if (type == isl_ast_op_fdiv_q)
622 return 0;
624 *p = isl_ast_op_type_print_macro(type, *p);
626 return 0;
629 /* Print the required macros for "node", including one for floord.
630 * We always print a macro for floord as it may also appear in the statements.
632 static __isl_give isl_printer *print_macros(
633 __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
635 p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
636 if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
637 return isl_printer_free(p);
638 return p;
641 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
642 struct cuda_info *cuda)
644 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
645 isl_ast_print_options *print_options;
646 isl_printer *p;
648 print_kernel_headers(prog, kernel, cuda);
649 fprintf(cuda->kernel_c, "{\n");
650 print_kernel_iterators(cuda->kernel_c, kernel);
651 print_kernel_vars(cuda->kernel_c, kernel);
652 fprintf(cuda->kernel_c, "\n");
654 print_options = isl_ast_print_options_alloc(ctx);
655 print_options = isl_ast_print_options_set_print_user(print_options,
656 &print_kernel_stmt, NULL);
658 p = isl_printer_to_file(ctx, cuda->kernel_c);
659 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
660 p = isl_printer_indent(p, 4);
661 p = print_macros(kernel->tree, p);
662 p = isl_ast_node_print(kernel->tree, p, print_options);
663 isl_printer_free(p);
665 isl_ast_print_options_free(print_options);
667 fprintf(cuda->kernel_c, "}\n");
670 struct print_host_user_data {
671 struct cuda_info *cuda;
672 struct gpu_prog *prog;
675 /* Print the user statement of the host code to "p".
677 * In particular, print a block of statements that defines the grid
678 * and the block and then launches the kernel.
680 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
681 __isl_keep isl_ast_node *node, void *user)
683 isl_id *id;
684 struct ppcg_kernel *kernel;
685 struct print_host_user_data *data;
687 id = isl_ast_node_get_annotation(node);
688 kernel = isl_id_get_user(id);
689 isl_id_free(id);
691 data = (struct print_host_user_data *) user;
693 p = isl_printer_start_line(p);
694 p = isl_printer_print_str(p, "{");
695 p = isl_printer_end_line(p);
696 p = isl_printer_indent(p, 2);
698 p = isl_printer_start_line(p);
699 p = isl_printer_print_str(p, "dim3 k");
700 p = isl_printer_print_int(p, kernel->id);
701 p = isl_printer_print_str(p, "_dimBlock");
702 print_reverse_list(isl_printer_get_file(p),
703 kernel->n_block, kernel->block_dim);
704 p = isl_printer_print_str(p, ";");
705 p = isl_printer_end_line(p);
707 p = print_grid(p, kernel);
709 p = isl_printer_start_line(p);
710 p = isl_printer_print_str(p, "kernel");
711 p = isl_printer_print_int(p, kernel->id);
712 p = isl_printer_print_str(p, " <<<k");
713 p = isl_printer_print_int(p, kernel->id);
714 p = isl_printer_print_str(p, "_dimGrid, k");
715 p = isl_printer_print_int(p, kernel->id);
716 p = isl_printer_print_str(p, "_dimBlock>>> (");
717 p = print_kernel_arguments(p, data->prog, kernel, 0);
718 p = isl_printer_print_str(p, ");");
719 p = isl_printer_end_line(p);
721 p = isl_printer_start_line(p);
722 p = isl_printer_print_str(p, "cudaCheckKernel();");
723 p = isl_printer_end_line(p);
725 p = isl_printer_indent(p, -2);
726 p = isl_printer_start_line(p);
727 p = isl_printer_print_str(p, "}");
728 p = isl_printer_end_line(p);
730 p = isl_printer_start_line(p);
731 p = isl_printer_end_line(p);
733 print_kernel(data->prog, kernel, data->cuda);
735 return p;
738 static void print_host_code(FILE *out, struct gpu_prog *prog,
739 __isl_keep isl_ast_node *tree, struct cuda_info *cuda)
741 isl_ast_print_options *print_options;
742 isl_printer *p;
743 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
744 struct print_host_user_data data = { cuda, prog };
746 print_options = isl_ast_print_options_alloc(ctx);
747 print_options = isl_ast_print_options_set_print_user(print_options,
748 &print_host_user, &data);
750 p = isl_printer_to_file(ctx, out);
751 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
752 p = print_macros(tree, p);
753 p = isl_ast_node_print(tree, p, print_options);
754 isl_printer_free(p);
756 isl_ast_print_options_free(print_options);
759 static void copy_arrays_from_device(FILE *out, struct gpu_prog *prog)
761 int i;
762 isl_union_set *write;
763 write = isl_union_map_range(isl_union_map_copy(prog->write));
765 for (i = 0; i < prog->n_array; ++i) {
766 isl_space *dim;
767 isl_set *write_i;
768 int empty;
770 dim = isl_space_copy(prog->array[i].dim);
771 write_i = isl_union_set_extract_set(write, dim);
772 empty = isl_set_fast_is_empty(write_i);
773 isl_set_free(write_i);
774 if (empty)
775 continue;
777 fprintf(out, "cudaCheckReturn(cudaMemcpy(");
778 if (gpu_array_is_scalar(&prog->array[i]))
779 fprintf(out, "&%s, ", prog->array[i].name);
780 else
781 fprintf(out, "%s, ", prog->array[i].name);
782 fprintf(out, "dev_%s, ", prog->array[i].name);
783 print_array_size(prog->ctx, out, &prog->array[i]);
784 fprintf(out, ", cudaMemcpyDeviceToHost));\n");
787 isl_union_set_free(write);
788 fprintf(out, "\n");
791 static void free_device_arrays(FILE *out, struct gpu_prog *prog)
793 int i;
795 for (i = 0; i < prog->n_array; ++i) {
796 if (gpu_array_is_read_only_scalar(&prog->array[i]))
797 continue;
798 fprintf(out, "cudaCheckReturn(cudaFree(dev_%s));\n",
799 prog->array[i].name);
803 int generate_cuda(isl_ctx *ctx, struct pet_scop *scop,
804 struct ppcg_options *options, const char *input)
806 struct cuda_info cuda;
807 struct gpu_prog *prog;
808 isl_ast_node *tree;
810 if (!scop)
811 return -1;
813 scop->context = add_context_from_str(scop->context, options->ctx);
815 prog = gpu_prog_alloc(ctx, scop);
817 tree = generate_gpu(ctx, prog, options);
819 cuda_open_files(&cuda, input);
821 fprintf(cuda.host_c, "{\n");
823 print_cuda_macros(cuda.host_c);
825 declare_device_arrays(cuda.host_c, prog);
826 allocate_device_arrays(cuda.host_c, prog);
827 copy_arrays_to_device(cuda.host_c, prog);
829 print_host_code(cuda.host_c, prog, tree, &cuda);
830 isl_ast_node_free(tree);
832 copy_arrays_from_device(cuda.host_c, prog);
833 free_device_arrays(cuda.host_c, prog);
835 fprintf(cuda.host_c, "}\n");
837 cuda_close_files(&cuda);
839 gpu_prog_free(prog);
841 return 0;