Print descriptive error message on CUDA error
[ppcg.git] / cuda.c
blobdb19e4e504fc7206f1d006da7b93df2c98f27319
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "pet_printer.h"
17 #include "print.h"
18 #include "schedule.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t e = (ret); \\\n"
26 " if (e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaError_t e = cudaGetLastError(); \\\n"
36 " cudaCheckReturn(e); \\\n"
37 " } while(0)\n\n";
39 p = isl_printer_print_str(p, macros);
40 return p;
43 static __isl_give isl_printer *print_array_size(__isl_take isl_printer *prn,
44 struct gpu_array_info *array)
46 int i;
48 for (i = 0; i < array->n_index; ++i) {
49 prn = isl_printer_print_str(prn, "(");
50 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
51 prn = isl_printer_print_str(prn, ") * ");
53 prn = isl_printer_print_str(prn, "sizeof(");
54 prn = isl_printer_print_str(prn, array->type);
55 prn = isl_printer_print_str(prn, ")");
57 return prn;
60 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
61 struct gpu_prog *prog)
63 int i;
65 for (i = 0; i < prog->n_array; ++i) {
66 if (gpu_array_is_read_only_scalar(&prog->array[i]))
67 continue;
68 p = isl_printer_start_line(p);
69 p = isl_printer_print_str(p, prog->array[i].type);
70 p = isl_printer_print_str(p, " *dev_");
71 p = isl_printer_print_str(p, prog->array[i].name);
72 p = isl_printer_print_str(p, ";");
73 p = isl_printer_end_line(p);
75 p = isl_printer_start_line(p);
76 p = isl_printer_end_line(p);
77 return p;
80 static __isl_give isl_printer *allocate_device_arrays(
81 __isl_take isl_printer *p, struct gpu_prog *prog)
83 int i;
85 for (i = 0; i < prog->n_array; ++i) {
86 if (gpu_array_is_read_only_scalar(&prog->array[i]))
87 continue;
88 p = isl_printer_start_line(p);
89 p = isl_printer_print_str(p,
90 "cudaCheckReturn(cudaMalloc((void **) &dev_");
91 p = isl_printer_print_str(p, prog->array[i].name);
92 p = isl_printer_print_str(p, ", ");
93 p = print_array_size(p, &prog->array[i]);
94 p = isl_printer_print_str(p, "));");
95 p = isl_printer_end_line(p);
97 p = isl_printer_start_line(p);
98 p = isl_printer_end_line(p);
99 return p;
102 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
103 struct gpu_prog *prog)
105 int i;
107 for (i = 0; i < prog->n_array; ++i) {
108 isl_space *dim;
109 isl_set *read_i;
110 int empty;
112 if (gpu_array_is_read_only_scalar(&prog->array[i]))
113 continue;
115 dim = isl_space_copy(prog->array[i].dim);
116 read_i = isl_union_set_extract_set(prog->copy_in, dim);
117 empty = isl_set_fast_is_empty(read_i);
118 isl_set_free(read_i);
119 if (empty)
120 continue;
122 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
123 p = isl_printer_print_str(p, prog->array[i].name);
124 p = isl_printer_print_str(p, ", ");
126 if (gpu_array_is_scalar(&prog->array[i]))
127 p = isl_printer_print_str(p, "&");
128 p = isl_printer_print_str(p, prog->array[i].name);
129 p = isl_printer_print_str(p, ", ");
131 p = print_array_size(p, &prog->array[i]);
132 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
133 p = isl_printer_end_line(p);
135 p = isl_printer_start_line(p);
136 p = isl_printer_end_line(p);
137 return p;
140 static void print_reverse_list(FILE *out, int len, int *list)
142 int i;
144 if (len == 0)
145 return;
147 fprintf(out, "(");
148 for (i = 0; i < len; ++i) {
149 if (i)
150 fprintf(out, ", ");
151 fprintf(out, "%d", list[len - 1 - i]);
153 fprintf(out, ")");
156 /* Print the effective grid size as a list of the sizes in each
157 * dimension, from innermost to outermost.
159 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
160 struct ppcg_kernel *kernel)
162 int i;
163 int dim;
165 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
166 if (dim == 0)
167 return p;
169 p = isl_printer_print_str(p, "(");
170 for (i = dim - 1; i >= 0; --i) {
171 isl_pw_aff *bound;
173 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
174 p = isl_printer_print_pw_aff(p, bound);
175 isl_pw_aff_free(bound);
177 if (i > 0)
178 p = isl_printer_print_str(p, ", ");
181 p = isl_printer_print_str(p, ")");
183 return p;
186 /* Print the grid definition.
188 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
189 struct ppcg_kernel *kernel)
191 p = isl_printer_start_line(p);
192 p = isl_printer_print_str(p, "dim3 k");
193 p = isl_printer_print_int(p, kernel->id);
194 p = isl_printer_print_str(p, "_dimGrid");
195 p = print_grid_size(p, kernel);
196 p = isl_printer_print_str(p, ";");
197 p = isl_printer_end_line(p);
199 return p;
202 /* Print the arguments to a kernel declaration or call. If "types" is set,
203 * then print a declaration (including the types of the arguments).
205 * The arguments are printed in the following order
206 * - the arrays accessed by the kernel
207 * - the parameters
208 * - the host loop iterators
210 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
211 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
213 int i, n;
214 int first = 1;
215 unsigned nparam;
216 isl_space *space;
217 const char *type;
219 for (i = 0; i < prog->n_array; ++i) {
220 isl_set *arr;
221 int empty;
223 space = isl_space_copy(prog->array[i].dim);
224 arr = isl_union_set_extract_set(kernel->arrays, space);
225 empty = isl_set_fast_is_empty(arr);
226 isl_set_free(arr);
227 if (empty)
228 continue;
230 if (!first)
231 p = isl_printer_print_str(p, ", ");
233 if (types) {
234 p = isl_printer_print_str(p, prog->array[i].type);
235 p = isl_printer_print_str(p, " ");
238 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
239 p = isl_printer_print_str(p, prog->array[i].name);
240 } else {
241 if (types)
242 p = isl_printer_print_str(p, "*");
243 else
244 p = isl_printer_print_str(p, "dev_");
245 p = isl_printer_print_str(p, prog->array[i].name);
248 first = 0;
251 space = isl_union_set_get_space(kernel->arrays);
252 nparam = isl_space_dim(space, isl_dim_param);
253 for (i = 0; i < nparam; ++i) {
254 const char *name;
256 name = isl_space_get_dim_name(space, isl_dim_param, i);
258 if (!first)
259 p = isl_printer_print_str(p, ", ");
260 if (types)
261 p = isl_printer_print_str(p, "int ");
262 p = isl_printer_print_str(p, name);
264 first = 0;
266 isl_space_free(space);
268 n = isl_space_dim(kernel->space, isl_dim_set);
269 type = isl_options_get_ast_iterator_type(prog->ctx);
270 for (i = 0; i < n; ++i) {
271 const char *name;
272 isl_id *id;
274 if (!first)
275 p = isl_printer_print_str(p, ", ");
276 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
277 if (types) {
278 p = isl_printer_print_str(p, type);
279 p = isl_printer_print_str(p, " ");
281 p = isl_printer_print_str(p, name);
283 first = 0;
286 return p;
289 /* Print the header of the given kernel.
291 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
292 struct gpu_prog *prog, struct ppcg_kernel *kernel)
294 p = isl_printer_start_line(p);
295 p = isl_printer_print_str(p, "__global__ void kernel");
296 p = isl_printer_print_int(p, kernel->id);
297 p = isl_printer_print_str(p, "(");
298 p = print_kernel_arguments(p, prog, kernel, 1);
299 p = isl_printer_print_str(p, ")");
301 return p;
304 /* Print the header of the given kernel to both gen->cuda.kernel_h
305 * and gen->cuda.kernel_c.
307 static void print_kernel_headers(struct gpu_prog *prog,
308 struct ppcg_kernel *kernel, struct cuda_info *cuda)
310 isl_printer *p;
312 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
313 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
314 p = print_kernel_header(p, prog, kernel);
315 p = isl_printer_print_str(p, ";");
316 p = isl_printer_end_line(p);
317 isl_printer_free(p);
319 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
320 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
321 p = print_kernel_header(p, prog, kernel);
322 p = isl_printer_end_line(p);
323 isl_printer_free(p);
326 static void print_indent(FILE *dst, int indent)
328 fprintf(dst, "%*s", indent, "");
331 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
333 int i;
334 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
335 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
336 "threadIdx.z" };
338 if (kernel->n_grid > 0) {
339 print_indent(out, 4);
340 fprintf(out, "int ");
341 for (i = 0; i < kernel->n_grid; ++i) {
342 if (i)
343 fprintf(out, ", ");
344 fprintf(out, "b%d = %s",
345 i, block_dims[kernel->n_grid - 1 - i]);
347 fprintf(out, ";\n");
350 if (kernel->n_block > 0) {
351 print_indent(out, 4);
352 fprintf(out, "int ");
353 for (i = 0; i < kernel->n_block; ++i) {
354 if (i)
355 fprintf(out, ", ");
356 fprintf(out, "t%d = %s",
357 i, thread_dims[kernel->n_block - 1 - i]);
359 fprintf(out, ";\n");
363 static void print_kernel_var(FILE *out, struct ppcg_kernel_var *var)
365 int j;
366 isl_int v;
368 print_indent(out, 4);
369 if (var->type == ppcg_access_shared)
370 fprintf(out, "__shared__ ");
371 fprintf(out, "%s %s", var->array->type, var->name);
372 isl_int_init(v);
373 for (j = 0; j < var->array->n_index; ++j) {
374 fprintf(out, "[");
375 isl_vec_get_element(var->size, j, &v);
376 isl_int_print(out, v, 0);
377 fprintf(out, "]");
379 isl_int_clear(v);
380 fprintf(out, ";\n");
383 static void print_kernel_vars(FILE *out, struct ppcg_kernel *kernel)
385 int i;
387 for (i = 0; i < kernel->n_var; ++i)
388 print_kernel_var(out, &kernel->var[i]);
391 /* Print an access to the element in the private/shared memory copy
392 * described by "stmt". The index of the copy is recorded in
393 * stmt->local_index as a "call" to the array.
395 static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
396 struct ppcg_kernel_stmt *stmt)
398 int i;
399 isl_ast_expr *expr;
400 struct gpu_array_info *array = stmt->u.c.array;
402 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 0);
403 p = isl_printer_print_ast_expr(p, expr);
404 isl_ast_expr_free(expr);
406 for (i = 0; i < array->n_index; ++i) {
407 expr = isl_ast_expr_get_op_arg(stmt->u.c.local_index, 1 + i);
409 p = isl_printer_print_str(p, "[");
410 p = isl_printer_print_ast_expr(p, expr);
411 p = isl_printer_print_str(p, "]");
413 isl_ast_expr_free(expr);
416 return p;
419 /* Print an access to the element in the global memory copy
420 * described by "stmt". The index of the copy is recorded in
421 * stmt->index as a "call" to the array.
423 * The copy in global memory has been linearized, so we need to take
424 * the array size into account.
426 static __isl_give isl_printer *stmt_print_global_index(
427 __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
429 int i;
430 struct gpu_array_info *array = stmt->u.c.array;
431 isl_pw_aff_list *bound = stmt->u.c.local_array->bound;
433 if (gpu_array_is_scalar(array)) {
434 if (!array->read_only)
435 p = isl_printer_print_str(p, "*");
436 p = isl_printer_print_str(p, array->name);
437 return p;
440 p = isl_printer_print_str(p, array->name);
441 p = isl_printer_print_str(p, "[");
442 for (i = 0; i + 1 < array->n_index; ++i)
443 p = isl_printer_print_str(p, "(");
444 for (i = 0; i < array->n_index; ++i) {
445 isl_ast_expr *expr;
446 expr = isl_ast_expr_get_op_arg(stmt->u.c.index, 1 + i);
447 if (i) {
448 isl_pw_aff *bound_i;
449 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
450 p = isl_printer_print_str(p, ") * (");
451 p = isl_printer_print_pw_aff(p, bound_i);
452 p = isl_printer_print_str(p, ") + ");
453 isl_pw_aff_free(bound_i);
455 p = isl_printer_print_ast_expr(p, expr);
456 isl_ast_expr_free(expr);
458 p = isl_printer_print_str(p, "]");
460 return p;
463 /* Print a copy statement.
465 * A read copy statement is printed as
467 * local = global;
469 * while a write copy statement is printed as
471 * global = local;
473 static __isl_give isl_printer *print_copy(__isl_take isl_printer *p,
474 struct ppcg_kernel_stmt *stmt)
476 p = isl_printer_start_line(p);
477 if (stmt->u.c.read) {
478 p = stmt_print_local_index(p, stmt);
479 p = isl_printer_print_str(p, " = ");
480 p = stmt_print_global_index(p, stmt);
481 } else {
482 p = stmt_print_global_index(p, stmt);
483 p = isl_printer_print_str(p, " = ");
484 p = stmt_print_local_index(p, stmt);
486 p = isl_printer_print_str(p, ";");
487 p = isl_printer_end_line(p);
489 return p;
492 /* Print a sync statement.
494 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
495 struct ppcg_kernel_stmt *stmt)
497 p = isl_printer_start_line(p);
498 p = isl_printer_print_str(p, "__syncthreads();");
499 p = isl_printer_end_line(p);
501 return p;
504 /* Print an access based on the information in "access".
505 * If this an access to global memory, then the index expression
506 * is linearized.
508 * If access->array is NULL, then we are
509 * accessing an iterator in the original program.
511 static __isl_give isl_printer *print_access(__isl_take isl_printer *p,
512 struct ppcg_kernel_access *access)
514 int i;
515 unsigned n_index;
516 struct gpu_array_info *array;
517 isl_pw_aff_list *bound;
519 array = access->array;
520 bound = array ? access->local_array->bound : NULL;
521 if (!array)
522 p = isl_printer_print_str(p, "(");
523 else {
524 if (access->type == ppcg_access_global &&
525 gpu_array_is_scalar(array) && !array->read_only)
526 p = isl_printer_print_str(p, "*");
527 p = isl_printer_print_str(p, access->local_name);
528 if (gpu_array_is_scalar(array))
529 return p;
530 p = isl_printer_print_str(p, "[");
533 n_index = isl_ast_expr_list_n_ast_expr(access->index);
534 if (access->type == ppcg_access_global)
535 for (i = 0; i + 1 < n_index; ++i)
536 p = isl_printer_print_str(p, "(");
538 for (i = 0; i < n_index; ++i) {
539 isl_ast_expr *index;
541 index = isl_ast_expr_list_get_ast_expr(access->index, i);
542 if (array && i) {
543 if (access->type == ppcg_access_global) {
544 isl_pw_aff *bound_i;
545 bound_i = isl_pw_aff_list_get_pw_aff(bound, i);
546 p = isl_printer_print_str(p, ") * (");
547 p = isl_printer_print_pw_aff(p, bound_i);
548 p = isl_printer_print_str(p, ") + ");
549 isl_pw_aff_free(bound_i);
550 } else
551 p = isl_printer_print_str(p, "][");
553 p = isl_printer_print_ast_expr(p, index);
554 isl_ast_expr_free(index);
556 if (!array)
557 p = isl_printer_print_str(p, ")");
558 else
559 p = isl_printer_print_str(p, "]");
561 return p;
564 struct cuda_access_print_info {
565 int i;
566 struct ppcg_kernel_stmt *stmt;
569 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
570 * with the pet printer. This means that whenever the pet printer prints a
571 * pet access expression we have the corresponding cuda access available and can
572 * print the modified access.
574 static __isl_give isl_printer *print_cuda_access(__isl_take isl_printer *p,
575 struct pet_expr *expr, void *usr)
577 struct cuda_access_print_info *info =
578 (struct cuda_access_print_info *) usr;
580 p = print_access(p, &info->stmt->u.d.access[info->i]);
581 info->i++;
583 return p;
586 static __isl_give isl_printer *print_stmt_body(__isl_take isl_printer *p,
587 struct ppcg_kernel_stmt *stmt)
589 struct cuda_access_print_info info;
591 info.i = 0;
592 info.stmt = stmt;
594 p = isl_printer_start_line(p);
595 p = print_pet_expr(p, stmt->u.d.stmt->body, &print_cuda_access, &info);
596 p = isl_printer_print_str(p, ";");
597 p = isl_printer_end_line(p);
599 return p;
602 /* This function is called for each user statement in the AST,
603 * i.e., for each kernel body statement, copy statement or sync statement.
605 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
606 __isl_take isl_ast_print_options *print_options,
607 __isl_keep isl_ast_node *node, void *user)
609 isl_id *id;
610 struct ppcg_kernel_stmt *stmt;
612 id = isl_ast_node_get_annotation(node);
613 stmt = isl_id_get_user(id);
614 isl_id_free(id);
616 isl_ast_print_options_free(print_options);
618 switch (stmt->type) {
619 case ppcg_kernel_copy:
620 return print_copy(p, stmt);
621 case ppcg_kernel_sync:
622 return print_sync(p, stmt);
623 case ppcg_kernel_domain:
624 return print_stmt_body(p, stmt);
627 return p;
630 static int print_macro(enum isl_ast_op_type type, void *user)
632 isl_printer **p = user;
634 if (type == isl_ast_op_fdiv_q)
635 return 0;
637 *p = isl_ast_op_type_print_macro(type, *p);
639 return 0;
642 /* Print the required macros for "node", including one for floord.
643 * We always print a macro for floord as it may also appear in the statements.
645 static __isl_give isl_printer *print_macros(
646 __isl_keep isl_ast_node *node, __isl_take isl_printer *p)
648 p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
649 if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
650 return isl_printer_free(p);
651 return p;
654 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
655 struct cuda_info *cuda)
657 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
658 isl_ast_print_options *print_options;
659 isl_printer *p;
661 print_kernel_headers(prog, kernel, cuda);
662 fprintf(cuda->kernel_c, "{\n");
663 print_kernel_iterators(cuda->kernel_c, kernel);
664 print_kernel_vars(cuda->kernel_c, kernel);
665 fprintf(cuda->kernel_c, "\n");
667 print_options = isl_ast_print_options_alloc(ctx);
668 print_options = isl_ast_print_options_set_print_user(print_options,
669 &print_kernel_stmt, NULL);
671 p = isl_printer_to_file(ctx, cuda->kernel_c);
672 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
673 p = isl_printer_indent(p, 4);
674 p = print_macros(kernel->tree, p);
675 p = isl_ast_node_print(kernel->tree, p, print_options);
676 isl_printer_free(p);
678 fprintf(cuda->kernel_c, "}\n");
681 struct print_host_user_data {
682 struct cuda_info *cuda;
683 struct gpu_prog *prog;
686 /* Print the user statement of the host code to "p".
688 * In particular, print a block of statements that defines the grid
689 * and the block and then launches the kernel.
691 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
692 __isl_take isl_ast_print_options *print_options,
693 __isl_keep isl_ast_node *node, void *user)
695 isl_id *id;
696 struct ppcg_kernel *kernel;
697 struct print_host_user_data *data;
699 id = isl_ast_node_get_annotation(node);
700 kernel = isl_id_get_user(id);
701 isl_id_free(id);
703 data = (struct print_host_user_data *) user;
705 p = isl_printer_start_line(p);
706 p = isl_printer_print_str(p, "{");
707 p = isl_printer_end_line(p);
708 p = isl_printer_indent(p, 2);
710 p = isl_printer_start_line(p);
711 p = isl_printer_print_str(p, "dim3 k");
712 p = isl_printer_print_int(p, kernel->id);
713 p = isl_printer_print_str(p, "_dimBlock");
714 print_reverse_list(isl_printer_get_file(p),
715 kernel->n_block, kernel->block_dim);
716 p = isl_printer_print_str(p, ";");
717 p = isl_printer_end_line(p);
719 p = print_grid(p, kernel);
721 p = isl_printer_start_line(p);
722 p = isl_printer_print_str(p, "kernel");
723 p = isl_printer_print_int(p, kernel->id);
724 p = isl_printer_print_str(p, " <<<k");
725 p = isl_printer_print_int(p, kernel->id);
726 p = isl_printer_print_str(p, "_dimGrid, k");
727 p = isl_printer_print_int(p, kernel->id);
728 p = isl_printer_print_str(p, "_dimBlock>>> (");
729 p = print_kernel_arguments(p, data->prog, kernel, 0);
730 p = isl_printer_print_str(p, ");");
731 p = isl_printer_end_line(p);
733 p = isl_printer_start_line(p);
734 p = isl_printer_print_str(p, "cudaCheckKernel();");
735 p = isl_printer_end_line(p);
737 p = isl_printer_indent(p, -2);
738 p = isl_printer_start_line(p);
739 p = isl_printer_print_str(p, "}");
740 p = isl_printer_end_line(p);
742 p = isl_printer_start_line(p);
743 p = isl_printer_end_line(p);
745 print_kernel(data->prog, kernel, data->cuda);
747 isl_ast_print_options_free(print_options);
749 return p;
752 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
753 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
754 struct cuda_info *cuda)
756 isl_ast_print_options *print_options;
757 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
758 struct print_host_user_data data = { cuda, prog };
760 print_options = isl_ast_print_options_alloc(ctx);
761 print_options = isl_ast_print_options_set_print_user(print_options,
762 &print_host_user, &data);
764 p = print_macros(tree, p);
765 p = isl_ast_node_print(tree, p, print_options);
767 return p;
770 /* For each array that is written anywhere in the gpu_prog,
771 * copy the contents back from the GPU to the host.
773 * Arrays that are not visible outside the corresponding scop
774 * do not need to be copied back.
776 static __isl_give isl_printer *copy_arrays_from_device(
777 __isl_take isl_printer *p, struct gpu_prog *prog)
779 int i;
780 isl_union_set *write;
781 write = isl_union_map_range(isl_union_map_copy(prog->write));
783 for (i = 0; i < prog->n_array; ++i) {
784 isl_space *dim;
785 isl_set *write_i;
786 int empty;
788 if (prog->array[i].local)
789 continue;
791 dim = isl_space_copy(prog->array[i].dim);
792 write_i = isl_union_set_extract_set(write, dim);
793 empty = isl_set_fast_is_empty(write_i);
794 isl_set_free(write_i);
795 if (empty)
796 continue;
798 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
799 if (gpu_array_is_scalar(&prog->array[i]))
800 p = isl_printer_print_str(p, "&");
801 p = isl_printer_print_str(p, prog->array[i].name);
802 p = isl_printer_print_str(p, ", dev_");
803 p = isl_printer_print_str(p, prog->array[i].name);
804 p = isl_printer_print_str(p, ", ");
805 p = print_array_size(p, &prog->array[i]);
806 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
807 p = isl_printer_end_line(p);
810 isl_union_set_free(write);
811 p = isl_printer_start_line(p);
812 p = isl_printer_end_line(p);
813 return p;
816 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
817 struct gpu_prog *prog)
819 int i;
821 for (i = 0; i < prog->n_array; ++i) {
822 if (gpu_array_is_read_only_scalar(&prog->array[i]))
823 continue;
824 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
825 p = isl_printer_print_str(p, prog->array[i].name);
826 p = isl_printer_print_str(p, "));");
827 p = isl_printer_end_line(p);
830 return p;
833 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
834 struct ppcg_options *options, const char *input)
836 struct cuda_info cuda;
837 struct gpu_prog *prog;
838 isl_ast_node *tree;
839 isl_printer *p;
841 if (!scop)
842 return -1;
844 scop->context = add_context_from_str(scop->context, options->ctx);
846 prog = gpu_prog_alloc(ctx, scop);
848 tree = generate_gpu(ctx, prog, options);
850 cuda_open_files(&cuda, input);
852 p = isl_printer_to_file(ctx, cuda.host_c);
853 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
854 p = ppcg_print_exposed_declarations(p, scop);
855 p = ppcg_start_block(p);
857 p = print_cuda_macros(p);
859 p = declare_device_arrays(p, prog);
860 p = allocate_device_arrays(p, prog);
861 p = copy_arrays_to_device(p, prog);
863 p = print_host_code(p, prog, tree, &cuda);
864 isl_ast_node_free(tree);
866 p = copy_arrays_from_device(p, prog);
867 p = free_device_arrays(p, prog);
869 p = ppcg_end_block(p);
870 isl_printer_free(p);
872 cuda_close_files(&cuda);
874 gpu_prog_free(prog);
876 return 0;