cuda.c: extract out copy_array_{to,from}_device
[ppcg.git] / cuda.c
blob890bd19c8002ac05c22f9222af38066c03d60847
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 /* Print a declaration for the device array corresponding to "array" on "p".
43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
44 struct gpu_array_info *array)
46 int i;
48 p = isl_printer_start_line(p);
49 p = isl_printer_print_str(p, array->type);
50 p = isl_printer_print_str(p, " ");
51 if (!array->linearize && array->n_index > 1)
52 p = isl_printer_print_str(p, "(");
53 p = isl_printer_print_str(p, "*dev_");
54 p = isl_printer_print_str(p, array->name);
55 if (!array->linearize && array->n_index > 1) {
56 p = isl_printer_print_str(p, ")");
57 for (i = 1; i < array->n_index; i++) {
58 p = isl_printer_print_str(p, "[");
59 p = isl_printer_print_pw_aff(p, array->bound[i]);
60 p = isl_printer_print_str(p, "]");
63 p = isl_printer_print_str(p, ";");
64 p = isl_printer_end_line(p);
66 return p;
69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
70 struct gpu_prog *prog)
72 int i;
74 for (i = 0; i < prog->n_array; ++i) {
75 if (gpu_array_is_read_only_scalar(&prog->array[i]))
76 continue;
77 if (!prog->array[i].accessed)
78 continue;
80 p = declare_device_array(p, &prog->array[i]);
82 p = isl_printer_start_line(p);
83 p = isl_printer_end_line(p);
84 return p;
87 static __isl_give isl_printer *allocate_device_arrays(
88 __isl_take isl_printer *p, struct gpu_prog *prog)
90 int i;
92 for (i = 0; i < prog->n_array; ++i) {
93 if (gpu_array_is_read_only_scalar(&prog->array[i]))
94 continue;
95 if (!prog->array[i].accessed)
96 continue;
97 p = isl_printer_start_line(p);
98 p = isl_printer_print_str(p,
99 "cudaCheckReturn(cudaMalloc((void **) &dev_");
100 p = isl_printer_print_str(p, prog->array[i].name);
101 p = isl_printer_print_str(p, ", ");
102 p = gpu_array_info_print_size(p, &prog->array[i]);
103 p = isl_printer_print_str(p, "));");
104 p = isl_printer_end_line(p);
106 p = isl_printer_start_line(p);
107 p = isl_printer_end_line(p);
108 return p;
111 /* Print code to "p" for copying "array" from the host to the device
112 * in its entirety. The bounds on the extent of "array" have
113 * been precomputed in extract_array_info and are used in
114 * gpu_array_info_print_size.
116 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
117 struct gpu_array_info *array)
119 p = isl_printer_start_line(p);
120 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
121 p = isl_printer_print_str(p, array->name);
122 p = isl_printer_print_str(p, ", ");
124 if (gpu_array_is_scalar(array))
125 p = isl_printer_print_str(p, "&");
126 p = isl_printer_print_str(p, array->name);
127 p = isl_printer_print_str(p, ", ");
129 p = gpu_array_info_print_size(p, array);
130 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
131 p = isl_printer_end_line(p);
133 return p;
136 /* Print code to "p" for copying "array" back from the device to the host
137 * in its entirety. The bounds on the extent of "array" have
138 * been precomputed in extract_array_info and are used in
139 * gpu_array_info_print_size.
141 static __isl_give isl_printer *copy_array_from_device(
142 __isl_take isl_printer *p, struct gpu_array_info *array)
144 p = isl_printer_start_line(p);
145 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
146 if (gpu_array_is_scalar(array))
147 p = isl_printer_print_str(p, "&");
148 p = isl_printer_print_str(p, array->name);
149 p = isl_printer_print_str(p, ", dev_");
150 p = isl_printer_print_str(p, array->name);
151 p = isl_printer_print_str(p, ", ");
152 p = gpu_array_info_print_size(p, array);
153 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
154 p = isl_printer_end_line(p);
156 return p;
159 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
160 struct gpu_prog *prog)
162 int i;
164 for (i = 0; i < prog->n_array; ++i) {
165 isl_space *space;
166 isl_set *read_i;
167 int empty;
169 if (gpu_array_is_read_only_scalar(&prog->array[i]))
170 continue;
172 space = isl_space_copy(prog->array[i].space);
173 read_i = isl_union_set_extract_set(prog->copy_in, space);
174 empty = isl_set_plain_is_empty(read_i);
175 isl_set_free(read_i);
176 if (empty)
177 continue;
179 p = copy_array_to_device(p, &prog->array[i]);
181 p = isl_printer_start_line(p);
182 p = isl_printer_end_line(p);
183 return p;
186 static void print_reverse_list(FILE *out, int len, int *list)
188 int i;
190 if (len == 0)
191 return;
193 fprintf(out, "(");
194 for (i = 0; i < len; ++i) {
195 if (i)
196 fprintf(out, ", ");
197 fprintf(out, "%d", list[len - 1 - i]);
199 fprintf(out, ")");
202 /* Print the effective grid size as a list of the sizes in each
203 * dimension, from innermost to outermost.
205 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
206 struct ppcg_kernel *kernel)
208 int i;
209 int dim;
211 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
212 if (dim == 0)
213 return p;
215 p = isl_printer_print_str(p, "(");
216 for (i = dim - 1; i >= 0; --i) {
217 isl_pw_aff *bound;
219 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
220 p = isl_printer_print_pw_aff(p, bound);
221 isl_pw_aff_free(bound);
223 if (i > 0)
224 p = isl_printer_print_str(p, ", ");
227 p = isl_printer_print_str(p, ")");
229 return p;
232 /* Print the grid definition.
234 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
235 struct ppcg_kernel *kernel)
237 p = isl_printer_start_line(p);
238 p = isl_printer_print_str(p, "dim3 k");
239 p = isl_printer_print_int(p, kernel->id);
240 p = isl_printer_print_str(p, "_dimGrid");
241 p = print_grid_size(p, kernel);
242 p = isl_printer_print_str(p, ";");
243 p = isl_printer_end_line(p);
245 return p;
248 /* Print the arguments to a kernel declaration or call. If "types" is set,
249 * then print a declaration (including the types of the arguments).
251 * The arguments are printed in the following order
252 * - the arrays accessed by the kernel
253 * - the parameters
254 * - the host loop iterators
256 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
257 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
259 int i, n;
260 int first = 1;
261 unsigned nparam;
262 isl_space *space;
263 const char *type;
265 for (i = 0; i < prog->n_array; ++i) {
266 isl_set *arr;
267 int empty;
269 space = isl_space_copy(prog->array[i].space);
270 arr = isl_union_set_extract_set(kernel->arrays, space);
271 empty = isl_set_plain_is_empty(arr);
272 isl_set_free(arr);
273 if (empty)
274 continue;
276 if (!first)
277 p = isl_printer_print_str(p, ", ");
279 if (types)
280 p = gpu_array_info_print_declaration_argument(p,
281 &prog->array[i], NULL);
282 else
283 p = gpu_array_info_print_call_argument(p,
284 &prog->array[i]);
286 first = 0;
289 space = isl_union_set_get_space(kernel->arrays);
290 nparam = isl_space_dim(space, isl_dim_param);
291 for (i = 0; i < nparam; ++i) {
292 const char *name;
294 name = isl_space_get_dim_name(space, isl_dim_param, i);
296 if (!first)
297 p = isl_printer_print_str(p, ", ");
298 if (types)
299 p = isl_printer_print_str(p, "int ");
300 p = isl_printer_print_str(p, name);
302 first = 0;
304 isl_space_free(space);
306 n = isl_space_dim(kernel->space, isl_dim_set);
307 type = isl_options_get_ast_iterator_type(prog->ctx);
308 for (i = 0; i < n; ++i) {
309 const char *name;
311 if (!first)
312 p = isl_printer_print_str(p, ", ");
313 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
314 if (types) {
315 p = isl_printer_print_str(p, type);
316 p = isl_printer_print_str(p, " ");
318 p = isl_printer_print_str(p, name);
320 first = 0;
323 return p;
326 /* Print the header of the given kernel.
328 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
329 struct gpu_prog *prog, struct ppcg_kernel *kernel)
331 p = isl_printer_start_line(p);
332 p = isl_printer_print_str(p, "__global__ void kernel");
333 p = isl_printer_print_int(p, kernel->id);
334 p = isl_printer_print_str(p, "(");
335 p = print_kernel_arguments(p, prog, kernel, 1);
336 p = isl_printer_print_str(p, ")");
338 return p;
341 /* Print the header of the given kernel to both gen->cuda.kernel_h
342 * and gen->cuda.kernel_c.
344 static void print_kernel_headers(struct gpu_prog *prog,
345 struct ppcg_kernel *kernel, struct cuda_info *cuda)
347 isl_printer *p;
349 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
350 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
351 p = print_kernel_header(p, prog, kernel);
352 p = isl_printer_print_str(p, ";");
353 p = isl_printer_end_line(p);
354 isl_printer_free(p);
356 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
357 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
358 p = print_kernel_header(p, prog, kernel);
359 p = isl_printer_end_line(p);
360 isl_printer_free(p);
363 static void print_indent(FILE *dst, int indent)
365 fprintf(dst, "%*s", indent, "");
368 /* Print a list of iterators of type "type" with names "ids" to "out".
369 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
370 * In particular, the last iterator is assigned the x identifier
371 * (the first in the list of cuda identifiers).
373 static void print_iterators(FILE *out, const char *type,
374 __isl_keep isl_id_list *ids, const char *cuda_dims[])
376 int i, n;
378 n = isl_id_list_n_id(ids);
379 if (n <= 0)
380 return;
381 print_indent(out, 4);
382 fprintf(out, "%s ", type);
383 for (i = 0; i < n; ++i) {
384 isl_id *id;
386 if (i)
387 fprintf(out, ", ");
388 id = isl_id_list_get_id(ids, i);
389 fprintf(out, "%s = %s", isl_id_get_name(id),
390 cuda_dims[n - 1 - i]);
391 isl_id_free(id);
393 fprintf(out, ";\n");
396 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
398 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
399 const char *type;
400 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
401 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
402 "threadIdx.z" };
404 type = isl_options_get_ast_iterator_type(ctx);
406 print_iterators(out, type, kernel->block_ids, block_dims);
407 print_iterators(out, type, kernel->thread_ids, thread_dims);
410 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
411 struct ppcg_kernel_var *var)
413 int j;
415 p = isl_printer_start_line(p);
416 if (var->type == ppcg_access_shared)
417 p = isl_printer_print_str(p, "__shared__ ");
418 p = isl_printer_print_str(p, var->array->type);
419 p = isl_printer_print_str(p, " ");
420 p = isl_printer_print_str(p, var->name);
421 for (j = 0; j < var->array->n_index; ++j) {
422 isl_val *v;
424 p = isl_printer_print_str(p, "[");
425 v = isl_vec_get_element_val(var->size, j);
426 p = isl_printer_print_val(p, v);
427 isl_val_free(v);
428 p = isl_printer_print_str(p, "]");
430 p = isl_printer_print_str(p, ";");
431 p = isl_printer_end_line(p);
433 return p;
436 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
437 struct ppcg_kernel *kernel)
439 int i;
441 for (i = 0; i < kernel->n_var; ++i)
442 p = print_kernel_var(p, &kernel->var[i]);
444 return p;
447 /* Print a sync statement.
449 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
450 struct ppcg_kernel_stmt *stmt)
452 p = isl_printer_start_line(p);
453 p = isl_printer_print_str(p, "__syncthreads();");
454 p = isl_printer_end_line(p);
456 return p;
459 /* This function is called for each user statement in the AST,
460 * i.e., for each kernel body statement, copy statement or sync statement.
462 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
463 __isl_take isl_ast_print_options *print_options,
464 __isl_keep isl_ast_node *node, void *user)
466 isl_id *id;
467 struct ppcg_kernel_stmt *stmt;
469 id = isl_ast_node_get_annotation(node);
470 stmt = isl_id_get_user(id);
471 isl_id_free(id);
473 isl_ast_print_options_free(print_options);
475 switch (stmt->type) {
476 case ppcg_kernel_copy:
477 return ppcg_kernel_print_copy(p, stmt);
478 case ppcg_kernel_sync:
479 return print_sync(p, stmt);
480 case ppcg_kernel_domain:
481 return ppcg_kernel_print_domain(p, stmt);
484 return p;
487 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
488 struct cuda_info *cuda)
490 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
491 isl_ast_print_options *print_options;
492 isl_printer *p;
494 print_kernel_headers(prog, kernel, cuda);
495 fprintf(cuda->kernel_c, "{\n");
496 print_kernel_iterators(cuda->kernel_c, kernel);
498 p = isl_printer_to_file(ctx, cuda->kernel_c);
499 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
500 p = isl_printer_indent(p, 4);
502 p = print_kernel_vars(p, kernel);
503 p = isl_printer_end_line(p);
504 p = gpu_print_macros(p, kernel->tree);
506 print_options = isl_ast_print_options_alloc(ctx);
507 print_options = isl_ast_print_options_set_print_user(print_options,
508 &print_kernel_stmt, NULL);
509 p = isl_ast_node_print(kernel->tree, p, print_options);
510 isl_printer_free(p);
512 fprintf(cuda->kernel_c, "}\n");
515 struct print_host_user_data {
516 struct cuda_info *cuda;
517 struct gpu_prog *prog;
520 /* Print the user statement of the host code to "p".
522 * In particular, print a block of statements that defines the grid
523 * and the block and then launches the kernel.
525 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
526 __isl_take isl_ast_print_options *print_options,
527 __isl_keep isl_ast_node *node, void *user)
529 isl_id *id;
530 struct ppcg_kernel *kernel;
531 struct print_host_user_data *data;
533 id = isl_ast_node_get_annotation(node);
534 kernel = isl_id_get_user(id);
535 isl_id_free(id);
537 data = (struct print_host_user_data *) user;
539 p = isl_printer_start_line(p);
540 p = isl_printer_print_str(p, "{");
541 p = isl_printer_end_line(p);
542 p = isl_printer_indent(p, 2);
544 p = isl_printer_start_line(p);
545 p = isl_printer_print_str(p, "dim3 k");
546 p = isl_printer_print_int(p, kernel->id);
547 p = isl_printer_print_str(p, "_dimBlock");
548 print_reverse_list(isl_printer_get_file(p),
549 kernel->n_block, kernel->block_dim);
550 p = isl_printer_print_str(p, ";");
551 p = isl_printer_end_line(p);
553 p = print_grid(p, kernel);
555 p = isl_printer_start_line(p);
556 p = isl_printer_print_str(p, "kernel");
557 p = isl_printer_print_int(p, kernel->id);
558 p = isl_printer_print_str(p, " <<<k");
559 p = isl_printer_print_int(p, kernel->id);
560 p = isl_printer_print_str(p, "_dimGrid, k");
561 p = isl_printer_print_int(p, kernel->id);
562 p = isl_printer_print_str(p, "_dimBlock>>> (");
563 p = print_kernel_arguments(p, data->prog, kernel, 0);
564 p = isl_printer_print_str(p, ");");
565 p = isl_printer_end_line(p);
567 p = isl_printer_start_line(p);
568 p = isl_printer_print_str(p, "cudaCheckKernel();");
569 p = isl_printer_end_line(p);
571 p = isl_printer_indent(p, -2);
572 p = isl_printer_start_line(p);
573 p = isl_printer_print_str(p, "}");
574 p = isl_printer_end_line(p);
576 p = isl_printer_start_line(p);
577 p = isl_printer_end_line(p);
579 print_kernel(data->prog, kernel, data->cuda);
581 isl_ast_print_options_free(print_options);
583 return p;
586 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
587 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
588 struct cuda_info *cuda)
590 isl_ast_print_options *print_options;
591 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
592 struct print_host_user_data data = { cuda, prog };
594 print_options = isl_ast_print_options_alloc(ctx);
595 print_options = isl_ast_print_options_set_print_user(print_options,
596 &print_host_user, &data);
598 p = gpu_print_macros(p, tree);
599 p = isl_ast_node_print(tree, p, print_options);
601 return p;
604 /* For each array that needs to be copied out (based on prog->copy_out),
605 * copy the contents back from the GPU to the host.
607 * If any element of a given array appears in prog->copy_out, then its
608 * entire extent is in prog->copy_out. The bounds on this extent have
609 * been precomputed in extract_array_info and are used in
610 * gpu_array_info_print_size.
612 static __isl_give isl_printer *copy_arrays_from_device(
613 __isl_take isl_printer *p, struct gpu_prog *prog)
615 int i;
616 isl_union_set *copy_out;
617 copy_out = isl_union_set_copy(prog->copy_out);
619 for (i = 0; i < prog->n_array; ++i) {
620 isl_space *space;
621 isl_set *copy_out_i;
622 int empty;
624 space = isl_space_copy(prog->array[i].space);
625 copy_out_i = isl_union_set_extract_set(copy_out, space);
626 empty = isl_set_plain_is_empty(copy_out_i);
627 isl_set_free(copy_out_i);
628 if (empty)
629 continue;
631 p = copy_array_from_device(p, &prog->array[i]);
634 isl_union_set_free(copy_out);
635 p = isl_printer_start_line(p);
636 p = isl_printer_end_line(p);
637 return p;
640 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
641 struct gpu_prog *prog)
643 int i;
645 for (i = 0; i < prog->n_array; ++i) {
646 if (gpu_array_is_read_only_scalar(&prog->array[i]))
647 continue;
648 if (!prog->array[i].accessed)
649 continue;
650 p = isl_printer_start_line(p);
651 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
652 p = isl_printer_print_str(p, prog->array[i].name);
653 p = isl_printer_print_str(p, "));");
654 p = isl_printer_end_line(p);
657 return p;
660 /* Given a gpu_prog "prog" and the corresponding transformed AST
661 * "tree", print the entire CUDA code to "p".
662 * "types" collects the types for which a definition has already
663 * been printed.
665 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
666 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
667 struct gpu_types *types, void *user)
669 struct cuda_info *cuda = user;
670 isl_printer *kernel;
672 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
673 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
674 kernel = gpu_print_types(kernel, types, prog);
675 isl_printer_free(kernel);
677 if (!kernel)
678 return isl_printer_free(p);
680 p = ppcg_start_block(p);
682 p = print_cuda_macros(p);
684 p = declare_device_arrays(p, prog);
685 p = allocate_device_arrays(p, prog);
686 p = copy_arrays_to_device(p, prog);
688 p = print_host_code(p, prog, tree, cuda);
690 p = copy_arrays_from_device(p, prog);
691 p = free_device_arrays(p, prog);
693 p = ppcg_end_block(p);
695 return p;
698 /* Transform the code in the file called "input" by replacing
699 * all scops by corresponding CUDA code.
700 * The names of the output files are derived from "input".
702 * We let generate_gpu do all the hard work and then let it call
703 * us back for printing the AST in print_cuda.
705 * To prepare for this printing, we first open the output files
706 * and we close them after generate_gpu has finished.
708 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
709 const char *input)
711 struct cuda_info cuda;
712 int r;
714 cuda_open_files(&cuda, input);
716 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
718 cuda_close_files(&cuda);
720 return r;