gpu backend: declare local variables that are used on the host
[ppcg.git] / cuda.c
blob5b8158f2f55892b9747efb8ab5dbb3a7fbe1dd8c
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
36 " } while(0)\n\n";
38 p = isl_printer_print_str(p, macros);
39 return p;
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 struct gpu_array_info *array)
47 int i;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, array->type);
51 p = isl_printer_print_str(p, " ");
52 if (!array->linearize && array->n_index > 1)
53 p = isl_printer_print_str(p, "(");
54 p = isl_printer_print_str(p, "*dev_");
55 p = isl_printer_print_str(p, array->name);
56 if (!array->linearize && array->n_index > 1) {
57 p = isl_printer_print_str(p, ")");
58 for (i = 1; i < array->n_index; i++) {
59 p = isl_printer_print_str(p, "[");
60 p = isl_printer_print_pw_aff(p, array->bound[i]);
61 p = isl_printer_print_str(p, "]");
64 p = isl_printer_print_str(p, ";");
65 p = isl_printer_end_line(p);
67 return p;
70 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
71 struct gpu_prog *prog)
73 int i;
75 for (i = 0; i < prog->n_array; ++i) {
76 if (gpu_array_is_read_only_scalar(&prog->array[i]))
77 continue;
78 if (!prog->array[i].accessed)
79 continue;
81 p = declare_device_array(p, &prog->array[i]);
83 p = isl_printer_start_line(p);
84 p = isl_printer_end_line(p);
85 return p;
88 static __isl_give isl_printer *allocate_device_arrays(
89 __isl_take isl_printer *p, struct gpu_prog *prog)
91 int i;
93 for (i = 0; i < prog->n_array; ++i) {
94 if (gpu_array_is_read_only_scalar(&prog->array[i]))
95 continue;
96 if (!prog->array[i].accessed)
97 continue;
98 p = isl_printer_start_line(p);
99 p = isl_printer_print_str(p,
100 "cudaCheckReturn(cudaMalloc((void **) &dev_");
101 p = isl_printer_print_str(p, prog->array[i].name);
102 p = isl_printer_print_str(p, ", ");
103 p = gpu_array_info_print_size(p, &prog->array[i]);
104 p = isl_printer_print_str(p, "));");
105 p = isl_printer_end_line(p);
107 p = isl_printer_start_line(p);
108 p = isl_printer_end_line(p);
109 return p;
112 /* Print code to "p" for copying "array" from the host to the device
113 * in its entirety. The bounds on the extent of "array" have
114 * been precomputed in extract_array_info and are used in
115 * gpu_array_info_print_size.
117 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
118 struct gpu_array_info *array)
120 p = isl_printer_start_line(p);
121 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
122 p = isl_printer_print_str(p, array->name);
123 p = isl_printer_print_str(p, ", ");
125 if (gpu_array_is_scalar(array))
126 p = isl_printer_print_str(p, "&");
127 p = isl_printer_print_str(p, array->name);
128 p = isl_printer_print_str(p, ", ");
130 p = gpu_array_info_print_size(p, array);
131 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
132 p = isl_printer_end_line(p);
134 return p;
137 /* Print code to "p" for copying "array" back from the device to the host
138 * in its entirety. The bounds on the extent of "array" have
139 * been precomputed in extract_array_info and are used in
140 * gpu_array_info_print_size.
142 static __isl_give isl_printer *copy_array_from_device(
143 __isl_take isl_printer *p, struct gpu_array_info *array)
145 p = isl_printer_start_line(p);
146 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
147 if (gpu_array_is_scalar(array))
148 p = isl_printer_print_str(p, "&");
149 p = isl_printer_print_str(p, array->name);
150 p = isl_printer_print_str(p, ", dev_");
151 p = isl_printer_print_str(p, array->name);
152 p = isl_printer_print_str(p, ", ");
153 p = gpu_array_info_print_size(p, array);
154 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
155 p = isl_printer_end_line(p);
157 return p;
160 static void print_reverse_list(FILE *out, int len, int *list)
162 int i;
164 if (len == 0)
165 return;
167 fprintf(out, "(");
168 for (i = 0; i < len; ++i) {
169 if (i)
170 fprintf(out, ", ");
171 fprintf(out, "%d", list[len - 1 - i]);
173 fprintf(out, ")");
176 /* Print the effective grid size as a list of the sizes in each
177 * dimension, from innermost to outermost.
179 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
180 struct ppcg_kernel *kernel)
182 int i;
183 int dim;
185 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
186 if (dim == 0)
187 return p;
189 p = isl_printer_print_str(p, "(");
190 for (i = dim - 1; i >= 0; --i) {
191 isl_pw_aff *bound;
193 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
194 p = isl_printer_print_pw_aff(p, bound);
195 isl_pw_aff_free(bound);
197 if (i > 0)
198 p = isl_printer_print_str(p, ", ");
201 p = isl_printer_print_str(p, ")");
203 return p;
206 /* Print the grid definition.
208 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
209 struct ppcg_kernel *kernel)
211 p = isl_printer_start_line(p);
212 p = isl_printer_print_str(p, "dim3 k");
213 p = isl_printer_print_int(p, kernel->id);
214 p = isl_printer_print_str(p, "_dimGrid");
215 p = print_grid_size(p, kernel);
216 p = isl_printer_print_str(p, ";");
217 p = isl_printer_end_line(p);
219 return p;
222 /* Print the arguments to a kernel declaration or call. If "types" is set,
223 * then print a declaration (including the types of the arguments).
225 * The arguments are printed in the following order
226 * - the arrays accessed by the kernel
227 * - the parameters
228 * - the host loop iterators
230 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
231 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
233 int i, n;
234 int first = 1;
235 unsigned nparam;
236 isl_space *space;
237 const char *type;
239 for (i = 0; i < prog->n_array; ++i) {
240 isl_set *arr;
241 int empty;
243 space = isl_space_copy(prog->array[i].space);
244 arr = isl_union_set_extract_set(kernel->arrays, space);
245 empty = isl_set_plain_is_empty(arr);
246 isl_set_free(arr);
247 if (empty)
248 continue;
250 if (!first)
251 p = isl_printer_print_str(p, ", ");
253 if (types)
254 p = gpu_array_info_print_declaration_argument(p,
255 &prog->array[i], NULL);
256 else
257 p = gpu_array_info_print_call_argument(p,
258 &prog->array[i]);
260 first = 0;
263 space = isl_union_set_get_space(kernel->arrays);
264 nparam = isl_space_dim(space, isl_dim_param);
265 for (i = 0; i < nparam; ++i) {
266 const char *name;
268 name = isl_space_get_dim_name(space, isl_dim_param, i);
270 if (!first)
271 p = isl_printer_print_str(p, ", ");
272 if (types)
273 p = isl_printer_print_str(p, "int ");
274 p = isl_printer_print_str(p, name);
276 first = 0;
278 isl_space_free(space);
280 n = isl_space_dim(kernel->space, isl_dim_set);
281 type = isl_options_get_ast_iterator_type(prog->ctx);
282 for (i = 0; i < n; ++i) {
283 const char *name;
285 if (!first)
286 p = isl_printer_print_str(p, ", ");
287 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
288 if (types) {
289 p = isl_printer_print_str(p, type);
290 p = isl_printer_print_str(p, " ");
292 p = isl_printer_print_str(p, name);
294 first = 0;
297 return p;
300 /* Print the header of the given kernel.
302 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
303 struct gpu_prog *prog, struct ppcg_kernel *kernel)
305 p = isl_printer_start_line(p);
306 p = isl_printer_print_str(p, "__global__ void kernel");
307 p = isl_printer_print_int(p, kernel->id);
308 p = isl_printer_print_str(p, "(");
309 p = print_kernel_arguments(p, prog, kernel, 1);
310 p = isl_printer_print_str(p, ")");
312 return p;
315 /* Print the header of the given kernel to both gen->cuda.kernel_h
316 * and gen->cuda.kernel_c.
318 static void print_kernel_headers(struct gpu_prog *prog,
319 struct ppcg_kernel *kernel, struct cuda_info *cuda)
321 isl_printer *p;
323 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
324 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
325 p = print_kernel_header(p, prog, kernel);
326 p = isl_printer_print_str(p, ";");
327 p = isl_printer_end_line(p);
328 isl_printer_free(p);
330 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
331 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
332 p = print_kernel_header(p, prog, kernel);
333 p = isl_printer_end_line(p);
334 isl_printer_free(p);
337 static void print_indent(FILE *dst, int indent)
339 fprintf(dst, "%*s", indent, "");
342 /* Print a list of iterators of type "type" with names "ids" to "out".
343 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
344 * In particular, the last iterator is assigned the x identifier
345 * (the first in the list of cuda identifiers).
347 static void print_iterators(FILE *out, const char *type,
348 __isl_keep isl_id_list *ids, const char *cuda_dims[])
350 int i, n;
352 n = isl_id_list_n_id(ids);
353 if (n <= 0)
354 return;
355 print_indent(out, 4);
356 fprintf(out, "%s ", type);
357 for (i = 0; i < n; ++i) {
358 isl_id *id;
360 if (i)
361 fprintf(out, ", ");
362 id = isl_id_list_get_id(ids, i);
363 fprintf(out, "%s = %s", isl_id_get_name(id),
364 cuda_dims[n - 1 - i]);
365 isl_id_free(id);
367 fprintf(out, ";\n");
370 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
372 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
373 const char *type;
374 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
375 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
376 "threadIdx.z" };
378 type = isl_options_get_ast_iterator_type(ctx);
380 print_iterators(out, type, kernel->block_ids, block_dims);
381 print_iterators(out, type, kernel->thread_ids, thread_dims);
384 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
385 struct ppcg_kernel_var *var)
387 int j;
389 p = isl_printer_start_line(p);
390 if (var->type == ppcg_access_shared)
391 p = isl_printer_print_str(p, "__shared__ ");
392 p = isl_printer_print_str(p, var->array->type);
393 p = isl_printer_print_str(p, " ");
394 p = isl_printer_print_str(p, var->name);
395 for (j = 0; j < var->array->n_index; ++j) {
396 isl_val *v;
398 p = isl_printer_print_str(p, "[");
399 v = isl_vec_get_element_val(var->size, j);
400 p = isl_printer_print_val(p, v);
401 isl_val_free(v);
402 p = isl_printer_print_str(p, "]");
404 p = isl_printer_print_str(p, ";");
405 p = isl_printer_end_line(p);
407 return p;
410 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
411 struct ppcg_kernel *kernel)
413 int i;
415 for (i = 0; i < kernel->n_var; ++i)
416 p = print_kernel_var(p, &kernel->var[i]);
418 return p;
421 /* Print a sync statement.
423 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
424 struct ppcg_kernel_stmt *stmt)
426 p = isl_printer_start_line(p);
427 p = isl_printer_print_str(p, "__syncthreads();");
428 p = isl_printer_end_line(p);
430 return p;
433 /* This function is called for each user statement in the AST,
434 * i.e., for each kernel body statement, copy statement or sync statement.
436 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
437 __isl_take isl_ast_print_options *print_options,
438 __isl_keep isl_ast_node *node, void *user)
440 isl_id *id;
441 struct ppcg_kernel_stmt *stmt;
443 id = isl_ast_node_get_annotation(node);
444 stmt = isl_id_get_user(id);
445 isl_id_free(id);
447 isl_ast_print_options_free(print_options);
449 switch (stmt->type) {
450 case ppcg_kernel_copy:
451 return ppcg_kernel_print_copy(p, stmt);
452 case ppcg_kernel_sync:
453 return print_sync(p, stmt);
454 case ppcg_kernel_domain:
455 return ppcg_kernel_print_domain(p, stmt);
458 return p;
461 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
462 struct cuda_info *cuda)
464 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
465 isl_ast_print_options *print_options;
466 isl_printer *p;
468 print_kernel_headers(prog, kernel, cuda);
469 fprintf(cuda->kernel_c, "{\n");
470 print_kernel_iterators(cuda->kernel_c, kernel);
472 p = isl_printer_to_file(ctx, cuda->kernel_c);
473 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
474 p = isl_printer_indent(p, 4);
476 p = print_kernel_vars(p, kernel);
477 p = isl_printer_end_line(p);
478 p = gpu_print_macros(p, kernel->tree);
480 print_options = isl_ast_print_options_alloc(ctx);
481 print_options = isl_ast_print_options_set_print_user(print_options,
482 &print_kernel_stmt, NULL);
483 p = isl_ast_node_print(kernel->tree, p, print_options);
484 isl_printer_free(p);
486 fprintf(cuda->kernel_c, "}\n");
489 /* Print a statement for copying an array to or from the device.
490 * The statement identifier is called "to_device_<array name>" or
491 * "from_device_<array name>" and its user pointer points
492 * to the gpu_array_info of the array that needs to be copied.
494 * Extract the array from the identifier and call
495 * copy_array_to_device or copy_array_from_device.
497 static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
498 __isl_keep isl_ast_node *node, struct gpu_prog *prog)
500 isl_ast_expr *expr, *arg;
501 isl_id *id;
502 const char *name;
503 struct gpu_array_info *array;
505 expr = isl_ast_node_user_get_expr(node);
506 arg = isl_ast_expr_get_op_arg(expr, 0);
507 id = isl_ast_expr_get_id(arg);
508 name = isl_id_get_name(id);
509 array = isl_id_get_user(id);
510 isl_id_free(id);
511 isl_ast_expr_free(arg);
512 isl_ast_expr_free(expr);
514 if (!name)
515 array = NULL;
516 if (!array)
517 return isl_printer_free(p);
519 if (!prefixcmp(name, "to_device"))
520 return copy_array_to_device(p, array);
521 else
522 return copy_array_from_device(p, array);
525 struct print_host_user_data {
526 struct cuda_info *cuda;
527 struct gpu_prog *prog;
530 /* Print the user statement of the host code to "p".
532 * The host code only contains kernel launches and statements
533 * that copy data to/from the device.
534 * The kernel launches have an associated annotation, while
535 * the data copy statements do not.
536 * The latter are handled by print_to_from_device.
538 * In case of a kernel launch, print a block of statements that
539 * defines the grid and the block and then launches the kernel.
541 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
542 __isl_take isl_ast_print_options *print_options,
543 __isl_keep isl_ast_node *node, void *user)
545 isl_id *id;
546 struct ppcg_kernel *kernel;
547 struct print_host_user_data *data;
549 isl_ast_print_options_free(print_options);
551 data = (struct print_host_user_data *) user;
553 id = isl_ast_node_get_annotation(node);
554 if (!id)
555 return print_to_from_device(p, node, data->prog);
557 kernel = isl_id_get_user(id);
558 isl_id_free(id);
560 p = isl_printer_start_line(p);
561 p = isl_printer_print_str(p, "{");
562 p = isl_printer_end_line(p);
563 p = isl_printer_indent(p, 2);
565 p = isl_printer_start_line(p);
566 p = isl_printer_print_str(p, "dim3 k");
567 p = isl_printer_print_int(p, kernel->id);
568 p = isl_printer_print_str(p, "_dimBlock");
569 print_reverse_list(isl_printer_get_file(p),
570 kernel->n_block, kernel->block_dim);
571 p = isl_printer_print_str(p, ";");
572 p = isl_printer_end_line(p);
574 p = print_grid(p, kernel);
576 p = isl_printer_start_line(p);
577 p = isl_printer_print_str(p, "kernel");
578 p = isl_printer_print_int(p, kernel->id);
579 p = isl_printer_print_str(p, " <<<k");
580 p = isl_printer_print_int(p, kernel->id);
581 p = isl_printer_print_str(p, "_dimGrid, k");
582 p = isl_printer_print_int(p, kernel->id);
583 p = isl_printer_print_str(p, "_dimBlock>>> (");
584 p = print_kernel_arguments(p, data->prog, kernel, 0);
585 p = isl_printer_print_str(p, ");");
586 p = isl_printer_end_line(p);
588 p = isl_printer_start_line(p);
589 p = isl_printer_print_str(p, "cudaCheckKernel();");
590 p = isl_printer_end_line(p);
592 p = isl_printer_indent(p, -2);
593 p = isl_printer_start_line(p);
594 p = isl_printer_print_str(p, "}");
595 p = isl_printer_end_line(p);
597 p = isl_printer_start_line(p);
598 p = isl_printer_end_line(p);
600 print_kernel(data->prog, kernel, data->cuda);
602 return p;
605 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
606 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
607 struct cuda_info *cuda)
609 isl_ast_print_options *print_options;
610 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
611 struct print_host_user_data data = { cuda, prog };
613 print_options = isl_ast_print_options_alloc(ctx);
614 print_options = isl_ast_print_options_set_print_user(print_options,
615 &print_host_user, &data);
617 p = gpu_print_macros(p, tree);
618 p = isl_ast_node_print(tree, p, print_options);
620 return p;
623 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
624 struct gpu_prog *prog)
626 int i;
628 for (i = 0; i < prog->n_array; ++i) {
629 if (gpu_array_is_read_only_scalar(&prog->array[i]))
630 continue;
631 if (!prog->array[i].accessed)
632 continue;
633 p = isl_printer_start_line(p);
634 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
635 p = isl_printer_print_str(p, prog->array[i].name);
636 p = isl_printer_print_str(p, "));");
637 p = isl_printer_end_line(p);
640 return p;
643 /* Given a gpu_prog "prog" and the corresponding transformed AST
644 * "tree", print the entire CUDA code to "p".
645 * "types" collects the types for which a definition has already
646 * been printed.
648 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
649 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
650 struct gpu_types *types, void *user)
652 struct cuda_info *cuda = user;
653 isl_printer *kernel;
655 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
656 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
657 kernel = gpu_print_types(kernel, types, prog);
658 isl_printer_free(kernel);
660 if (!kernel)
661 return isl_printer_free(p);
663 p = ppcg_start_block(p);
665 p = print_cuda_macros(p);
667 p = gpu_print_local_declarations(p, prog);
668 p = declare_device_arrays(p, prog);
669 p = allocate_device_arrays(p, prog);
671 p = print_host_code(p, prog, tree, cuda);
673 p = free_device_arrays(p, prog);
675 p = ppcg_end_block(p);
677 return p;
680 /* Transform the code in the file called "input" by replacing
681 * all scops by corresponding CUDA code.
682 * The names of the output files are derived from "input".
684 * We let generate_gpu do all the hard work and then let it call
685 * us back for printing the AST in print_cuda.
687 * To prepare for this printing, we first open the output files
688 * and we close them after generate_gpu has finished.
690 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
691 const char *input)
693 struct cuda_info cuda;
694 int r;
696 cuda_open_files(&cuda, input);
698 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
700 cuda_close_files(&cuda);
702 return r;