ppcg_kernel_requires_array_argument: check if kernel acceses global memory
[ppcg.git] / cuda.c
blobf362b0a7092b957ded2dc5bd6aefd1e36a0169d2
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
36 " } while(0)\n\n";
38 p = isl_printer_print_str(p, macros);
39 return p;
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 struct gpu_array_info *array)
47 int i;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, array->type);
51 p = isl_printer_print_str(p, " ");
52 if (!array->linearize && array->n_index > 1)
53 p = isl_printer_print_str(p, "(");
54 p = isl_printer_print_str(p, "*dev_");
55 p = isl_printer_print_str(p, array->name);
56 if (!array->linearize && array->n_index > 1) {
57 p = isl_printer_print_str(p, ")");
58 for (i = 1; i < array->n_index; i++) {
59 p = isl_printer_print_str(p, "[");
60 p = isl_printer_print_pw_aff(p, array->bound[i]);
61 p = isl_printer_print_str(p, "]");
64 p = isl_printer_print_str(p, ";");
65 p = isl_printer_end_line(p);
67 return p;
70 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
71 struct gpu_prog *prog)
73 int i;
75 for (i = 0; i < prog->n_array; ++i) {
76 if (gpu_array_is_read_only_scalar(&prog->array[i]))
77 continue;
78 if (!prog->array[i].accessed)
79 continue;
81 p = declare_device_array(p, &prog->array[i]);
83 p = isl_printer_start_line(p);
84 p = isl_printer_end_line(p);
85 return p;
88 static __isl_give isl_printer *allocate_device_arrays(
89 __isl_take isl_printer *p, struct gpu_prog *prog)
91 int i;
93 for (i = 0; i < prog->n_array; ++i) {
94 if (gpu_array_is_read_only_scalar(&prog->array[i]))
95 continue;
96 if (!prog->array[i].accessed)
97 continue;
98 p = isl_printer_start_line(p);
99 p = isl_printer_print_str(p,
100 "cudaCheckReturn(cudaMalloc((void **) &dev_");
101 p = isl_printer_print_str(p, prog->array[i].name);
102 p = isl_printer_print_str(p, ", ");
103 p = gpu_array_info_print_size(p, &prog->array[i]);
104 p = isl_printer_print_str(p, "));");
105 p = isl_printer_end_line(p);
107 p = isl_printer_start_line(p);
108 p = isl_printer_end_line(p);
109 return p;
112 /* Print code to "p" for copying "array" from the host to the device
113 * in its entirety. The bounds on the extent of "array" have
114 * been precomputed in extract_array_info and are used in
115 * gpu_array_info_print_size.
117 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
118 struct gpu_array_info *array)
120 p = isl_printer_start_line(p);
121 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
122 p = isl_printer_print_str(p, array->name);
123 p = isl_printer_print_str(p, ", ");
125 if (gpu_array_is_scalar(array))
126 p = isl_printer_print_str(p, "&");
127 p = isl_printer_print_str(p, array->name);
128 p = isl_printer_print_str(p, ", ");
130 p = gpu_array_info_print_size(p, array);
131 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
132 p = isl_printer_end_line(p);
134 return p;
137 /* Print code to "p" for copying "array" back from the device to the host
138 * in its entirety. The bounds on the extent of "array" have
139 * been precomputed in extract_array_info and are used in
140 * gpu_array_info_print_size.
142 static __isl_give isl_printer *copy_array_from_device(
143 __isl_take isl_printer *p, struct gpu_array_info *array)
145 p = isl_printer_start_line(p);
146 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
147 if (gpu_array_is_scalar(array))
148 p = isl_printer_print_str(p, "&");
149 p = isl_printer_print_str(p, array->name);
150 p = isl_printer_print_str(p, ", dev_");
151 p = isl_printer_print_str(p, array->name);
152 p = isl_printer_print_str(p, ", ");
153 p = gpu_array_info_print_size(p, array);
154 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
155 p = isl_printer_end_line(p);
157 return p;
160 static void print_reverse_list(FILE *out, int len, int *list)
162 int i;
164 if (len == 0)
165 return;
167 fprintf(out, "(");
168 for (i = 0; i < len; ++i) {
169 if (i)
170 fprintf(out, ", ");
171 fprintf(out, "%d", list[len - 1 - i]);
173 fprintf(out, ")");
176 /* Print the effective grid size as a list of the sizes in each
177 * dimension, from innermost to outermost.
179 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
180 struct ppcg_kernel *kernel)
182 int i;
183 int dim;
185 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
186 if (dim == 0)
187 return p;
189 p = isl_printer_print_str(p, "(");
190 for (i = dim - 1; i >= 0; --i) {
191 isl_pw_aff *bound;
193 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
194 p = isl_printer_print_pw_aff(p, bound);
195 isl_pw_aff_free(bound);
197 if (i > 0)
198 p = isl_printer_print_str(p, ", ");
201 p = isl_printer_print_str(p, ")");
203 return p;
206 /* Print the grid definition.
208 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
209 struct ppcg_kernel *kernel)
211 p = isl_printer_start_line(p);
212 p = isl_printer_print_str(p, "dim3 k");
213 p = isl_printer_print_int(p, kernel->id);
214 p = isl_printer_print_str(p, "_dimGrid");
215 p = print_grid_size(p, kernel);
216 p = isl_printer_print_str(p, ";");
217 p = isl_printer_end_line(p);
219 return p;
222 /* Print the arguments to a kernel declaration or call. If "types" is set,
223 * then print a declaration (including the types of the arguments).
225 * The arguments are printed in the following order
226 * - the arrays accessed by the kernel
227 * - the parameters
228 * - the host loop iterators
230 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
231 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
233 int i, n;
234 int first = 1;
235 unsigned nparam;
236 isl_space *space;
237 const char *type;
239 for (i = 0; i < prog->n_array; ++i) {
240 int required;
242 required = ppcg_kernel_requires_array_argument(kernel, i);
243 if (required < 0)
244 return isl_printer_free(p);
245 if (!required)
246 continue;
248 if (!first)
249 p = isl_printer_print_str(p, ", ");
251 if (types)
252 p = gpu_array_info_print_declaration_argument(p,
253 &prog->array[i], NULL);
254 else
255 p = gpu_array_info_print_call_argument(p,
256 &prog->array[i]);
258 first = 0;
261 space = isl_union_set_get_space(kernel->arrays);
262 nparam = isl_space_dim(space, isl_dim_param);
263 for (i = 0; i < nparam; ++i) {
264 const char *name;
266 name = isl_space_get_dim_name(space, isl_dim_param, i);
268 if (!first)
269 p = isl_printer_print_str(p, ", ");
270 if (types)
271 p = isl_printer_print_str(p, "int ");
272 p = isl_printer_print_str(p, name);
274 first = 0;
276 isl_space_free(space);
278 n = isl_space_dim(kernel->space, isl_dim_set);
279 type = isl_options_get_ast_iterator_type(prog->ctx);
280 for (i = 0; i < n; ++i) {
281 const char *name;
283 if (!first)
284 p = isl_printer_print_str(p, ", ");
285 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
286 if (types) {
287 p = isl_printer_print_str(p, type);
288 p = isl_printer_print_str(p, " ");
290 p = isl_printer_print_str(p, name);
292 first = 0;
295 return p;
298 /* Print the header of the given kernel.
300 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
301 struct gpu_prog *prog, struct ppcg_kernel *kernel)
303 p = isl_printer_start_line(p);
304 p = isl_printer_print_str(p, "__global__ void kernel");
305 p = isl_printer_print_int(p, kernel->id);
306 p = isl_printer_print_str(p, "(");
307 p = print_kernel_arguments(p, prog, kernel, 1);
308 p = isl_printer_print_str(p, ")");
310 return p;
313 /* Print the header of the given kernel to both gen->cuda.kernel_h
314 * and gen->cuda.kernel_c.
316 static void print_kernel_headers(struct gpu_prog *prog,
317 struct ppcg_kernel *kernel, struct cuda_info *cuda)
319 isl_printer *p;
321 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
322 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
323 p = print_kernel_header(p, prog, kernel);
324 p = isl_printer_print_str(p, ";");
325 p = isl_printer_end_line(p);
326 isl_printer_free(p);
328 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
329 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
330 p = print_kernel_header(p, prog, kernel);
331 p = isl_printer_end_line(p);
332 isl_printer_free(p);
335 static void print_indent(FILE *dst, int indent)
337 fprintf(dst, "%*s", indent, "");
340 /* Print a list of iterators of type "type" with names "ids" to "out".
341 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
342 * In particular, the last iterator is assigned the x identifier
343 * (the first in the list of cuda identifiers).
345 static void print_iterators(FILE *out, const char *type,
346 __isl_keep isl_id_list *ids, const char *cuda_dims[])
348 int i, n;
350 n = isl_id_list_n_id(ids);
351 if (n <= 0)
352 return;
353 print_indent(out, 4);
354 fprintf(out, "%s ", type);
355 for (i = 0; i < n; ++i) {
356 isl_id *id;
358 if (i)
359 fprintf(out, ", ");
360 id = isl_id_list_get_id(ids, i);
361 fprintf(out, "%s = %s", isl_id_get_name(id),
362 cuda_dims[n - 1 - i]);
363 isl_id_free(id);
365 fprintf(out, ";\n");
368 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
370 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
371 const char *type;
372 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
373 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
374 "threadIdx.z" };
376 type = isl_options_get_ast_iterator_type(ctx);
378 print_iterators(out, type, kernel->block_ids, block_dims);
379 print_iterators(out, type, kernel->thread_ids, thread_dims);
382 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
383 struct ppcg_kernel_var *var)
385 int j;
387 p = isl_printer_start_line(p);
388 if (var->type == ppcg_access_shared)
389 p = isl_printer_print_str(p, "__shared__ ");
390 p = isl_printer_print_str(p, var->array->type);
391 p = isl_printer_print_str(p, " ");
392 p = isl_printer_print_str(p, var->name);
393 for (j = 0; j < var->array->n_index; ++j) {
394 isl_val *v;
396 p = isl_printer_print_str(p, "[");
397 v = isl_vec_get_element_val(var->size, j);
398 p = isl_printer_print_val(p, v);
399 isl_val_free(v);
400 p = isl_printer_print_str(p, "]");
402 p = isl_printer_print_str(p, ";");
403 p = isl_printer_end_line(p);
405 return p;
408 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
409 struct ppcg_kernel *kernel)
411 int i;
413 for (i = 0; i < kernel->n_var; ++i)
414 p = print_kernel_var(p, &kernel->var[i]);
416 return p;
419 /* Print a sync statement.
421 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
422 struct ppcg_kernel_stmt *stmt)
424 p = isl_printer_start_line(p);
425 p = isl_printer_print_str(p, "__syncthreads();");
426 p = isl_printer_end_line(p);
428 return p;
431 /* This function is called for each user statement in the AST,
432 * i.e., for each kernel body statement, copy statement or sync statement.
434 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
435 __isl_take isl_ast_print_options *print_options,
436 __isl_keep isl_ast_node *node, void *user)
438 isl_id *id;
439 struct ppcg_kernel_stmt *stmt;
441 id = isl_ast_node_get_annotation(node);
442 stmt = isl_id_get_user(id);
443 isl_id_free(id);
445 isl_ast_print_options_free(print_options);
447 switch (stmt->type) {
448 case ppcg_kernel_copy:
449 return ppcg_kernel_print_copy(p, stmt);
450 case ppcg_kernel_sync:
451 return print_sync(p, stmt);
452 case ppcg_kernel_domain:
453 return ppcg_kernel_print_domain(p, stmt);
456 return p;
459 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
460 struct cuda_info *cuda)
462 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
463 isl_ast_print_options *print_options;
464 isl_printer *p;
466 print_kernel_headers(prog, kernel, cuda);
467 fprintf(cuda->kernel_c, "{\n");
468 print_kernel_iterators(cuda->kernel_c, kernel);
470 p = isl_printer_to_file(ctx, cuda->kernel_c);
471 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
472 p = isl_printer_indent(p, 4);
474 p = print_kernel_vars(p, kernel);
475 p = isl_printer_end_line(p);
476 p = gpu_print_macros(p, kernel->tree);
478 print_options = isl_ast_print_options_alloc(ctx);
479 print_options = isl_ast_print_options_set_print_user(print_options,
480 &print_kernel_stmt, NULL);
481 p = isl_ast_node_print(kernel->tree, p, print_options);
482 isl_printer_free(p);
484 fprintf(cuda->kernel_c, "}\n");
487 /* Print a statement for copying an array to or from the device.
488 * The statement identifier is called "to_device_<array name>" or
489 * "from_device_<array name>" and its user pointer points
490 * to the gpu_array_info of the array that needs to be copied.
492 * Extract the array from the identifier and call
493 * copy_array_to_device or copy_array_from_device.
495 static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
496 __isl_keep isl_ast_node *node, struct gpu_prog *prog)
498 isl_ast_expr *expr, *arg;
499 isl_id *id;
500 const char *name;
501 struct gpu_array_info *array;
503 expr = isl_ast_node_user_get_expr(node);
504 arg = isl_ast_expr_get_op_arg(expr, 0);
505 id = isl_ast_expr_get_id(arg);
506 name = isl_id_get_name(id);
507 array = isl_id_get_user(id);
508 isl_id_free(id);
509 isl_ast_expr_free(arg);
510 isl_ast_expr_free(expr);
512 if (!name)
513 array = NULL;
514 if (!array)
515 return isl_printer_free(p);
517 if (!prefixcmp(name, "to_device"))
518 return copy_array_to_device(p, array);
519 else
520 return copy_array_from_device(p, array);
523 struct print_host_user_data {
524 struct cuda_info *cuda;
525 struct gpu_prog *prog;
528 /* Print the user statement of the host code to "p".
530 * The host code may contain original user statements, kernel launches and
531 * statements that copy data to/from the device.
532 * The original user statements and the kernel launches have
533 * an associated annotation, while the data copy statements do not.
534 * The latter are handled by print_to_from_device.
535 * The annotation on the user statements is called "user".
537 * In case of a kernel launch, print a block of statements that
538 * defines the grid and the block and then launches the kernel.
540 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
541 __isl_take isl_ast_print_options *print_options,
542 __isl_keep isl_ast_node *node, void *user)
544 isl_id *id;
545 int is_user;
546 struct ppcg_kernel *kernel;
547 struct ppcg_kernel_stmt *stmt;
548 struct print_host_user_data *data;
550 isl_ast_print_options_free(print_options);
552 data = (struct print_host_user_data *) user;
554 id = isl_ast_node_get_annotation(node);
555 if (!id)
556 return print_to_from_device(p, node, data->prog);
558 is_user = !strcmp(isl_id_get_name(id), "user");
559 kernel = is_user ? NULL : isl_id_get_user(id);
560 stmt = is_user ? isl_id_get_user(id) : NULL;
561 isl_id_free(id);
563 if (is_user)
564 return ppcg_kernel_print_domain(p, stmt);
566 p = isl_printer_start_line(p);
567 p = isl_printer_print_str(p, "{");
568 p = isl_printer_end_line(p);
569 p = isl_printer_indent(p, 2);
571 p = isl_printer_start_line(p);
572 p = isl_printer_print_str(p, "dim3 k");
573 p = isl_printer_print_int(p, kernel->id);
574 p = isl_printer_print_str(p, "_dimBlock");
575 print_reverse_list(isl_printer_get_file(p),
576 kernel->n_block, kernel->block_dim);
577 p = isl_printer_print_str(p, ";");
578 p = isl_printer_end_line(p);
580 p = print_grid(p, kernel);
582 p = isl_printer_start_line(p);
583 p = isl_printer_print_str(p, "kernel");
584 p = isl_printer_print_int(p, kernel->id);
585 p = isl_printer_print_str(p, " <<<k");
586 p = isl_printer_print_int(p, kernel->id);
587 p = isl_printer_print_str(p, "_dimGrid, k");
588 p = isl_printer_print_int(p, kernel->id);
589 p = isl_printer_print_str(p, "_dimBlock>>> (");
590 p = print_kernel_arguments(p, data->prog, kernel, 0);
591 p = isl_printer_print_str(p, ");");
592 p = isl_printer_end_line(p);
594 p = isl_printer_start_line(p);
595 p = isl_printer_print_str(p, "cudaCheckKernel();");
596 p = isl_printer_end_line(p);
598 p = isl_printer_indent(p, -2);
599 p = isl_printer_start_line(p);
600 p = isl_printer_print_str(p, "}");
601 p = isl_printer_end_line(p);
603 p = isl_printer_start_line(p);
604 p = isl_printer_end_line(p);
606 print_kernel(data->prog, kernel, data->cuda);
608 return p;
611 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
612 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
613 struct cuda_info *cuda)
615 isl_ast_print_options *print_options;
616 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
617 struct print_host_user_data data = { cuda, prog };
619 print_options = isl_ast_print_options_alloc(ctx);
620 print_options = isl_ast_print_options_set_print_user(print_options,
621 &print_host_user, &data);
623 p = gpu_print_macros(p, tree);
624 p = isl_ast_node_print(tree, p, print_options);
626 return p;
629 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
630 struct gpu_prog *prog)
632 int i;
634 for (i = 0; i < prog->n_array; ++i) {
635 if (gpu_array_is_read_only_scalar(&prog->array[i]))
636 continue;
637 if (!prog->array[i].accessed)
638 continue;
639 p = isl_printer_start_line(p);
640 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
641 p = isl_printer_print_str(p, prog->array[i].name);
642 p = isl_printer_print_str(p, "));");
643 p = isl_printer_end_line(p);
646 return p;
649 /* Given a gpu_prog "prog" and the corresponding transformed AST
650 * "tree", print the entire CUDA code to "p".
651 * "types" collects the types for which a definition has already
652 * been printed.
654 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
655 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
656 struct gpu_types *types, void *user)
658 struct cuda_info *cuda = user;
659 isl_printer *kernel;
661 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
662 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
663 kernel = gpu_print_types(kernel, types, prog);
664 isl_printer_free(kernel);
666 if (!kernel)
667 return isl_printer_free(p);
669 p = ppcg_start_block(p);
671 p = print_cuda_macros(p);
673 p = gpu_print_local_declarations(p, prog);
674 p = declare_device_arrays(p, prog);
675 p = allocate_device_arrays(p, prog);
677 p = print_host_code(p, prog, tree, cuda);
679 p = free_device_arrays(p, prog);
681 p = ppcg_end_block(p);
683 return p;
686 /* Transform the code in the file called "input" by replacing
687 * all scops by corresponding CUDA code.
688 * The names of the output files are derived from "input".
690 * We let generate_gpu do all the hard work and then let it call
691 * us back for printing the AST in print_cuda.
693 * To prepare for this printing, we first open the output files
694 * and we close them after generate_gpu has finished.
696 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
697 const char *input)
699 struct cuda_info cuda;
700 int r;
702 cuda_open_files(&cuda, input);
704 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
706 cuda_close_files(&cuda);
708 return r;