gpu: store grid size in ppcg_kernel
[ppcg.git] / cuda.c
blob47cadc97f677022e0cb14e33ba42da4e9fa05b23
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 /* Print a declaration for the device array corresponding to "array" on "p".
43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
44 struct gpu_array_info *array)
46 int i;
48 p = isl_printer_start_line(p);
49 p = isl_printer_print_str(p, array->type);
50 p = isl_printer_print_str(p, " ");
51 if (!array->linearize && array->n_index > 1)
52 p = isl_printer_print_str(p, "(");
53 p = isl_printer_print_str(p, "*dev_");
54 p = isl_printer_print_str(p, array->name);
55 if (!array->linearize && array->n_index > 1) {
56 p = isl_printer_print_str(p, ")");
57 for (i = 1; i < array->n_index; i++) {
58 p = isl_printer_print_str(p, "[");
59 p = isl_printer_print_pw_aff(p, array->bound[i]);
60 p = isl_printer_print_str(p, "]");
63 p = isl_printer_print_str(p, ";");
64 p = isl_printer_end_line(p);
66 return p;
69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
70 struct gpu_prog *prog)
72 int i;
74 for (i = 0; i < prog->n_array; ++i) {
75 if (gpu_array_is_read_only_scalar(&prog->array[i]))
76 continue;
77 if (!prog->array[i].accessed)
78 continue;
80 p = declare_device_array(p, &prog->array[i]);
82 p = isl_printer_start_line(p);
83 p = isl_printer_end_line(p);
84 return p;
87 static __isl_give isl_printer *allocate_device_arrays(
88 __isl_take isl_printer *p, struct gpu_prog *prog)
90 int i;
92 for (i = 0; i < prog->n_array; ++i) {
93 if (gpu_array_is_read_only_scalar(&prog->array[i]))
94 continue;
95 if (!prog->array[i].accessed)
96 continue;
97 p = isl_printer_start_line(p);
98 p = isl_printer_print_str(p,
99 "cudaCheckReturn(cudaMalloc((void **) &dev_");
100 p = isl_printer_print_str(p, prog->array[i].name);
101 p = isl_printer_print_str(p, ", ");
102 p = gpu_array_info_print_size(p, &prog->array[i]);
103 p = isl_printer_print_str(p, "));");
104 p = isl_printer_end_line(p);
106 p = isl_printer_start_line(p);
107 p = isl_printer_end_line(p);
108 return p;
111 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
112 struct gpu_prog *prog)
114 int i;
116 for (i = 0; i < prog->n_array; ++i) {
117 isl_space *space;
118 isl_set *read_i;
119 int empty;
121 if (gpu_array_is_read_only_scalar(&prog->array[i]))
122 continue;
124 space = isl_space_copy(prog->array[i].space);
125 read_i = isl_union_set_extract_set(prog->copy_in, space);
126 empty = isl_set_plain_is_empty(read_i);
127 isl_set_free(read_i);
128 if (empty)
129 continue;
131 p = isl_printer_start_line(p);
132 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
133 p = isl_printer_print_str(p, prog->array[i].name);
134 p = isl_printer_print_str(p, ", ");
136 if (gpu_array_is_scalar(&prog->array[i]))
137 p = isl_printer_print_str(p, "&");
138 p = isl_printer_print_str(p, prog->array[i].name);
139 p = isl_printer_print_str(p, ", ");
141 p = gpu_array_info_print_size(p, &prog->array[i]);
142 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
143 p = isl_printer_end_line(p);
145 p = isl_printer_start_line(p);
146 p = isl_printer_end_line(p);
147 return p;
150 static void print_reverse_list(FILE *out, int len, int *list)
152 int i;
154 if (len == 0)
155 return;
157 fprintf(out, "(");
158 for (i = 0; i < len; ++i) {
159 if (i)
160 fprintf(out, ", ");
161 fprintf(out, "%d", list[len - 1 - i]);
163 fprintf(out, ")");
166 /* Print the effective grid size as a list of the sizes in each
167 * dimension, from innermost to outermost.
169 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
170 struct ppcg_kernel *kernel)
172 int i;
173 int dim;
175 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
176 if (dim == 0)
177 return p;
179 p = isl_printer_print_str(p, "(");
180 for (i = dim - 1; i >= 0; --i) {
181 isl_pw_aff *bound;
183 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
184 p = isl_printer_print_pw_aff(p, bound);
185 isl_pw_aff_free(bound);
187 if (i > 0)
188 p = isl_printer_print_str(p, ", ");
191 p = isl_printer_print_str(p, ")");
193 return p;
196 /* Print the grid definition.
198 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
199 struct ppcg_kernel *kernel)
201 p = isl_printer_start_line(p);
202 p = isl_printer_print_str(p, "dim3 k");
203 p = isl_printer_print_int(p, kernel->id);
204 p = isl_printer_print_str(p, "_dimGrid");
205 p = print_grid_size(p, kernel);
206 p = isl_printer_print_str(p, ";");
207 p = isl_printer_end_line(p);
209 return p;
212 /* Print the arguments to a kernel declaration or call. If "types" is set,
213 * then print a declaration (including the types of the arguments).
215 * The arguments are printed in the following order
216 * - the arrays accessed by the kernel
217 * - the parameters
218 * - the host loop iterators
220 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
221 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
223 int i, n;
224 int first = 1;
225 unsigned nparam;
226 isl_space *space;
227 const char *type;
229 for (i = 0; i < prog->n_array; ++i) {
230 isl_set *arr;
231 int empty;
233 space = isl_space_copy(prog->array[i].space);
234 arr = isl_union_set_extract_set(kernel->arrays, space);
235 empty = isl_set_plain_is_empty(arr);
236 isl_set_free(arr);
237 if (empty)
238 continue;
240 if (!first)
241 p = isl_printer_print_str(p, ", ");
243 if (types)
244 p = gpu_array_info_print_declaration_argument(p,
245 &prog->array[i], NULL);
246 else
247 p = gpu_array_info_print_call_argument(p,
248 &prog->array[i]);
250 first = 0;
253 space = isl_union_set_get_space(kernel->arrays);
254 nparam = isl_space_dim(space, isl_dim_param);
255 for (i = 0; i < nparam; ++i) {
256 const char *name;
258 name = isl_space_get_dim_name(space, isl_dim_param, i);
260 if (!first)
261 p = isl_printer_print_str(p, ", ");
262 if (types)
263 p = isl_printer_print_str(p, "int ");
264 p = isl_printer_print_str(p, name);
266 first = 0;
268 isl_space_free(space);
270 n = isl_space_dim(kernel->space, isl_dim_set);
271 type = isl_options_get_ast_iterator_type(prog->ctx);
272 for (i = 0; i < n; ++i) {
273 const char *name;
275 if (!first)
276 p = isl_printer_print_str(p, ", ");
277 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
278 if (types) {
279 p = isl_printer_print_str(p, type);
280 p = isl_printer_print_str(p, " ");
282 p = isl_printer_print_str(p, name);
284 first = 0;
287 return p;
290 /* Print the header of the given kernel.
292 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
293 struct gpu_prog *prog, struct ppcg_kernel *kernel)
295 p = isl_printer_start_line(p);
296 p = isl_printer_print_str(p, "__global__ void kernel");
297 p = isl_printer_print_int(p, kernel->id);
298 p = isl_printer_print_str(p, "(");
299 p = print_kernel_arguments(p, prog, kernel, 1);
300 p = isl_printer_print_str(p, ")");
302 return p;
305 /* Print the header of the given kernel to both gen->cuda.kernel_h
306 * and gen->cuda.kernel_c.
308 static void print_kernel_headers(struct gpu_prog *prog,
309 struct ppcg_kernel *kernel, struct cuda_info *cuda)
311 isl_printer *p;
313 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
314 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
315 p = print_kernel_header(p, prog, kernel);
316 p = isl_printer_print_str(p, ";");
317 p = isl_printer_end_line(p);
318 isl_printer_free(p);
320 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
321 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
322 p = print_kernel_header(p, prog, kernel);
323 p = isl_printer_end_line(p);
324 isl_printer_free(p);
327 static void print_indent(FILE *dst, int indent)
329 fprintf(dst, "%*s", indent, "");
332 /* Print a list of iterators of type "type" with names "ids" to "out".
333 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
334 * In particular, the last iterator is assigned the x identifier
335 * (the first in the list of cuda identifiers).
337 static void print_iterators(FILE *out, const char *type,
338 __isl_keep isl_id_list *ids, const char *cuda_dims[])
340 int i, n;
342 n = isl_id_list_n_id(ids);
343 if (n <= 0)
344 return;
345 print_indent(out, 4);
346 fprintf(out, "%s ", type);
347 for (i = 0; i < n; ++i) {
348 isl_id *id;
350 if (i)
351 fprintf(out, ", ");
352 id = isl_id_list_get_id(ids, i);
353 fprintf(out, "%s = %s", isl_id_get_name(id),
354 cuda_dims[n - 1 - i]);
355 isl_id_free(id);
357 fprintf(out, ";\n");
360 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
362 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
363 const char *type;
364 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
365 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
366 "threadIdx.z" };
368 type = isl_options_get_ast_iterator_type(ctx);
370 print_iterators(out, type, kernel->block_ids, block_dims);
371 print_iterators(out, type, kernel->thread_ids, thread_dims);
374 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
375 struct ppcg_kernel_var *var)
377 int j;
379 p = isl_printer_start_line(p);
380 if (var->type == ppcg_access_shared)
381 p = isl_printer_print_str(p, "__shared__ ");
382 p = isl_printer_print_str(p, var->array->type);
383 p = isl_printer_print_str(p, " ");
384 p = isl_printer_print_str(p, var->name);
385 for (j = 0; j < var->array->n_index; ++j) {
386 isl_val *v;
388 p = isl_printer_print_str(p, "[");
389 v = isl_vec_get_element_val(var->size, j);
390 p = isl_printer_print_val(p, v);
391 isl_val_free(v);
392 p = isl_printer_print_str(p, "]");
394 p = isl_printer_print_str(p, ";");
395 p = isl_printer_end_line(p);
397 return p;
400 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
401 struct ppcg_kernel *kernel)
403 int i;
405 for (i = 0; i < kernel->n_var; ++i)
406 p = print_kernel_var(p, &kernel->var[i]);
408 return p;
411 /* Print a sync statement.
413 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
414 struct ppcg_kernel_stmt *stmt)
416 p = isl_printer_start_line(p);
417 p = isl_printer_print_str(p, "__syncthreads();");
418 p = isl_printer_end_line(p);
420 return p;
423 /* This function is called for each user statement in the AST,
424 * i.e., for each kernel body statement, copy statement or sync statement.
426 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
427 __isl_take isl_ast_print_options *print_options,
428 __isl_keep isl_ast_node *node, void *user)
430 isl_id *id;
431 struct ppcg_kernel_stmt *stmt;
433 id = isl_ast_node_get_annotation(node);
434 stmt = isl_id_get_user(id);
435 isl_id_free(id);
437 isl_ast_print_options_free(print_options);
439 switch (stmt->type) {
440 case ppcg_kernel_copy:
441 return ppcg_kernel_print_copy(p, stmt);
442 case ppcg_kernel_sync:
443 return print_sync(p, stmt);
444 case ppcg_kernel_domain:
445 return ppcg_kernel_print_domain(p, stmt);
448 return p;
451 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
452 struct cuda_info *cuda)
454 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
455 isl_ast_print_options *print_options;
456 isl_printer *p;
458 print_kernel_headers(prog, kernel, cuda);
459 fprintf(cuda->kernel_c, "{\n");
460 print_kernel_iterators(cuda->kernel_c, kernel);
462 p = isl_printer_to_file(ctx, cuda->kernel_c);
463 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
464 p = isl_printer_indent(p, 4);
466 p = print_kernel_vars(p, kernel);
467 p = isl_printer_end_line(p);
468 p = gpu_print_macros(p, kernel->tree);
470 print_options = isl_ast_print_options_alloc(ctx);
471 print_options = isl_ast_print_options_set_print_user(print_options,
472 &print_kernel_stmt, NULL);
473 p = isl_ast_node_print(kernel->tree, p, print_options);
474 isl_printer_free(p);
476 fprintf(cuda->kernel_c, "}\n");
479 struct print_host_user_data {
480 struct cuda_info *cuda;
481 struct gpu_prog *prog;
484 /* Print the user statement of the host code to "p".
486 * In particular, print a block of statements that defines the grid
487 * and the block and then launches the kernel.
489 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
490 __isl_take isl_ast_print_options *print_options,
491 __isl_keep isl_ast_node *node, void *user)
493 isl_id *id;
494 struct ppcg_kernel *kernel;
495 struct print_host_user_data *data;
497 id = isl_ast_node_get_annotation(node);
498 kernel = isl_id_get_user(id);
499 isl_id_free(id);
501 data = (struct print_host_user_data *) user;
503 p = isl_printer_start_line(p);
504 p = isl_printer_print_str(p, "{");
505 p = isl_printer_end_line(p);
506 p = isl_printer_indent(p, 2);
508 p = isl_printer_start_line(p);
509 p = isl_printer_print_str(p, "dim3 k");
510 p = isl_printer_print_int(p, kernel->id);
511 p = isl_printer_print_str(p, "_dimBlock");
512 print_reverse_list(isl_printer_get_file(p),
513 kernel->n_block, kernel->block_dim);
514 p = isl_printer_print_str(p, ";");
515 p = isl_printer_end_line(p);
517 p = print_grid(p, kernel);
519 p = isl_printer_start_line(p);
520 p = isl_printer_print_str(p, "kernel");
521 p = isl_printer_print_int(p, kernel->id);
522 p = isl_printer_print_str(p, " <<<k");
523 p = isl_printer_print_int(p, kernel->id);
524 p = isl_printer_print_str(p, "_dimGrid, k");
525 p = isl_printer_print_int(p, kernel->id);
526 p = isl_printer_print_str(p, "_dimBlock>>> (");
527 p = print_kernel_arguments(p, data->prog, kernel, 0);
528 p = isl_printer_print_str(p, ");");
529 p = isl_printer_end_line(p);
531 p = isl_printer_start_line(p);
532 p = isl_printer_print_str(p, "cudaCheckKernel();");
533 p = isl_printer_end_line(p);
535 p = isl_printer_indent(p, -2);
536 p = isl_printer_start_line(p);
537 p = isl_printer_print_str(p, "}");
538 p = isl_printer_end_line(p);
540 p = isl_printer_start_line(p);
541 p = isl_printer_end_line(p);
543 print_kernel(data->prog, kernel, data->cuda);
545 isl_ast_print_options_free(print_options);
547 return p;
550 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
551 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
552 struct cuda_info *cuda)
554 isl_ast_print_options *print_options;
555 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
556 struct print_host_user_data data = { cuda, prog };
558 print_options = isl_ast_print_options_alloc(ctx);
559 print_options = isl_ast_print_options_set_print_user(print_options,
560 &print_host_user, &data);
562 p = gpu_print_macros(p, tree);
563 p = isl_ast_node_print(tree, p, print_options);
565 return p;
568 /* For each array that needs to be copied out (based on prog->copy_out),
569 * copy the contents back from the GPU to the host.
571 * If any element of a given array appears in prog->copy_out, then its
572 * entire extent is in prog->copy_out. The bounds on this extent have
573 * been precomputed in extract_array_info and are used in
574 * gpu_array_info_print_size.
576 static __isl_give isl_printer *copy_arrays_from_device(
577 __isl_take isl_printer *p, struct gpu_prog *prog)
579 int i;
580 isl_union_set *copy_out;
581 copy_out = isl_union_set_copy(prog->copy_out);
583 for (i = 0; i < prog->n_array; ++i) {
584 isl_space *space;
585 isl_set *copy_out_i;
586 int empty;
588 space = isl_space_copy(prog->array[i].space);
589 copy_out_i = isl_union_set_extract_set(copy_out, space);
590 empty = isl_set_plain_is_empty(copy_out_i);
591 isl_set_free(copy_out_i);
592 if (empty)
593 continue;
595 p = isl_printer_start_line(p);
596 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
597 if (gpu_array_is_scalar(&prog->array[i]))
598 p = isl_printer_print_str(p, "&");
599 p = isl_printer_print_str(p, prog->array[i].name);
600 p = isl_printer_print_str(p, ", dev_");
601 p = isl_printer_print_str(p, prog->array[i].name);
602 p = isl_printer_print_str(p, ", ");
603 p = gpu_array_info_print_size(p, &prog->array[i]);
604 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
605 p = isl_printer_end_line(p);
608 isl_union_set_free(copy_out);
609 p = isl_printer_start_line(p);
610 p = isl_printer_end_line(p);
611 return p;
614 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
615 struct gpu_prog *prog)
617 int i;
619 for (i = 0; i < prog->n_array; ++i) {
620 if (gpu_array_is_read_only_scalar(&prog->array[i]))
621 continue;
622 if (!prog->array[i].accessed)
623 continue;
624 p = isl_printer_start_line(p);
625 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
626 p = isl_printer_print_str(p, prog->array[i].name);
627 p = isl_printer_print_str(p, "));");
628 p = isl_printer_end_line(p);
631 return p;
634 /* Given a gpu_prog "prog" and the corresponding transformed AST
635 * "tree", print the entire CUDA code to "p".
636 * "types" collects the types for which a definition has already
637 * been printed.
639 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
640 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
641 struct gpu_types *types, void *user)
643 struct cuda_info *cuda = user;
644 isl_printer *kernel;
646 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
647 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
648 kernel = gpu_print_types(kernel, types, prog);
649 isl_printer_free(kernel);
651 if (!kernel)
652 return isl_printer_free(p);
654 p = ppcg_start_block(p);
656 p = print_cuda_macros(p);
658 p = declare_device_arrays(p, prog);
659 p = allocate_device_arrays(p, prog);
660 p = copy_arrays_to_device(p, prog);
662 p = print_host_code(p, prog, tree, cuda);
664 p = copy_arrays_from_device(p, prog);
665 p = free_device_arrays(p, prog);
667 p = ppcg_end_block(p);
669 return p;
672 /* Transform the code in the file called "input" by replacing
673 * all scops by corresponding CUDA code.
674 * The names of the output files are derived from "input".
676 * We let generate_gpu do all the hard work and then let it call
677 * us back for printing the AST in print_cuda.
679 * To prepare for this printing, we first open the output files
680 * and we close them after generate_gpu has finished.
682 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
683 const char *input)
685 struct cuda_info cuda;
686 int r;
688 cuda_open_files(&cuda, input);
690 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
692 cuda_close_files(&cuda);
694 return r;