cuda.c: print_kernel_var: use isl_val
[ppcg.git] / cuda.c
blob0797eeb3f4f4e860b1f02d3c3666c69727045ab4
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
42 struct gpu_prog *prog)
44 int i;
46 for (i = 0; i < prog->n_array; ++i) {
47 if (gpu_array_is_read_only_scalar(&prog->array[i]))
48 continue;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, prog->array[i].type);
51 p = isl_printer_print_str(p, " *dev_");
52 p = isl_printer_print_str(p, prog->array[i].name);
53 p = isl_printer_print_str(p, ";");
54 p = isl_printer_end_line(p);
56 p = isl_printer_start_line(p);
57 p = isl_printer_end_line(p);
58 return p;
61 static __isl_give isl_printer *allocate_device_arrays(
62 __isl_take isl_printer *p, struct gpu_prog *prog)
64 int i;
66 for (i = 0; i < prog->n_array; ++i) {
67 if (gpu_array_is_read_only_scalar(&prog->array[i]))
68 continue;
69 p = isl_printer_start_line(p);
70 p = isl_printer_print_str(p,
71 "cudaCheckReturn(cudaMalloc((void **) &dev_");
72 p = isl_printer_print_str(p, prog->array[i].name);
73 p = isl_printer_print_str(p, ", ");
74 p = gpu_array_info_print_size(p, &prog->array[i]);
75 p = isl_printer_print_str(p, "));");
76 p = isl_printer_end_line(p);
78 p = isl_printer_start_line(p);
79 p = isl_printer_end_line(p);
80 return p;
83 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
84 struct gpu_prog *prog)
86 int i;
88 for (i = 0; i < prog->n_array; ++i) {
89 isl_space *dim;
90 isl_set *read_i;
91 int empty;
93 if (gpu_array_is_read_only_scalar(&prog->array[i]))
94 continue;
96 dim = isl_space_copy(prog->array[i].dim);
97 read_i = isl_union_set_extract_set(prog->copy_in, dim);
98 empty = isl_set_fast_is_empty(read_i);
99 isl_set_free(read_i);
100 if (empty)
101 continue;
103 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
104 p = isl_printer_print_str(p, prog->array[i].name);
105 p = isl_printer_print_str(p, ", ");
107 if (gpu_array_is_scalar(&prog->array[i]))
108 p = isl_printer_print_str(p, "&");
109 p = isl_printer_print_str(p, prog->array[i].name);
110 p = isl_printer_print_str(p, ", ");
112 p = gpu_array_info_print_size(p, &prog->array[i]);
113 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
114 p = isl_printer_end_line(p);
116 p = isl_printer_start_line(p);
117 p = isl_printer_end_line(p);
118 return p;
121 static void print_reverse_list(FILE *out, int len, int *list)
123 int i;
125 if (len == 0)
126 return;
128 fprintf(out, "(");
129 for (i = 0; i < len; ++i) {
130 if (i)
131 fprintf(out, ", ");
132 fprintf(out, "%d", list[len - 1 - i]);
134 fprintf(out, ")");
137 /* Print the effective grid size as a list of the sizes in each
138 * dimension, from innermost to outermost.
140 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
141 struct ppcg_kernel *kernel)
143 int i;
144 int dim;
146 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
147 if (dim == 0)
148 return p;
150 p = isl_printer_print_str(p, "(");
151 for (i = dim - 1; i >= 0; --i) {
152 isl_pw_aff *bound;
154 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
155 p = isl_printer_print_pw_aff(p, bound);
156 isl_pw_aff_free(bound);
158 if (i > 0)
159 p = isl_printer_print_str(p, ", ");
162 p = isl_printer_print_str(p, ")");
164 return p;
167 /* Print the grid definition.
169 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
170 struct ppcg_kernel *kernel)
172 p = isl_printer_start_line(p);
173 p = isl_printer_print_str(p, "dim3 k");
174 p = isl_printer_print_int(p, kernel->id);
175 p = isl_printer_print_str(p, "_dimGrid");
176 p = print_grid_size(p, kernel);
177 p = isl_printer_print_str(p, ";");
178 p = isl_printer_end_line(p);
180 return p;
183 /* Print the arguments to a kernel declaration or call. If "types" is set,
184 * then print a declaration (including the types of the arguments).
186 * The arguments are printed in the following order
187 * - the arrays accessed by the kernel
188 * - the parameters
189 * - the host loop iterators
191 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
192 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
194 int i, n;
195 int first = 1;
196 unsigned nparam;
197 isl_space *space;
198 const char *type;
200 for (i = 0; i < prog->n_array; ++i) {
201 isl_set *arr;
202 int empty;
204 space = isl_space_copy(prog->array[i].dim);
205 arr = isl_union_set_extract_set(kernel->arrays, space);
206 empty = isl_set_fast_is_empty(arr);
207 isl_set_free(arr);
208 if (empty)
209 continue;
211 if (!first)
212 p = isl_printer_print_str(p, ", ");
214 if (types) {
215 p = isl_printer_print_str(p, prog->array[i].type);
216 p = isl_printer_print_str(p, " ");
219 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
220 p = isl_printer_print_str(p, prog->array[i].name);
221 } else {
222 if (types)
223 p = isl_printer_print_str(p, "*");
224 else
225 p = isl_printer_print_str(p, "dev_");
226 p = isl_printer_print_str(p, prog->array[i].name);
229 first = 0;
232 space = isl_union_set_get_space(kernel->arrays);
233 nparam = isl_space_dim(space, isl_dim_param);
234 for (i = 0; i < nparam; ++i) {
235 const char *name;
237 name = isl_space_get_dim_name(space, isl_dim_param, i);
239 if (!first)
240 p = isl_printer_print_str(p, ", ");
241 if (types)
242 p = isl_printer_print_str(p, "int ");
243 p = isl_printer_print_str(p, name);
245 first = 0;
247 isl_space_free(space);
249 n = isl_space_dim(kernel->space, isl_dim_set);
250 type = isl_options_get_ast_iterator_type(prog->ctx);
251 for (i = 0; i < n; ++i) {
252 const char *name;
253 isl_id *id;
255 if (!first)
256 p = isl_printer_print_str(p, ", ");
257 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
258 if (types) {
259 p = isl_printer_print_str(p, type);
260 p = isl_printer_print_str(p, " ");
262 p = isl_printer_print_str(p, name);
264 first = 0;
267 return p;
270 /* Print the header of the given kernel.
272 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
273 struct gpu_prog *prog, struct ppcg_kernel *kernel)
275 p = isl_printer_start_line(p);
276 p = isl_printer_print_str(p, "__global__ void kernel");
277 p = isl_printer_print_int(p, kernel->id);
278 p = isl_printer_print_str(p, "(");
279 p = print_kernel_arguments(p, prog, kernel, 1);
280 p = isl_printer_print_str(p, ")");
282 return p;
285 /* Print the header of the given kernel to both gen->cuda.kernel_h
286 * and gen->cuda.kernel_c.
288 static void print_kernel_headers(struct gpu_prog *prog,
289 struct ppcg_kernel *kernel, struct cuda_info *cuda)
291 isl_printer *p;
293 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
294 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
295 p = print_kernel_header(p, prog, kernel);
296 p = isl_printer_print_str(p, ";");
297 p = isl_printer_end_line(p);
298 isl_printer_free(p);
300 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
301 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
302 p = print_kernel_header(p, prog, kernel);
303 p = isl_printer_end_line(p);
304 isl_printer_free(p);
307 static void print_indent(FILE *dst, int indent)
309 fprintf(dst, "%*s", indent, "");
312 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
314 int i;
315 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
316 const char *type;
317 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
318 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
319 "threadIdx.z" };
321 type = isl_options_get_ast_iterator_type(ctx);
323 if (kernel->n_grid > 0) {
324 print_indent(out, 4);
325 fprintf(out, "%s ", type);
326 for (i = 0; i < kernel->n_grid; ++i) {
327 if (i)
328 fprintf(out, ", ");
329 fprintf(out, "b%d = %s",
330 i, block_dims[kernel->n_grid - 1 - i]);
332 fprintf(out, ";\n");
335 if (kernel->n_block > 0) {
336 print_indent(out, 4);
337 fprintf(out, "%s ", type);
338 for (i = 0; i < kernel->n_block; ++i) {
339 if (i)
340 fprintf(out, ", ");
341 fprintf(out, "t%d = %s",
342 i, thread_dims[kernel->n_block - 1 - i]);
344 fprintf(out, ";\n");
348 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
349 struct ppcg_kernel_var *var)
351 int j;
353 p = isl_printer_start_line(p);
354 if (var->type == ppcg_access_shared)
355 p = isl_printer_print_str(p, "__shared__ ");
356 p = isl_printer_print_str(p, var->array->type);
357 p = isl_printer_print_str(p, " ");
358 p = isl_printer_print_str(p, var->name);
359 for (j = 0; j < var->array->n_index; ++j) {
360 isl_val *v;
362 p = isl_printer_print_str(p, "[");
363 v = isl_vec_get_element_val(var->size, j);
364 p = isl_printer_print_val(p, v);
365 isl_val_free(v);
366 p = isl_printer_print_str(p, "]");
368 p = isl_printer_print_str(p, ";");
369 p = isl_printer_end_line(p);
371 return p;
374 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
375 struct ppcg_kernel *kernel)
377 int i;
379 for (i = 0; i < kernel->n_var; ++i)
380 p = print_kernel_var(p, &kernel->var[i]);
382 return p;
385 /* Print a sync statement.
387 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
388 struct ppcg_kernel_stmt *stmt)
390 p = isl_printer_start_line(p);
391 p = isl_printer_print_str(p, "__syncthreads();");
392 p = isl_printer_end_line(p);
394 return p;
397 /* This function is called for each user statement in the AST,
398 * i.e., for each kernel body statement, copy statement or sync statement.
400 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
401 __isl_take isl_ast_print_options *print_options,
402 __isl_keep isl_ast_node *node, void *user)
404 isl_id *id;
405 struct ppcg_kernel_stmt *stmt;
407 id = isl_ast_node_get_annotation(node);
408 stmt = isl_id_get_user(id);
409 isl_id_free(id);
411 isl_ast_print_options_free(print_options);
413 switch (stmt->type) {
414 case ppcg_kernel_copy:
415 return ppcg_kernel_print_copy(p, stmt);
416 case ppcg_kernel_sync:
417 return print_sync(p, stmt);
418 case ppcg_kernel_domain:
419 return ppcg_kernel_print_domain(p, stmt);
422 return p;
425 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
426 struct cuda_info *cuda)
428 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
429 isl_ast_print_options *print_options;
430 isl_printer *p;
432 print_kernel_headers(prog, kernel, cuda);
433 fprintf(cuda->kernel_c, "{\n");
434 print_kernel_iterators(cuda->kernel_c, kernel);
436 p = isl_printer_to_file(ctx, cuda->kernel_c);
437 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
438 p = isl_printer_indent(p, 4);
440 p = print_kernel_vars(p, kernel);
441 p = isl_printer_end_line(p);
442 p = gpu_print_macros(p, kernel->tree);
444 print_options = isl_ast_print_options_alloc(ctx);
445 print_options = isl_ast_print_options_set_print_user(print_options,
446 &print_kernel_stmt, NULL);
447 p = isl_ast_node_print(kernel->tree, p, print_options);
448 isl_printer_free(p);
450 fprintf(cuda->kernel_c, "}\n");
453 struct print_host_user_data {
454 struct cuda_info *cuda;
455 struct gpu_prog *prog;
458 /* Print the user statement of the host code to "p".
460 * In particular, print a block of statements that defines the grid
461 * and the block and then launches the kernel.
463 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
464 __isl_take isl_ast_print_options *print_options,
465 __isl_keep isl_ast_node *node, void *user)
467 isl_id *id;
468 struct ppcg_kernel *kernel;
469 struct print_host_user_data *data;
471 id = isl_ast_node_get_annotation(node);
472 kernel = isl_id_get_user(id);
473 isl_id_free(id);
475 data = (struct print_host_user_data *) user;
477 p = isl_printer_start_line(p);
478 p = isl_printer_print_str(p, "{");
479 p = isl_printer_end_line(p);
480 p = isl_printer_indent(p, 2);
482 p = isl_printer_start_line(p);
483 p = isl_printer_print_str(p, "dim3 k");
484 p = isl_printer_print_int(p, kernel->id);
485 p = isl_printer_print_str(p, "_dimBlock");
486 print_reverse_list(isl_printer_get_file(p),
487 kernel->n_block, kernel->block_dim);
488 p = isl_printer_print_str(p, ";");
489 p = isl_printer_end_line(p);
491 p = print_grid(p, kernel);
493 p = isl_printer_start_line(p);
494 p = isl_printer_print_str(p, "kernel");
495 p = isl_printer_print_int(p, kernel->id);
496 p = isl_printer_print_str(p, " <<<k");
497 p = isl_printer_print_int(p, kernel->id);
498 p = isl_printer_print_str(p, "_dimGrid, k");
499 p = isl_printer_print_int(p, kernel->id);
500 p = isl_printer_print_str(p, "_dimBlock>>> (");
501 p = print_kernel_arguments(p, data->prog, kernel, 0);
502 p = isl_printer_print_str(p, ");");
503 p = isl_printer_end_line(p);
505 p = isl_printer_start_line(p);
506 p = isl_printer_print_str(p, "cudaCheckKernel();");
507 p = isl_printer_end_line(p);
509 p = isl_printer_indent(p, -2);
510 p = isl_printer_start_line(p);
511 p = isl_printer_print_str(p, "}");
512 p = isl_printer_end_line(p);
514 p = isl_printer_start_line(p);
515 p = isl_printer_end_line(p);
517 print_kernel(data->prog, kernel, data->cuda);
519 isl_ast_print_options_free(print_options);
521 return p;
524 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
525 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
526 struct cuda_info *cuda)
528 isl_ast_print_options *print_options;
529 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
530 struct print_host_user_data data = { cuda, prog };
532 print_options = isl_ast_print_options_alloc(ctx);
533 print_options = isl_ast_print_options_set_print_user(print_options,
534 &print_host_user, &data);
536 p = gpu_print_macros(p, tree);
537 p = isl_ast_node_print(tree, p, print_options);
539 return p;
542 /* For each array that needs to be copied out (based on prog->copy_out),
543 * copy the contents back from the GPU to the host.
545 * If any element of a given array appears in prog->copy_out, then its
546 * entire extent is in prog->copy_out. The bounds on this extent have
547 * been precomputed in extract_array_info and are used in
548 * gpu_array_info_print_size.
550 static __isl_give isl_printer *copy_arrays_from_device(
551 __isl_take isl_printer *p, struct gpu_prog *prog)
553 int i;
554 isl_union_set *copy_out;
555 copy_out = isl_union_set_copy(prog->copy_out);
557 for (i = 0; i < prog->n_array; ++i) {
558 isl_space *dim;
559 isl_set *copy_out_i;
560 int empty;
562 dim = isl_space_copy(prog->array[i].dim);
563 copy_out_i = isl_union_set_extract_set(copy_out, dim);
564 empty = isl_set_fast_is_empty(copy_out_i);
565 isl_set_free(copy_out_i);
566 if (empty)
567 continue;
569 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
570 if (gpu_array_is_scalar(&prog->array[i]))
571 p = isl_printer_print_str(p, "&");
572 p = isl_printer_print_str(p, prog->array[i].name);
573 p = isl_printer_print_str(p, ", dev_");
574 p = isl_printer_print_str(p, prog->array[i].name);
575 p = isl_printer_print_str(p, ", ");
576 p = gpu_array_info_print_size(p, &prog->array[i]);
577 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
578 p = isl_printer_end_line(p);
581 isl_union_set_free(copy_out);
582 p = isl_printer_start_line(p);
583 p = isl_printer_end_line(p);
584 return p;
587 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
588 struct gpu_prog *prog)
590 int i;
592 for (i = 0; i < prog->n_array; ++i) {
593 if (gpu_array_is_read_only_scalar(&prog->array[i]))
594 continue;
595 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
596 p = isl_printer_print_str(p, prog->array[i].name);
597 p = isl_printer_print_str(p, "));");
598 p = isl_printer_end_line(p);
601 return p;
604 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
605 struct ppcg_options *options, const char *input)
607 struct cuda_info cuda;
608 struct gpu_prog *prog;
609 isl_ast_node *tree;
610 isl_printer *p;
612 if (!scop)
613 return -1;
615 prog = gpu_prog_alloc(ctx, scop);
617 tree = generate_gpu(ctx, prog, options);
619 cuda.start = scop->start;
620 cuda.end = scop->end;
621 cuda_open_files(&cuda, input);
623 p = isl_printer_to_file(ctx, cuda.host_c);
624 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
625 p = ppcg_print_exposed_declarations(p, scop);
626 p = ppcg_start_block(p);
628 p = print_cuda_macros(p);
630 p = declare_device_arrays(p, prog);
631 p = allocate_device_arrays(p, prog);
632 p = copy_arrays_to_device(p, prog);
634 p = print_host_code(p, prog, tree, &cuda);
635 isl_ast_node_free(tree);
637 p = copy_arrays_from_device(p, prog);
638 p = free_device_arrays(p, prog);
640 p = ppcg_end_block(p);
641 isl_printer_free(p);
643 cuda_close_files(&cuda);
645 gpu_prog_free(prog);
647 return 0;