gpu.c: copy_group_shared_accesses: use gpu_array_is_scalar for clarity
[ppcg.git] / cuda.c
blob759e659fabfd536929add8f7015cc920595a2164
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
42 struct gpu_prog *prog)
44 int i;
46 for (i = 0; i < prog->n_array; ++i) {
47 if (gpu_array_is_read_only_scalar(&prog->array[i]))
48 continue;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, prog->array[i].type);
51 p = isl_printer_print_str(p, " *dev_");
52 p = isl_printer_print_str(p, prog->array[i].name);
53 p = isl_printer_print_str(p, ";");
54 p = isl_printer_end_line(p);
56 p = isl_printer_start_line(p);
57 p = isl_printer_end_line(p);
58 return p;
61 static __isl_give isl_printer *allocate_device_arrays(
62 __isl_take isl_printer *p, struct gpu_prog *prog)
64 int i;
66 for (i = 0; i < prog->n_array; ++i) {
67 if (gpu_array_is_read_only_scalar(&prog->array[i]))
68 continue;
69 p = isl_printer_start_line(p);
70 p = isl_printer_print_str(p,
71 "cudaCheckReturn(cudaMalloc((void **) &dev_");
72 p = isl_printer_print_str(p, prog->array[i].name);
73 p = isl_printer_print_str(p, ", ");
74 p = gpu_array_info_print_size(p, &prog->array[i]);
75 p = isl_printer_print_str(p, "));");
76 p = isl_printer_end_line(p);
78 p = isl_printer_start_line(p);
79 p = isl_printer_end_line(p);
80 return p;
83 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
84 struct gpu_prog *prog)
86 int i;
88 for (i = 0; i < prog->n_array; ++i) {
89 isl_space *dim;
90 isl_set *read_i;
91 int empty;
93 if (gpu_array_is_read_only_scalar(&prog->array[i]))
94 continue;
96 dim = isl_space_copy(prog->array[i].dim);
97 read_i = isl_union_set_extract_set(prog->copy_in, dim);
98 empty = isl_set_fast_is_empty(read_i);
99 isl_set_free(read_i);
100 if (empty)
101 continue;
103 p = isl_printer_start_line(p);
104 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
105 p = isl_printer_print_str(p, prog->array[i].name);
106 p = isl_printer_print_str(p, ", ");
108 if (gpu_array_is_scalar(&prog->array[i]))
109 p = isl_printer_print_str(p, "&");
110 p = isl_printer_print_str(p, prog->array[i].name);
111 p = isl_printer_print_str(p, ", ");
113 p = gpu_array_info_print_size(p, &prog->array[i]);
114 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
115 p = isl_printer_end_line(p);
117 p = isl_printer_start_line(p);
118 p = isl_printer_end_line(p);
119 return p;
122 static void print_reverse_list(FILE *out, int len, int *list)
124 int i;
126 if (len == 0)
127 return;
129 fprintf(out, "(");
130 for (i = 0; i < len; ++i) {
131 if (i)
132 fprintf(out, ", ");
133 fprintf(out, "%d", list[len - 1 - i]);
135 fprintf(out, ")");
138 /* Print the effective grid size as a list of the sizes in each
139 * dimension, from innermost to outermost.
141 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
142 struct ppcg_kernel *kernel)
144 int i;
145 int dim;
147 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
148 if (dim == 0)
149 return p;
151 p = isl_printer_print_str(p, "(");
152 for (i = dim - 1; i >= 0; --i) {
153 isl_pw_aff *bound;
155 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
156 p = isl_printer_print_pw_aff(p, bound);
157 isl_pw_aff_free(bound);
159 if (i > 0)
160 p = isl_printer_print_str(p, ", ");
163 p = isl_printer_print_str(p, ")");
165 return p;
168 /* Print the grid definition.
170 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
171 struct ppcg_kernel *kernel)
173 p = isl_printer_start_line(p);
174 p = isl_printer_print_str(p, "dim3 k");
175 p = isl_printer_print_int(p, kernel->id);
176 p = isl_printer_print_str(p, "_dimGrid");
177 p = print_grid_size(p, kernel);
178 p = isl_printer_print_str(p, ";");
179 p = isl_printer_end_line(p);
181 return p;
184 /* Print the arguments to a kernel declaration or call. If "types" is set,
185 * then print a declaration (including the types of the arguments).
187 * The arguments are printed in the following order
188 * - the arrays accessed by the kernel
189 * - the parameters
190 * - the host loop iterators
192 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
193 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
195 int i, n;
196 int first = 1;
197 unsigned nparam;
198 isl_space *space;
199 const char *type;
201 for (i = 0; i < prog->n_array; ++i) {
202 isl_set *arr;
203 int empty;
205 space = isl_space_copy(prog->array[i].dim);
206 arr = isl_union_set_extract_set(kernel->arrays, space);
207 empty = isl_set_fast_is_empty(arr);
208 isl_set_free(arr);
209 if (empty)
210 continue;
212 if (!first)
213 p = isl_printer_print_str(p, ", ");
215 if (types) {
216 p = isl_printer_print_str(p, prog->array[i].type);
217 p = isl_printer_print_str(p, " ");
220 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
221 p = isl_printer_print_str(p, prog->array[i].name);
222 } else {
223 if (types)
224 p = isl_printer_print_str(p, "*");
225 else
226 p = isl_printer_print_str(p, "dev_");
227 p = isl_printer_print_str(p, prog->array[i].name);
230 first = 0;
233 space = isl_union_set_get_space(kernel->arrays);
234 nparam = isl_space_dim(space, isl_dim_param);
235 for (i = 0; i < nparam; ++i) {
236 const char *name;
238 name = isl_space_get_dim_name(space, isl_dim_param, i);
240 if (!first)
241 p = isl_printer_print_str(p, ", ");
242 if (types)
243 p = isl_printer_print_str(p, "int ");
244 p = isl_printer_print_str(p, name);
246 first = 0;
248 isl_space_free(space);
250 n = isl_space_dim(kernel->space, isl_dim_set);
251 type = isl_options_get_ast_iterator_type(prog->ctx);
252 for (i = 0; i < n; ++i) {
253 const char *name;
254 isl_id *id;
256 if (!first)
257 p = isl_printer_print_str(p, ", ");
258 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
259 if (types) {
260 p = isl_printer_print_str(p, type);
261 p = isl_printer_print_str(p, " ");
263 p = isl_printer_print_str(p, name);
265 first = 0;
268 return p;
271 /* Print the header of the given kernel.
273 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
274 struct gpu_prog *prog, struct ppcg_kernel *kernel)
276 p = isl_printer_start_line(p);
277 p = isl_printer_print_str(p, "__global__ void kernel");
278 p = isl_printer_print_int(p, kernel->id);
279 p = isl_printer_print_str(p, "(");
280 p = print_kernel_arguments(p, prog, kernel, 1);
281 p = isl_printer_print_str(p, ")");
283 return p;
286 /* Print the header of the given kernel to both gen->cuda.kernel_h
287 * and gen->cuda.kernel_c.
289 static void print_kernel_headers(struct gpu_prog *prog,
290 struct ppcg_kernel *kernel, struct cuda_info *cuda)
292 isl_printer *p;
294 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
295 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
296 p = print_kernel_header(p, prog, kernel);
297 p = isl_printer_print_str(p, ";");
298 p = isl_printer_end_line(p);
299 isl_printer_free(p);
301 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
302 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
303 p = print_kernel_header(p, prog, kernel);
304 p = isl_printer_end_line(p);
305 isl_printer_free(p);
308 static void print_indent(FILE *dst, int indent)
310 fprintf(dst, "%*s", indent, "");
313 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
315 int i, n_grid;
316 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
317 const char *type;
318 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
319 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
320 "threadIdx.z" };
322 type = isl_options_get_ast_iterator_type(ctx);
324 n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
325 if (n_grid > 0) {
326 print_indent(out, 4);
327 fprintf(out, "%s ", type);
328 for (i = 0; i < n_grid; ++i) {
329 if (i)
330 fprintf(out, ", ");
331 fprintf(out, "b%d = %s",
332 i, block_dims[n_grid - 1 - i]);
334 fprintf(out, ";\n");
337 if (kernel->n_block > 0) {
338 print_indent(out, 4);
339 fprintf(out, "%s ", type);
340 for (i = 0; i < kernel->n_block; ++i) {
341 if (i)
342 fprintf(out, ", ");
343 fprintf(out, "t%d = %s",
344 i, thread_dims[kernel->n_block - 1 - i]);
346 fprintf(out, ";\n");
350 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
351 struct ppcg_kernel_var *var)
353 int j;
355 p = isl_printer_start_line(p);
356 if (var->type == ppcg_access_shared)
357 p = isl_printer_print_str(p, "__shared__ ");
358 p = isl_printer_print_str(p, var->array->type);
359 p = isl_printer_print_str(p, " ");
360 p = isl_printer_print_str(p, var->name);
361 for (j = 0; j < var->array->n_index; ++j) {
362 isl_val *v;
364 p = isl_printer_print_str(p, "[");
365 v = isl_vec_get_element_val(var->size, j);
366 p = isl_printer_print_val(p, v);
367 isl_val_free(v);
368 p = isl_printer_print_str(p, "]");
370 p = isl_printer_print_str(p, ";");
371 p = isl_printer_end_line(p);
373 return p;
376 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
377 struct ppcg_kernel *kernel)
379 int i;
381 for (i = 0; i < kernel->n_var; ++i)
382 p = print_kernel_var(p, &kernel->var[i]);
384 return p;
387 /* Print a sync statement.
389 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
390 struct ppcg_kernel_stmt *stmt)
392 p = isl_printer_start_line(p);
393 p = isl_printer_print_str(p, "__syncthreads();");
394 p = isl_printer_end_line(p);
396 return p;
399 /* This function is called for each user statement in the AST,
400 * i.e., for each kernel body statement, copy statement or sync statement.
402 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
403 __isl_take isl_ast_print_options *print_options,
404 __isl_keep isl_ast_node *node, void *user)
406 isl_id *id;
407 struct ppcg_kernel_stmt *stmt;
409 id = isl_ast_node_get_annotation(node);
410 stmt = isl_id_get_user(id);
411 isl_id_free(id);
413 isl_ast_print_options_free(print_options);
415 switch (stmt->type) {
416 case ppcg_kernel_copy:
417 return ppcg_kernel_print_copy(p, stmt);
418 case ppcg_kernel_sync:
419 return print_sync(p, stmt);
420 case ppcg_kernel_domain:
421 return ppcg_kernel_print_domain(p, stmt);
424 return p;
427 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
428 struct cuda_info *cuda)
430 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
431 isl_ast_print_options *print_options;
432 isl_printer *p;
434 print_kernel_headers(prog, kernel, cuda);
435 fprintf(cuda->kernel_c, "{\n");
436 print_kernel_iterators(cuda->kernel_c, kernel);
438 p = isl_printer_to_file(ctx, cuda->kernel_c);
439 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
440 p = isl_printer_indent(p, 4);
442 p = print_kernel_vars(p, kernel);
443 p = isl_printer_end_line(p);
444 p = gpu_print_macros(p, kernel->tree);
446 print_options = isl_ast_print_options_alloc(ctx);
447 print_options = isl_ast_print_options_set_print_user(print_options,
448 &print_kernel_stmt, NULL);
449 p = isl_ast_node_print(kernel->tree, p, print_options);
450 isl_printer_free(p);
452 fprintf(cuda->kernel_c, "}\n");
455 struct print_host_user_data {
456 struct cuda_info *cuda;
457 struct gpu_prog *prog;
460 /* Print the user statement of the host code to "p".
462 * In particular, print a block of statements that defines the grid
463 * and the block and then launches the kernel.
465 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
466 __isl_take isl_ast_print_options *print_options,
467 __isl_keep isl_ast_node *node, void *user)
469 isl_id *id;
470 struct ppcg_kernel *kernel;
471 struct print_host_user_data *data;
473 id = isl_ast_node_get_annotation(node);
474 kernel = isl_id_get_user(id);
475 isl_id_free(id);
477 data = (struct print_host_user_data *) user;
479 p = isl_printer_start_line(p);
480 p = isl_printer_print_str(p, "{");
481 p = isl_printer_end_line(p);
482 p = isl_printer_indent(p, 2);
484 p = isl_printer_start_line(p);
485 p = isl_printer_print_str(p, "dim3 k");
486 p = isl_printer_print_int(p, kernel->id);
487 p = isl_printer_print_str(p, "_dimBlock");
488 print_reverse_list(isl_printer_get_file(p),
489 kernel->n_block, kernel->block_dim);
490 p = isl_printer_print_str(p, ";");
491 p = isl_printer_end_line(p);
493 p = print_grid(p, kernel);
495 p = isl_printer_start_line(p);
496 p = isl_printer_print_str(p, "kernel");
497 p = isl_printer_print_int(p, kernel->id);
498 p = isl_printer_print_str(p, " <<<k");
499 p = isl_printer_print_int(p, kernel->id);
500 p = isl_printer_print_str(p, "_dimGrid, k");
501 p = isl_printer_print_int(p, kernel->id);
502 p = isl_printer_print_str(p, "_dimBlock>>> (");
503 p = print_kernel_arguments(p, data->prog, kernel, 0);
504 p = isl_printer_print_str(p, ");");
505 p = isl_printer_end_line(p);
507 p = isl_printer_start_line(p);
508 p = isl_printer_print_str(p, "cudaCheckKernel();");
509 p = isl_printer_end_line(p);
511 p = isl_printer_indent(p, -2);
512 p = isl_printer_start_line(p);
513 p = isl_printer_print_str(p, "}");
514 p = isl_printer_end_line(p);
516 p = isl_printer_start_line(p);
517 p = isl_printer_end_line(p);
519 print_kernel(data->prog, kernel, data->cuda);
521 isl_ast_print_options_free(print_options);
523 return p;
526 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
527 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
528 struct cuda_info *cuda)
530 isl_ast_print_options *print_options;
531 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
532 struct print_host_user_data data = { cuda, prog };
534 print_options = isl_ast_print_options_alloc(ctx);
535 print_options = isl_ast_print_options_set_print_user(print_options,
536 &print_host_user, &data);
538 p = gpu_print_macros(p, tree);
539 p = isl_ast_node_print(tree, p, print_options);
541 return p;
544 /* For each array that needs to be copied out (based on prog->copy_out),
545 * copy the contents back from the GPU to the host.
547 * If any element of a given array appears in prog->copy_out, then its
548 * entire extent is in prog->copy_out. The bounds on this extent have
549 * been precomputed in extract_array_info and are used in
550 * gpu_array_info_print_size.
552 static __isl_give isl_printer *copy_arrays_from_device(
553 __isl_take isl_printer *p, struct gpu_prog *prog)
555 int i;
556 isl_union_set *copy_out;
557 copy_out = isl_union_set_copy(prog->copy_out);
559 for (i = 0; i < prog->n_array; ++i) {
560 isl_space *dim;
561 isl_set *copy_out_i;
562 int empty;
564 dim = isl_space_copy(prog->array[i].dim);
565 copy_out_i = isl_union_set_extract_set(copy_out, dim);
566 empty = isl_set_fast_is_empty(copy_out_i);
567 isl_set_free(copy_out_i);
568 if (empty)
569 continue;
571 p = isl_printer_start_line(p);
572 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
573 if (gpu_array_is_scalar(&prog->array[i]))
574 p = isl_printer_print_str(p, "&");
575 p = isl_printer_print_str(p, prog->array[i].name);
576 p = isl_printer_print_str(p, ", dev_");
577 p = isl_printer_print_str(p, prog->array[i].name);
578 p = isl_printer_print_str(p, ", ");
579 p = gpu_array_info_print_size(p, &prog->array[i]);
580 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
581 p = isl_printer_end_line(p);
584 isl_union_set_free(copy_out);
585 p = isl_printer_start_line(p);
586 p = isl_printer_end_line(p);
587 return p;
590 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
591 struct gpu_prog *prog)
593 int i;
595 for (i = 0; i < prog->n_array; ++i) {
596 if (gpu_array_is_read_only_scalar(&prog->array[i]))
597 continue;
598 p = isl_printer_start_line(p);
599 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
600 p = isl_printer_print_str(p, prog->array[i].name);
601 p = isl_printer_print_str(p, "));");
602 p = isl_printer_end_line(p);
605 return p;
608 /* Given a gpu_prog "prog" and the corresponding transformed AST
609 * "tree", print the entire CUDA code to "p".
611 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
612 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
613 void *user)
615 struct cuda_info *cuda = user;
617 p = ppcg_start_block(p);
619 p = print_cuda_macros(p);
621 p = declare_device_arrays(p, prog);
622 p = allocate_device_arrays(p, prog);
623 p = copy_arrays_to_device(p, prog);
625 p = print_host_code(p, prog, tree, cuda);
627 p = copy_arrays_from_device(p, prog);
628 p = free_device_arrays(p, prog);
630 p = ppcg_end_block(p);
632 return p;
635 /* Transform the code in the file called "input" by replacing
636 * all scops by corresponding CUDA code.
637 * The names of the output files are derived from "input".
639 * We let generate_gpu do all the hard work and then let it call
640 * us back for printing the AST in print_cuda.
642 * To prepare for this printing, we first open the output files
643 * and we close them after generate_gpu has finished.
645 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
646 const char *input)
648 struct cuda_info cuda;
649 int r;
651 cuda_open_files(&cuda, input);
653 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
655 cuda_close_files(&cuda);
657 return r;