gpu.c: add_bounded_parameters: use isl_set_{lower,upper}_bound_si
[ppcg.git] / cuda.c
blobc999004de036c2cfd1a2a541f78652d2fc1d9bc6
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 /* Print a declaration for the device array corresponding to "array" on "p".
43 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
44 struct gpu_array_info *array)
46 int i;
48 p = isl_printer_start_line(p);
49 p = isl_printer_print_str(p, array->type);
50 p = isl_printer_print_str(p, " ");
51 if (!array->linearize && array->n_index > 1)
52 p = isl_printer_print_str(p, "(");
53 p = isl_printer_print_str(p, "*dev_");
54 p = isl_printer_print_str(p, array->name);
55 if (!array->linearize && array->n_index > 1) {
56 p = isl_printer_print_str(p, ")");
57 for (i = 1; i < array->n_index; i++) {
58 p = isl_printer_print_str(p, "[");
59 p = isl_printer_print_pw_aff(p, array->bound[i]);
60 p = isl_printer_print_str(p, "]");
63 p = isl_printer_print_str(p, ";");
64 p = isl_printer_end_line(p);
66 return p;
69 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
70 struct gpu_prog *prog)
72 int i;
74 for (i = 0; i < prog->n_array; ++i) {
75 if (gpu_array_is_read_only_scalar(&prog->array[i]))
76 continue;
78 p = declare_device_array(p, &prog->array[i]);
80 p = isl_printer_start_line(p);
81 p = isl_printer_end_line(p);
82 return p;
85 static __isl_give isl_printer *allocate_device_arrays(
86 __isl_take isl_printer *p, struct gpu_prog *prog)
88 int i;
90 for (i = 0; i < prog->n_array; ++i) {
91 if (gpu_array_is_read_only_scalar(&prog->array[i]))
92 continue;
93 p = isl_printer_start_line(p);
94 p = isl_printer_print_str(p,
95 "cudaCheckReturn(cudaMalloc((void **) &dev_");
96 p = isl_printer_print_str(p, prog->array[i].name);
97 p = isl_printer_print_str(p, ", ");
98 p = gpu_array_info_print_size(p, &prog->array[i]);
99 p = isl_printer_print_str(p, "));");
100 p = isl_printer_end_line(p);
102 p = isl_printer_start_line(p);
103 p = isl_printer_end_line(p);
104 return p;
107 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
108 struct gpu_prog *prog)
110 int i;
112 for (i = 0; i < prog->n_array; ++i) {
113 isl_space *dim;
114 isl_set *read_i;
115 int empty;
117 if (gpu_array_is_read_only_scalar(&prog->array[i]))
118 continue;
120 dim = isl_space_copy(prog->array[i].space);
121 read_i = isl_union_set_extract_set(prog->copy_in, dim);
122 empty = isl_set_plain_is_empty(read_i);
123 isl_set_free(read_i);
124 if (empty)
125 continue;
127 p = isl_printer_start_line(p);
128 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
129 p = isl_printer_print_str(p, prog->array[i].name);
130 p = isl_printer_print_str(p, ", ");
132 if (gpu_array_is_scalar(&prog->array[i]))
133 p = isl_printer_print_str(p, "&");
134 p = isl_printer_print_str(p, prog->array[i].name);
135 p = isl_printer_print_str(p, ", ");
137 p = gpu_array_info_print_size(p, &prog->array[i]);
138 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
139 p = isl_printer_end_line(p);
141 p = isl_printer_start_line(p);
142 p = isl_printer_end_line(p);
143 return p;
146 static void print_reverse_list(FILE *out, int len, int *list)
148 int i;
150 if (len == 0)
151 return;
153 fprintf(out, "(");
154 for (i = 0; i < len; ++i) {
155 if (i)
156 fprintf(out, ", ");
157 fprintf(out, "%d", list[len - 1 - i]);
159 fprintf(out, ")");
162 /* Print the effective grid size as a list of the sizes in each
163 * dimension, from innermost to outermost.
165 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
166 struct ppcg_kernel *kernel)
168 int i;
169 int dim;
171 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
172 if (dim == 0)
173 return p;
175 p = isl_printer_print_str(p, "(");
176 for (i = dim - 1; i >= 0; --i) {
177 isl_pw_aff *bound;
179 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
180 p = isl_printer_print_pw_aff(p, bound);
181 isl_pw_aff_free(bound);
183 if (i > 0)
184 p = isl_printer_print_str(p, ", ");
187 p = isl_printer_print_str(p, ")");
189 return p;
192 /* Print the grid definition.
194 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
195 struct ppcg_kernel *kernel)
197 p = isl_printer_start_line(p);
198 p = isl_printer_print_str(p, "dim3 k");
199 p = isl_printer_print_int(p, kernel->id);
200 p = isl_printer_print_str(p, "_dimGrid");
201 p = print_grid_size(p, kernel);
202 p = isl_printer_print_str(p, ";");
203 p = isl_printer_end_line(p);
205 return p;
208 /* Print the arguments to a kernel declaration or call. If "types" is set,
209 * then print a declaration (including the types of the arguments).
211 * The arguments are printed in the following order
212 * - the arrays accessed by the kernel
213 * - the parameters
214 * - the host loop iterators
216 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
217 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
219 int i, n;
220 int first = 1;
221 unsigned nparam;
222 isl_space *space;
223 const char *type;
225 for (i = 0; i < prog->n_array; ++i) {
226 isl_set *arr;
227 int empty;
229 space = isl_space_copy(prog->array[i].space);
230 arr = isl_union_set_extract_set(kernel->arrays, space);
231 empty = isl_set_plain_is_empty(arr);
232 isl_set_free(arr);
233 if (empty)
234 continue;
236 if (!first)
237 p = isl_printer_print_str(p, ", ");
239 if (types)
240 p = gpu_array_info_print_declaration_argument(p,
241 &prog->array[i], NULL);
242 else
243 p = gpu_array_info_print_call_argument(p,
244 &prog->array[i]);
246 first = 0;
249 space = isl_union_set_get_space(kernel->arrays);
250 nparam = isl_space_dim(space, isl_dim_param);
251 for (i = 0; i < nparam; ++i) {
252 const char *name;
254 name = isl_space_get_dim_name(space, isl_dim_param, i);
256 if (!first)
257 p = isl_printer_print_str(p, ", ");
258 if (types)
259 p = isl_printer_print_str(p, "int ");
260 p = isl_printer_print_str(p, name);
262 first = 0;
264 isl_space_free(space);
266 n = isl_space_dim(kernel->space, isl_dim_set);
267 type = isl_options_get_ast_iterator_type(prog->ctx);
268 for (i = 0; i < n; ++i) {
269 const char *name;
271 if (!first)
272 p = isl_printer_print_str(p, ", ");
273 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
274 if (types) {
275 p = isl_printer_print_str(p, type);
276 p = isl_printer_print_str(p, " ");
278 p = isl_printer_print_str(p, name);
280 first = 0;
283 return p;
286 /* Print the header of the given kernel.
288 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
289 struct gpu_prog *prog, struct ppcg_kernel *kernel)
291 p = isl_printer_start_line(p);
292 p = isl_printer_print_str(p, "__global__ void kernel");
293 p = isl_printer_print_int(p, kernel->id);
294 p = isl_printer_print_str(p, "(");
295 p = print_kernel_arguments(p, prog, kernel, 1);
296 p = isl_printer_print_str(p, ")");
298 return p;
301 /* Print the header of the given kernel to both gen->cuda.kernel_h
302 * and gen->cuda.kernel_c.
304 static void print_kernel_headers(struct gpu_prog *prog,
305 struct ppcg_kernel *kernel, struct cuda_info *cuda)
307 isl_printer *p;
309 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
310 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
311 p = print_kernel_header(p, prog, kernel);
312 p = isl_printer_print_str(p, ";");
313 p = isl_printer_end_line(p);
314 isl_printer_free(p);
316 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
317 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
318 p = print_kernel_header(p, prog, kernel);
319 p = isl_printer_end_line(p);
320 isl_printer_free(p);
323 static void print_indent(FILE *dst, int indent)
325 fprintf(dst, "%*s", indent, "");
328 /* Print a list of iterators of type "type" with names "ids" to "out".
329 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
330 * In particular, the last iterator is assigned the x identifier
331 * (the first in the list of cuda identifiers).
333 static void print_iterators(FILE *out, const char *type,
334 __isl_keep isl_id_list *ids, const char *cuda_dims[])
336 int i, n;
338 n = isl_id_list_n_id(ids);
339 if (n <= 0)
340 return;
341 print_indent(out, 4);
342 fprintf(out, "%s ", type);
343 for (i = 0; i < n; ++i) {
344 isl_id *id;
346 if (i)
347 fprintf(out, ", ");
348 id = isl_id_list_get_id(ids, i);
349 fprintf(out, "%s%d = %s", isl_id_get_name(id), i,
350 cuda_dims[n - 1 - i]);
351 isl_id_free(id);
353 fprintf(out, ";\n");
356 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
358 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
359 const char *type;
360 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
361 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
362 "threadIdx.z" };
364 type = isl_options_get_ast_iterator_type(ctx);
366 print_iterators(out, type, kernel->block_ids, block_dims);
367 print_iterators(out, type, kernel->thread_ids, thread_dims);
370 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
371 struct ppcg_kernel_var *var)
373 int j;
375 p = isl_printer_start_line(p);
376 if (var->type == ppcg_access_shared)
377 p = isl_printer_print_str(p, "__shared__ ");
378 p = isl_printer_print_str(p, var->array->type);
379 p = isl_printer_print_str(p, " ");
380 p = isl_printer_print_str(p, var->name);
381 for (j = 0; j < var->array->n_index; ++j) {
382 isl_val *v;
384 p = isl_printer_print_str(p, "[");
385 v = isl_vec_get_element_val(var->size, j);
386 p = isl_printer_print_val(p, v);
387 isl_val_free(v);
388 p = isl_printer_print_str(p, "]");
390 p = isl_printer_print_str(p, ";");
391 p = isl_printer_end_line(p);
393 return p;
396 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
397 struct ppcg_kernel *kernel)
399 int i;
401 for (i = 0; i < kernel->n_var; ++i)
402 p = print_kernel_var(p, &kernel->var[i]);
404 return p;
407 /* Print a sync statement.
409 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
410 struct ppcg_kernel_stmt *stmt)
412 p = isl_printer_start_line(p);
413 p = isl_printer_print_str(p, "__syncthreads();");
414 p = isl_printer_end_line(p);
416 return p;
419 /* This function is called for each user statement in the AST,
420 * i.e., for each kernel body statement, copy statement or sync statement.
422 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
423 __isl_take isl_ast_print_options *print_options,
424 __isl_keep isl_ast_node *node, void *user)
426 isl_id *id;
427 struct ppcg_kernel_stmt *stmt;
429 id = isl_ast_node_get_annotation(node);
430 stmt = isl_id_get_user(id);
431 isl_id_free(id);
433 isl_ast_print_options_free(print_options);
435 switch (stmt->type) {
436 case ppcg_kernel_copy:
437 return ppcg_kernel_print_copy(p, stmt);
438 case ppcg_kernel_sync:
439 return print_sync(p, stmt);
440 case ppcg_kernel_domain:
441 return ppcg_kernel_print_domain(p, stmt);
444 return p;
447 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
448 struct cuda_info *cuda)
450 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
451 isl_ast_print_options *print_options;
452 isl_printer *p;
454 print_kernel_headers(prog, kernel, cuda);
455 fprintf(cuda->kernel_c, "{\n");
456 print_kernel_iterators(cuda->kernel_c, kernel);
458 p = isl_printer_to_file(ctx, cuda->kernel_c);
459 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
460 p = isl_printer_indent(p, 4);
462 p = print_kernel_vars(p, kernel);
463 p = isl_printer_end_line(p);
464 p = gpu_print_macros(p, kernel->tree);
466 print_options = isl_ast_print_options_alloc(ctx);
467 print_options = isl_ast_print_options_set_print_user(print_options,
468 &print_kernel_stmt, NULL);
469 p = isl_ast_node_print(kernel->tree, p, print_options);
470 isl_printer_free(p);
472 fprintf(cuda->kernel_c, "}\n");
475 struct print_host_user_data {
476 struct cuda_info *cuda;
477 struct gpu_prog *prog;
480 /* Print the user statement of the host code to "p".
482 * In particular, print a block of statements that defines the grid
483 * and the block and then launches the kernel.
485 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
486 __isl_take isl_ast_print_options *print_options,
487 __isl_keep isl_ast_node *node, void *user)
489 isl_id *id;
490 struct ppcg_kernel *kernel;
491 struct print_host_user_data *data;
493 id = isl_ast_node_get_annotation(node);
494 kernel = isl_id_get_user(id);
495 isl_id_free(id);
497 data = (struct print_host_user_data *) user;
499 p = isl_printer_start_line(p);
500 p = isl_printer_print_str(p, "{");
501 p = isl_printer_end_line(p);
502 p = isl_printer_indent(p, 2);
504 p = isl_printer_start_line(p);
505 p = isl_printer_print_str(p, "dim3 k");
506 p = isl_printer_print_int(p, kernel->id);
507 p = isl_printer_print_str(p, "_dimBlock");
508 print_reverse_list(isl_printer_get_file(p),
509 kernel->n_block, kernel->block_dim);
510 p = isl_printer_print_str(p, ";");
511 p = isl_printer_end_line(p);
513 p = print_grid(p, kernel);
515 p = isl_printer_start_line(p);
516 p = isl_printer_print_str(p, "kernel");
517 p = isl_printer_print_int(p, kernel->id);
518 p = isl_printer_print_str(p, " <<<k");
519 p = isl_printer_print_int(p, kernel->id);
520 p = isl_printer_print_str(p, "_dimGrid, k");
521 p = isl_printer_print_int(p, kernel->id);
522 p = isl_printer_print_str(p, "_dimBlock>>> (");
523 p = print_kernel_arguments(p, data->prog, kernel, 0);
524 p = isl_printer_print_str(p, ");");
525 p = isl_printer_end_line(p);
527 p = isl_printer_start_line(p);
528 p = isl_printer_print_str(p, "cudaCheckKernel();");
529 p = isl_printer_end_line(p);
531 p = isl_printer_indent(p, -2);
532 p = isl_printer_start_line(p);
533 p = isl_printer_print_str(p, "}");
534 p = isl_printer_end_line(p);
536 p = isl_printer_start_line(p);
537 p = isl_printer_end_line(p);
539 print_kernel(data->prog, kernel, data->cuda);
541 isl_ast_print_options_free(print_options);
543 return p;
546 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
547 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
548 struct cuda_info *cuda)
550 isl_ast_print_options *print_options;
551 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
552 struct print_host_user_data data = { cuda, prog };
554 print_options = isl_ast_print_options_alloc(ctx);
555 print_options = isl_ast_print_options_set_print_user(print_options,
556 &print_host_user, &data);
558 p = gpu_print_macros(p, tree);
559 p = isl_ast_node_print(tree, p, print_options);
561 return p;
564 /* For each array that needs to be copied out (based on prog->copy_out),
565 * copy the contents back from the GPU to the host.
567 * If any element of a given array appears in prog->copy_out, then its
568 * entire extent is in prog->copy_out. The bounds on this extent have
569 * been precomputed in extract_array_info and are used in
570 * gpu_array_info_print_size.
572 static __isl_give isl_printer *copy_arrays_from_device(
573 __isl_take isl_printer *p, struct gpu_prog *prog)
575 int i;
576 isl_union_set *copy_out;
577 copy_out = isl_union_set_copy(prog->copy_out);
579 for (i = 0; i < prog->n_array; ++i) {
580 isl_space *dim;
581 isl_set *copy_out_i;
582 int empty;
584 dim = isl_space_copy(prog->array[i].space);
585 copy_out_i = isl_union_set_extract_set(copy_out, dim);
586 empty = isl_set_plain_is_empty(copy_out_i);
587 isl_set_free(copy_out_i);
588 if (empty)
589 continue;
591 p = isl_printer_start_line(p);
592 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
593 if (gpu_array_is_scalar(&prog->array[i]))
594 p = isl_printer_print_str(p, "&");
595 p = isl_printer_print_str(p, prog->array[i].name);
596 p = isl_printer_print_str(p, ", dev_");
597 p = isl_printer_print_str(p, prog->array[i].name);
598 p = isl_printer_print_str(p, ", ");
599 p = gpu_array_info_print_size(p, &prog->array[i]);
600 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
601 p = isl_printer_end_line(p);
604 isl_union_set_free(copy_out);
605 p = isl_printer_start_line(p);
606 p = isl_printer_end_line(p);
607 return p;
610 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
611 struct gpu_prog *prog)
613 int i;
615 for (i = 0; i < prog->n_array; ++i) {
616 if (gpu_array_is_read_only_scalar(&prog->array[i]))
617 continue;
618 p = isl_printer_start_line(p);
619 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
620 p = isl_printer_print_str(p, prog->array[i].name);
621 p = isl_printer_print_str(p, "));");
622 p = isl_printer_end_line(p);
625 return p;
628 /* Given a gpu_prog "prog" and the corresponding transformed AST
629 * "tree", print the entire CUDA code to "p".
630 * "types" collects the types for which a definition has already
631 * been printed.
633 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
634 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
635 struct gpu_types *types, void *user)
637 struct cuda_info *cuda = user;
638 isl_printer *kernel;
640 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
641 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
642 kernel = gpu_print_types(kernel, types, prog);
643 isl_printer_free(kernel);
645 if (!kernel)
646 return isl_printer_free(p);
648 p = ppcg_start_block(p);
650 p = print_cuda_macros(p);
652 p = declare_device_arrays(p, prog);
653 p = allocate_device_arrays(p, prog);
654 p = copy_arrays_to_device(p, prog);
656 p = print_host_code(p, prog, tree, cuda);
658 p = copy_arrays_from_device(p, prog);
659 p = free_device_arrays(p, prog);
661 p = ppcg_end_block(p);
663 return p;
666 /* Transform the code in the file called "input" by replacing
667 * all scops by corresponding CUDA code.
668 * The names of the output files are derived from "input".
670 * We let generate_gpu do all the hard work and then let it call
671 * us back for printing the AST in print_cuda.
673 * To prepare for this printing, we first open the output files
674 * and we close them after generate_gpu has finished.
676 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
677 const char *input)
679 struct cuda_info cuda;
680 int r;
682 cuda_open_files(&cuda, input);
684 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
686 cuda_close_files(&cuda);
688 return r;