cpu.c: get_output_file: handle input file names without extension
[ppcg.git] / cuda.c
blob3277d132931bd050cddb70c605c0ca5765007a88
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
19 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 const char *macros =
22 "#define cudaCheckReturn(ret) \\\n"
23 " do { \\\n"
24 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
25 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
26 " fprintf(stderr, \"CUDA error: %s\\n\", "
27 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
28 " fflush(stderr); \\\n"
29 " } \\\n"
30 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
31 " } while(0)\n"
32 "#define cudaCheckKernel() \\\n"
33 " do { \\\n"
34 " cudaCheckReturn(cudaGetLastError()); \\\n"
35 " } while(0)\n\n";
37 p = isl_printer_print_str(p, macros);
38 return p;
41 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
42 struct gpu_prog *prog)
44 int i;
46 for (i = 0; i < prog->n_array; ++i) {
47 if (gpu_array_is_read_only_scalar(&prog->array[i]))
48 continue;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, prog->array[i].type);
51 p = isl_printer_print_str(p, " *dev_");
52 p = isl_printer_print_str(p, prog->array[i].name);
53 p = isl_printer_print_str(p, ";");
54 p = isl_printer_end_line(p);
56 p = isl_printer_start_line(p);
57 p = isl_printer_end_line(p);
58 return p;
61 static __isl_give isl_printer *allocate_device_arrays(
62 __isl_take isl_printer *p, struct gpu_prog *prog)
64 int i;
66 for (i = 0; i < prog->n_array; ++i) {
67 if (gpu_array_is_read_only_scalar(&prog->array[i]))
68 continue;
69 p = isl_printer_start_line(p);
70 p = isl_printer_print_str(p,
71 "cudaCheckReturn(cudaMalloc((void **) &dev_");
72 p = isl_printer_print_str(p, prog->array[i].name);
73 p = isl_printer_print_str(p, ", ");
74 p = gpu_array_info_print_size(p, &prog->array[i]);
75 p = isl_printer_print_str(p, "));");
76 p = isl_printer_end_line(p);
78 p = isl_printer_start_line(p);
79 p = isl_printer_end_line(p);
80 return p;
83 static __isl_give isl_printer *copy_arrays_to_device(__isl_take isl_printer *p,
84 struct gpu_prog *prog)
86 int i;
88 for (i = 0; i < prog->n_array; ++i) {
89 isl_space *dim;
90 isl_set *read_i;
91 int empty;
93 if (gpu_array_is_read_only_scalar(&prog->array[i]))
94 continue;
96 dim = isl_space_copy(prog->array[i].dim);
97 read_i = isl_union_set_extract_set(prog->copy_in, dim);
98 empty = isl_set_fast_is_empty(read_i);
99 isl_set_free(read_i);
100 if (empty)
101 continue;
103 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
104 p = isl_printer_print_str(p, prog->array[i].name);
105 p = isl_printer_print_str(p, ", ");
107 if (gpu_array_is_scalar(&prog->array[i]))
108 p = isl_printer_print_str(p, "&");
109 p = isl_printer_print_str(p, prog->array[i].name);
110 p = isl_printer_print_str(p, ", ");
112 p = gpu_array_info_print_size(p, &prog->array[i]);
113 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
114 p = isl_printer_end_line(p);
116 p = isl_printer_start_line(p);
117 p = isl_printer_end_line(p);
118 return p;
121 static void print_reverse_list(FILE *out, int len, int *list)
123 int i;
125 if (len == 0)
126 return;
128 fprintf(out, "(");
129 for (i = 0; i < len; ++i) {
130 if (i)
131 fprintf(out, ", ");
132 fprintf(out, "%d", list[len - 1 - i]);
134 fprintf(out, ")");
137 /* Print the effective grid size as a list of the sizes in each
138 * dimension, from innermost to outermost.
140 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
141 struct ppcg_kernel *kernel)
143 int i;
144 int dim;
146 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
147 if (dim == 0)
148 return p;
150 p = isl_printer_print_str(p, "(");
151 for (i = dim - 1; i >= 0; --i) {
152 isl_pw_aff *bound;
154 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
155 p = isl_printer_print_pw_aff(p, bound);
156 isl_pw_aff_free(bound);
158 if (i > 0)
159 p = isl_printer_print_str(p, ", ");
162 p = isl_printer_print_str(p, ")");
164 return p;
167 /* Print the grid definition.
169 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
170 struct ppcg_kernel *kernel)
172 p = isl_printer_start_line(p);
173 p = isl_printer_print_str(p, "dim3 k");
174 p = isl_printer_print_int(p, kernel->id);
175 p = isl_printer_print_str(p, "_dimGrid");
176 p = print_grid_size(p, kernel);
177 p = isl_printer_print_str(p, ";");
178 p = isl_printer_end_line(p);
180 return p;
183 /* Print the arguments to a kernel declaration or call. If "types" is set,
184 * then print a declaration (including the types of the arguments).
186 * The arguments are printed in the following order
187 * - the arrays accessed by the kernel
188 * - the parameters
189 * - the host loop iterators
191 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
192 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
194 int i, n;
195 int first = 1;
196 unsigned nparam;
197 isl_space *space;
198 const char *type;
200 for (i = 0; i < prog->n_array; ++i) {
201 isl_set *arr;
202 int empty;
204 space = isl_space_copy(prog->array[i].dim);
205 arr = isl_union_set_extract_set(kernel->arrays, space);
206 empty = isl_set_fast_is_empty(arr);
207 isl_set_free(arr);
208 if (empty)
209 continue;
211 if (!first)
212 p = isl_printer_print_str(p, ", ");
214 if (types) {
215 p = isl_printer_print_str(p, prog->array[i].type);
216 p = isl_printer_print_str(p, " ");
219 if (gpu_array_is_read_only_scalar(&prog->array[i])) {
220 p = isl_printer_print_str(p, prog->array[i].name);
221 } else {
222 if (types)
223 p = isl_printer_print_str(p, "*");
224 else
225 p = isl_printer_print_str(p, "dev_");
226 p = isl_printer_print_str(p, prog->array[i].name);
229 first = 0;
232 space = isl_union_set_get_space(kernel->arrays);
233 nparam = isl_space_dim(space, isl_dim_param);
234 for (i = 0; i < nparam; ++i) {
235 const char *name;
237 name = isl_space_get_dim_name(space, isl_dim_param, i);
239 if (!first)
240 p = isl_printer_print_str(p, ", ");
241 if (types)
242 p = isl_printer_print_str(p, "int ");
243 p = isl_printer_print_str(p, name);
245 first = 0;
247 isl_space_free(space);
249 n = isl_space_dim(kernel->space, isl_dim_set);
250 type = isl_options_get_ast_iterator_type(prog->ctx);
251 for (i = 0; i < n; ++i) {
252 const char *name;
253 isl_id *id;
255 if (!first)
256 p = isl_printer_print_str(p, ", ");
257 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
258 if (types) {
259 p = isl_printer_print_str(p, type);
260 p = isl_printer_print_str(p, " ");
262 p = isl_printer_print_str(p, name);
264 first = 0;
267 return p;
270 /* Print the header of the given kernel.
272 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
273 struct gpu_prog *prog, struct ppcg_kernel *kernel)
275 p = isl_printer_start_line(p);
276 p = isl_printer_print_str(p, "__global__ void kernel");
277 p = isl_printer_print_int(p, kernel->id);
278 p = isl_printer_print_str(p, "(");
279 p = print_kernel_arguments(p, prog, kernel, 1);
280 p = isl_printer_print_str(p, ")");
282 return p;
285 /* Print the header of the given kernel to both gen->cuda.kernel_h
286 * and gen->cuda.kernel_c.
288 static void print_kernel_headers(struct gpu_prog *prog,
289 struct ppcg_kernel *kernel, struct cuda_info *cuda)
291 isl_printer *p;
293 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
294 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
295 p = print_kernel_header(p, prog, kernel);
296 p = isl_printer_print_str(p, ";");
297 p = isl_printer_end_line(p);
298 isl_printer_free(p);
300 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
301 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
302 p = print_kernel_header(p, prog, kernel);
303 p = isl_printer_end_line(p);
304 isl_printer_free(p);
307 static void print_indent(FILE *dst, int indent)
309 fprintf(dst, "%*s", indent, "");
312 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
314 int i, n_grid;
315 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
316 const char *type;
317 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
318 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
319 "threadIdx.z" };
321 type = isl_options_get_ast_iterator_type(ctx);
323 n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
324 if (n_grid > 0) {
325 print_indent(out, 4);
326 fprintf(out, "%s ", type);
327 for (i = 0; i < n_grid; ++i) {
328 if (i)
329 fprintf(out, ", ");
330 fprintf(out, "b%d = %s",
331 i, block_dims[n_grid - 1 - i]);
333 fprintf(out, ";\n");
336 if (kernel->n_block > 0) {
337 print_indent(out, 4);
338 fprintf(out, "%s ", type);
339 for (i = 0; i < kernel->n_block; ++i) {
340 if (i)
341 fprintf(out, ", ");
342 fprintf(out, "t%d = %s",
343 i, thread_dims[kernel->n_block - 1 - i]);
345 fprintf(out, ";\n");
349 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
350 struct ppcg_kernel_var *var)
352 int j;
354 p = isl_printer_start_line(p);
355 if (var->type == ppcg_access_shared)
356 p = isl_printer_print_str(p, "__shared__ ");
357 p = isl_printer_print_str(p, var->array->type);
358 p = isl_printer_print_str(p, " ");
359 p = isl_printer_print_str(p, var->name);
360 for (j = 0; j < var->array->n_index; ++j) {
361 isl_val *v;
363 p = isl_printer_print_str(p, "[");
364 v = isl_vec_get_element_val(var->size, j);
365 p = isl_printer_print_val(p, v);
366 isl_val_free(v);
367 p = isl_printer_print_str(p, "]");
369 p = isl_printer_print_str(p, ";");
370 p = isl_printer_end_line(p);
372 return p;
375 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
376 struct ppcg_kernel *kernel)
378 int i;
380 for (i = 0; i < kernel->n_var; ++i)
381 p = print_kernel_var(p, &kernel->var[i]);
383 return p;
386 /* Print a sync statement.
388 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
389 struct ppcg_kernel_stmt *stmt)
391 p = isl_printer_start_line(p);
392 p = isl_printer_print_str(p, "__syncthreads();");
393 p = isl_printer_end_line(p);
395 return p;
398 /* This function is called for each user statement in the AST,
399 * i.e., for each kernel body statement, copy statement or sync statement.
401 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
402 __isl_take isl_ast_print_options *print_options,
403 __isl_keep isl_ast_node *node, void *user)
405 isl_id *id;
406 struct ppcg_kernel_stmt *stmt;
408 id = isl_ast_node_get_annotation(node);
409 stmt = isl_id_get_user(id);
410 isl_id_free(id);
412 isl_ast_print_options_free(print_options);
414 switch (stmt->type) {
415 case ppcg_kernel_copy:
416 return ppcg_kernel_print_copy(p, stmt);
417 case ppcg_kernel_sync:
418 return print_sync(p, stmt);
419 case ppcg_kernel_domain:
420 return ppcg_kernel_print_domain(p, stmt);
423 return p;
426 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
427 struct cuda_info *cuda)
429 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
430 isl_ast_print_options *print_options;
431 isl_printer *p;
433 print_kernel_headers(prog, kernel, cuda);
434 fprintf(cuda->kernel_c, "{\n");
435 print_kernel_iterators(cuda->kernel_c, kernel);
437 p = isl_printer_to_file(ctx, cuda->kernel_c);
438 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
439 p = isl_printer_indent(p, 4);
441 p = print_kernel_vars(p, kernel);
442 p = isl_printer_end_line(p);
443 p = gpu_print_macros(p, kernel->tree);
445 print_options = isl_ast_print_options_alloc(ctx);
446 print_options = isl_ast_print_options_set_print_user(print_options,
447 &print_kernel_stmt, NULL);
448 p = isl_ast_node_print(kernel->tree, p, print_options);
449 isl_printer_free(p);
451 fprintf(cuda->kernel_c, "}\n");
454 struct print_host_user_data {
455 struct cuda_info *cuda;
456 struct gpu_prog *prog;
459 /* Print the user statement of the host code to "p".
461 * In particular, print a block of statements that defines the grid
462 * and the block and then launches the kernel.
464 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
465 __isl_take isl_ast_print_options *print_options,
466 __isl_keep isl_ast_node *node, void *user)
468 isl_id *id;
469 struct ppcg_kernel *kernel;
470 struct print_host_user_data *data;
472 id = isl_ast_node_get_annotation(node);
473 kernel = isl_id_get_user(id);
474 isl_id_free(id);
476 data = (struct print_host_user_data *) user;
478 p = isl_printer_start_line(p);
479 p = isl_printer_print_str(p, "{");
480 p = isl_printer_end_line(p);
481 p = isl_printer_indent(p, 2);
483 p = isl_printer_start_line(p);
484 p = isl_printer_print_str(p, "dim3 k");
485 p = isl_printer_print_int(p, kernel->id);
486 p = isl_printer_print_str(p, "_dimBlock");
487 print_reverse_list(isl_printer_get_file(p),
488 kernel->n_block, kernel->block_dim);
489 p = isl_printer_print_str(p, ";");
490 p = isl_printer_end_line(p);
492 p = print_grid(p, kernel);
494 p = isl_printer_start_line(p);
495 p = isl_printer_print_str(p, "kernel");
496 p = isl_printer_print_int(p, kernel->id);
497 p = isl_printer_print_str(p, " <<<k");
498 p = isl_printer_print_int(p, kernel->id);
499 p = isl_printer_print_str(p, "_dimGrid, k");
500 p = isl_printer_print_int(p, kernel->id);
501 p = isl_printer_print_str(p, "_dimBlock>>> (");
502 p = print_kernel_arguments(p, data->prog, kernel, 0);
503 p = isl_printer_print_str(p, ");");
504 p = isl_printer_end_line(p);
506 p = isl_printer_start_line(p);
507 p = isl_printer_print_str(p, "cudaCheckKernel();");
508 p = isl_printer_end_line(p);
510 p = isl_printer_indent(p, -2);
511 p = isl_printer_start_line(p);
512 p = isl_printer_print_str(p, "}");
513 p = isl_printer_end_line(p);
515 p = isl_printer_start_line(p);
516 p = isl_printer_end_line(p);
518 print_kernel(data->prog, kernel, data->cuda);
520 isl_ast_print_options_free(print_options);
522 return p;
525 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
526 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
527 struct cuda_info *cuda)
529 isl_ast_print_options *print_options;
530 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
531 struct print_host_user_data data = { cuda, prog };
533 print_options = isl_ast_print_options_alloc(ctx);
534 print_options = isl_ast_print_options_set_print_user(print_options,
535 &print_host_user, &data);
537 p = gpu_print_macros(p, tree);
538 p = isl_ast_node_print(tree, p, print_options);
540 return p;
543 /* For each array that needs to be copied out (based on prog->copy_out),
544 * copy the contents back from the GPU to the host.
546 * If any element of a given array appears in prog->copy_out, then its
547 * entire extent is in prog->copy_out. The bounds on this extent have
548 * been precomputed in extract_array_info and are used in
549 * gpu_array_info_print_size.
551 static __isl_give isl_printer *copy_arrays_from_device(
552 __isl_take isl_printer *p, struct gpu_prog *prog)
554 int i;
555 isl_union_set *copy_out;
556 copy_out = isl_union_set_copy(prog->copy_out);
558 for (i = 0; i < prog->n_array; ++i) {
559 isl_space *dim;
560 isl_set *copy_out_i;
561 int empty;
563 dim = isl_space_copy(prog->array[i].dim);
564 copy_out_i = isl_union_set_extract_set(copy_out, dim);
565 empty = isl_set_fast_is_empty(copy_out_i);
566 isl_set_free(copy_out_i);
567 if (empty)
568 continue;
570 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
571 if (gpu_array_is_scalar(&prog->array[i]))
572 p = isl_printer_print_str(p, "&");
573 p = isl_printer_print_str(p, prog->array[i].name);
574 p = isl_printer_print_str(p, ", dev_");
575 p = isl_printer_print_str(p, prog->array[i].name);
576 p = isl_printer_print_str(p, ", ");
577 p = gpu_array_info_print_size(p, &prog->array[i]);
578 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
579 p = isl_printer_end_line(p);
582 isl_union_set_free(copy_out);
583 p = isl_printer_start_line(p);
584 p = isl_printer_end_line(p);
585 return p;
588 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
589 struct gpu_prog *prog)
591 int i;
593 for (i = 0; i < prog->n_array; ++i) {
594 if (gpu_array_is_read_only_scalar(&prog->array[i]))
595 continue;
596 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
597 p = isl_printer_print_str(p, prog->array[i].name);
598 p = isl_printer_print_str(p, "));");
599 p = isl_printer_end_line(p);
602 return p;
605 int generate_cuda(isl_ctx *ctx, struct ppcg_scop *scop,
606 struct ppcg_options *options, const char *input)
608 struct cuda_info cuda;
609 struct gpu_prog *prog;
610 isl_ast_node *tree;
611 isl_printer *p;
613 if (!scop)
614 return -1;
616 prog = gpu_prog_alloc(ctx, scop);
617 if (!prog)
618 return -1;
620 tree = generate_gpu(ctx, prog, options);
622 cuda.start = scop->start;
623 cuda.end = scop->end;
624 cuda_open_files(&cuda, input);
626 p = isl_printer_to_file(ctx, cuda.host_c);
627 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
628 p = ppcg_print_exposed_declarations(p, scop);
629 p = ppcg_start_block(p);
631 p = print_cuda_macros(p);
633 p = declare_device_arrays(p, prog);
634 p = allocate_device_arrays(p, prog);
635 p = copy_arrays_to_device(p, prog);
637 p = print_host_code(p, prog, tree, &cuda);
638 isl_ast_node_free(tree);
640 p = copy_arrays_from_device(p, prog);
641 p = free_device_arrays(p, prog);
643 p = ppcg_end_block(p);
644 isl_printer_free(p);
646 cuda_close_files(&cuda);
648 gpu_prog_free(prog);
650 return 0;