#ifndef _CUDA_H
#define _CUDA_H

#include <pet.h>
#include "cuda_common.h"
#include "gpucode.h"
#include "ppcg_options.h"

struct cuda_gen {
	struct cuda_info cuda;
	struct gpucode_info code;
	struct gpucode_info kernel_code;
	struct gpucode_info stmt_code;

	isl_ctx *ctx;
	struct ppcg_options *options;
	CloogState *state;

	struct pet_scop *scop;

	/* Set of parameter values */
	isl_set *context;
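
	/* For illustration only: a context constraining a hypothetical
	 * parameter N to, say, N >= 32 would be written in isl notation as
	 *
	 *	[N] -> { : N >= 32 }
	 */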

	/* Uninitialized data elements (or an overapproximation) */
	isl_union_set *copy_in;

	/* All read accesses in the entire program */
	isl_union_map *read;

	/* All write accesses in the entire program */
	isl_union_map *write;

	/* Array of statements */
	int n_stmts;
	struct cuda_stmt *stmts;

	int n_array;
	struct cuda_array_info *array;

	/* Identifier of current kernel. */
	int kernel_id;

	/* First tile dimension. */
	int tile_first;
	/* Number of tile dimensions. */
	int tile_len;
	/* Number of initial parallel loops among tile dimensions. */
	int n_parallel;
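
	/* For illustration only, a hypothetical configuration: in a
	 * five-dimensional schedule where dimensions 1, 2 and 3 are tiled
	 * and the first two of those carry parallel loops, one would have
	 * tile_first = 1, tile_len = 3 and n_parallel = 2.
	 */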

	/* Number of dimensions determining shared memory. */
	int shared_len;

	/* Number of rows in the untiled schedule. */
	int untiled_len;
	/* Number of rows in the tiled schedule. */
	int tiled_len;
	/* Number of rows in schedule after tiling/wrapping over threads. */
	int thread_tiled_len;

	/* Global untiled schedule. */
	isl_union_map *sched;
	/* Local (per kernel launch) tiled schedule. */
	isl_union_map *tiled_sched;
	/* Local schedule per shared memory tile loop iteration. */
	isl_union_map *local_sched;
	/* Domain of the current statement (within print_statement). */
	isl_set *stmt_domain;

	/* Position of first parameter corresponding to shared tile loop
	 * in shared_sched.
	 */
	unsigned first_shared;
	/* Local tiled schedule projected onto the shared tile loops and
	 * the loops that will be wrapped over the threads,
	 * with all shared tile loops parametrized.
	 */
	isl_union_map *shared_sched;
	/* Projects out the loops that will be wrapped over the threads
	 * from shared_sched.
	 */
	isl_union_map *shared_proj;

	/* A map that takes the range of shared_sched as input,
	 * wraps the appropriate loops over the threads and then projects
	 * out these loops.
	 */
	isl_map *privatization;
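
	/* For illustration only: with a hypothetical block size of 32 in a
	 * single thread dimension, wrapping a loop over the threads and then
	 * projecting it out could be sketched in isl notation as
	 *
	 *	{ [a] -> [tid] : tid = a mod 32 }
	 *
	 * (a simplified sketch, not the exact map constructed here).
	 */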

	/* A map from the shared memory tile loops and the thread indices
	 * (as parameters) to the set of accessed memory elements that
	 * will be accessed through private copies.
	 */
	isl_union_map *private_access;

	/* The schedule for the current private access
	 * (within print_private_access).
	 */
	isl_map *private_sched;
	/* The array reference group corresponding to private_sched. */
	struct cuda_array_ref_group *private_group;

	/* First loop to unroll (or -1 if none). */
	int first_unroll;

	int n_grid;
	int n_block;
	/* Note: in the input file, the sizes of the grid and the blocks
	 * are specified in the order x, y, z, but internally, the sizes
	 * are stored in reverse order, so that the last element always
	 * refers to the x dimension.
	 */
	int grid_dim[2];
	int block_dim[3];
	int *tile_size;
};
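
/* A minimal sketch (hypothetical helper, not part of ppcg) of the reversal
 * convention documented for grid_dim and block_dim above: sizes read in
 * x, y, z order are stored in reverse, so that the last element always
 * corresponds to the x dimension.
 */
static inline void reverse_dims(int *dim, int n)
{
	int i;

	for (i = 0; i < n / 2; ++i) {
		int t = dim[i];

		dim[i] = dim[n - 1 - i];
		dim[n - 1 - i] = t;
	}
}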

__isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
		const char *str);
void collect_array_info(struct cuda_gen *gen);
void print_host_code(struct cuda_gen *gen);
void clear_cuda_gen(struct cuda_gen *gen);

int cuda_pet(isl_ctx *ctx, struct pet_scop *scop, struct ppcg_options *options,
		const char *input);

#endif
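
/* A minimal usage sketch, assuming pet's pet_scop_extract_from_C_source()
 * entry point; the input file name and function name are hypothetical,
 * and option setup is elided:
 *
 *	isl_ctx *ctx = isl_ctx_alloc();
 *	struct ppcg_options options = { 0 };
 *	struct pet_scop *scop =
 *		pet_scop_extract_from_C_source(ctx, "input.c", "kernel");
 *	cuda_pet(ctx, scop, &options, "input.c");
 *	isl_ctx_free(ctx);
 */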