update isl to version 0.10
[ppcg.git] / cuda.c
blob dd0e6b4fc97091d9e9efba634103a68beb661f59
1 /*
2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
8 * 91893 Orsay, France
9 */
11 #include <assert.h>
12 #include <stdlib.h>
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
16 #include <isl/aff.h>
17 #include <isl/ilp.h>
18 #include <isl/flow.h>
19 #include <isl/band.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
24 #include "cuda.h"
25 #include "cuda_common.h"
26 #include "clast_printer.h"
27 #include "schedule.h"
28 #include "pet_printer.h"
29 #include "ppcg_options.h"
31 /* The fields stride, shift and shift_map only contain valid information
32 * if shift != NULL.
33 * If so, they express that the current index is such that if you add shift,
34 * then the result is always a multiple of stride.
35 * shift_map contains the mapping
37 * i -> (i + shift)/stride
39 struct cuda_array_bound {
40 isl_int size;
41 isl_aff *lb;
43 isl_int stride;
44 isl_aff *shift;
45 isl_basic_map *shift_map;
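/* A worked example of the fields above (values are hypothetical): with
 * shift = 1 and stride = 4, the accessed indices are of the form
 * 4 * alpha - 1 (3, 7, 11, ...), and shift_map sends each such index i
 * to (i + 1)/4.
 */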
48 struct cuda_array_info;
50 /* A group of array references in a kernel that should be handled together.
51 * If private_bound is not NULL, then it is mapped to registers.
52 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
53 * Otherwise, it is accessed from global memory.
55 struct cuda_array_ref_group {
56 /* The references in this group access this array. */
57 struct cuda_array_info *array;
58 /* Position of this group in the list of reference groups of array. */
59 int nr;
61 /* The following fields are used during the construction of the groups.
62 * access is the combined access relation relative to the shared
63 * memory tiling.
64 * write is set if any access in the group is a write.
66 isl_map *access;
67 int write;
69 /* For each index, size and offset of piece in shared memory. */
70 struct cuda_array_bound *shared_bound;
72 /* For each index, size and offset of piece in private memory. */
73 struct cuda_array_bound *private_bound;
75 /* References in this group; point to elements of a linked list. */
76 int n_ref;
77 struct cuda_stmt_access **refs;
79 /* Last shared memory tile dimension that affects the tile of this group. */
80 int last_shared;
81 /* Dimension at which copying to/from shared memory is printed.
82 * if >= 0, then the value is >= last_shared
83 * if -1, then the copying is done at the leaf level.
85 int print_shared_level;
88 struct cuda_array_info {
89 isl_space *dim;
90 /* Element type. */
91 char *type;
92 /* Element size. */
93 int size;
94 /* Name of the array. */
95 char *name;
96 /* Number of indices. */
97 unsigned n_index;
98 /* For each index, a bound on the array in that direction. */
99 isl_pw_aff **bound;
100 /* For each index, bound[i] specialized to the current kernel. */
101 isl_pw_aff **local_bound;
103 /* All references to this array; point to elements of a linked list. */
104 int n_ref;
105 struct cuda_stmt_access **refs;
107 /* The reference groups associated to this array. */
108 int n_group;
109 struct cuda_array_ref_group **groups;
111 /* For scalars, is this scalar read-only within the entire program? */
112 int read_only;
115 /* Print the name of the local copy of a given group of array references.
117 static void print_array_name(FILE *out, struct cuda_array_ref_group *group)
119 int global = 0;
121 if (group->private_bound)
122 fprintf(out, "private_");
123 else if (group->shared_bound)
124 fprintf(out, "shared_");
125 else
126 global = 1;
127 fprintf(out, "%s", group->array->name);
128 if (!global && group->array->n_group > 1)
129 fprintf(out, "_%d", group->nr);
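/* For instance (hypothetical array name): a group with nr == 1 of an array
 * "A" that has more than one reference group prints as "shared_A_1" when
 * mapped to shared memory, as "private_A_1" when mapped to registers, and
 * simply as "A" when it is accessed from global memory.
 */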
132 /* Collect all references to the given array and store pointers to them
133 * in array->refs.
135 static void collect_references(struct cuda_gen *gen,
136 struct cuda_array_info *array)
138 int i;
139 int n;
141 n = 0;
142 for (i = 0; i < gen->n_stmts; ++i) {
143 struct cuda_stmt *stmt = &gen->stmts[i];
144 struct cuda_stmt_access *access;
146 for (access = stmt->accesses; access; access = access->next) {
147 const char *name;
148 name = isl_map_get_tuple_name(access->access,
149 isl_dim_out);
150 if (name && !strcmp(array->name, name))
151 n++;
155 array->n_ref = n;
156 array->refs = isl_alloc_array(gen->ctx, struct cuda_stmt_access *, n);
157 assert(array->refs);
159 n = 0;
160 for (i = 0; i < gen->n_stmts; ++i) {
161 struct cuda_stmt *stmt = &gen->stmts[i];
162 struct cuda_stmt_access *access;
164 for (access = stmt->accesses; access; access = access->next) {
165 const char *name;
166 name = isl_map_get_tuple_name(access->access,
167 isl_dim_out);
168 if (!name || strcmp(array->name, name))
169 continue;
171 array->refs[n++] = access;
176 static struct cuda_array_bound *create_bound_list(isl_ctx *ctx, int n_index)
178 int i;
179 struct cuda_array_bound *bound;
181 bound = isl_alloc_array(ctx, struct cuda_array_bound, n_index);
182 assert(bound);
184 for (i = 0; i < n_index; ++i) {
185 isl_int_init(bound[i].size);
186 bound[i].lb = NULL;
187 isl_int_init(bound[i].stride);
188 bound[i].shift = NULL;
189 bound[i].shift_map = NULL;
192 return bound;
195 static void free_bound_list(struct cuda_array_bound *bound, int n_index)
197 int j;
199 if (!bound)
200 return;
202 for (j = 0; j < n_index; ++j) {
203 isl_int_clear(bound[j].size);
204 isl_int_clear(bound[j].stride);
205 isl_aff_free(bound[j].lb);
206 isl_aff_free(bound[j].shift);
207 isl_basic_map_free(bound[j].shift_map);
209 free(bound);
212 static struct pet_array *find_array(struct pet_scop *scop,
213 __isl_keep isl_set *accessed)
215 int i;
216 isl_id *id;
218 id = isl_set_get_tuple_id(accessed);
220 for (i = 0; i < scop->n_array; ++i) {
221 isl_id *id_i;
223 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
224 isl_id_free(id_i);
225 if (id == id_i)
226 break;
228 isl_id_free(id);
230 return i < scop->n_array ? scop->arrays[i] : NULL;
233 /* Compute bounds on the host arrays based on the accessed elements
234 * and collect all references to the array.
236 * If the array is zero-dimensional, i.e., a scalar, we check
237 * whether it is read-only.
239 static int extract_array_info(__isl_take isl_set *array, void *user)
241 int i;
242 struct cuda_gen *gen = (struct cuda_gen *)user;
243 const char *name;
244 int n_index;
245 isl_pw_aff **bounds;
246 isl_pw_aff **local_bounds;
247 struct pet_array *pa;
249 n_index = isl_set_dim(array, isl_dim_set);
250 name = isl_set_get_tuple_name(array);
251 bounds = isl_alloc_array(isl_set_get_ctx(array),
252 isl_pw_aff *, n_index);
253 assert(bounds);
254 local_bounds = isl_calloc_array(isl_set_get_ctx(array),
255 isl_pw_aff *, n_index);
256 assert(local_bounds);
257 gen->array[gen->n_array].dim = isl_set_get_space(array);
258 gen->array[gen->n_array].name = strdup(name);
259 gen->array[gen->n_array].n_index = n_index;
260 gen->array[gen->n_array].bound = bounds;
261 gen->array[gen->n_array].local_bound = local_bounds;
263 pa = find_array(gen->scop, array);
264 assert(pa);
266 gen->array[gen->n_array].type = strdup(pa->element_type);
267 gen->array[gen->n_array].size = pa->element_size;
269 if (n_index == 0) {
270 isl_set *space;
271 isl_union_map *write;
272 int empty;
274 write = isl_union_map_copy(gen->write);
275 space = isl_set_universe(isl_set_get_space(array));
276 write = isl_union_map_intersect_range(write,
277 isl_union_set_from_set(space));
278 empty = isl_union_map_is_empty(write);
279 isl_union_map_free(write);
281 gen->array[gen->n_array].read_only = empty;
284 for (i = 0; i < n_index; ++i) {
285 isl_set *dom;
286 isl_local_space *ls;
287 isl_aff *one;
288 isl_pw_aff *bound;
289 isl_set *size = i == 0 ? array : pa->extent;
291 bound = isl_set_dim_max(isl_set_copy(size), i);
292 assert(bound);
293 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
294 ls = isl_local_space_from_space(isl_set_get_space(dom));
295 one = isl_aff_zero_on_domain(ls);
296 one = isl_aff_add_constant_si(one, 1);
297 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
298 bound = isl_pw_aff_gist(bound, isl_set_copy(gen->context));
300 bounds[i] = bound;
303 collect_references(gen, &gen->array[gen->n_array]);
305 gen->n_array++;
307 isl_set_free(array);
308 return 0;
311 void collect_array_info(struct cuda_gen *gen)
313 isl_union_set *arrays;
315 arrays = isl_union_map_range(isl_union_map_copy(gen->read));
316 arrays = isl_union_set_union(arrays,
317 isl_union_map_range(isl_union_map_copy(gen->write)));
318 arrays = isl_union_set_coalesce(arrays);
320 gen->n_array = isl_union_set_n_set(arrays);
321 gen->array = isl_alloc_array(gen->ctx,
322 struct cuda_array_info, gen->n_array);
323 assert(gen->array);
324 gen->n_array = 0;
325 isl_union_set_foreach_set(arrays, &extract_array_info, gen);
326 isl_union_set_free(arrays);
329 static void free_array_info(struct cuda_gen *gen)
331 int i, j;
333 for (i = 0; i < gen->n_array; ++i) {
334 int n_index = gen->array[i].n_index;
335 free(gen->array[i].type);
336 free(gen->array[i].name);
337 for (j = 0; j < n_index; ++j) {
338 isl_pw_aff_free(gen->array[i].bound[j]);
339 isl_pw_aff_free(gen->array[i].local_bound[j]);
341 isl_space_free(gen->array[i].dim);
342 free(gen->array[i].bound);
343 free(gen->array[i].local_bound);
344 free(gen->array[i].refs);
346 free(gen->array);
349 /* Check if a cuda array is a scalar. A scalar is a value that is not stored
350 * as an array or through a pointer reference, but as a single data element. At
351 * the moment, scalars are represented as zero dimensional arrays.
353 static int cuda_array_is_scalar(struct cuda_array_info *array)
355 return (array->n_index == 0);
358 /* Is "array" a read-only scalar?
360 static int cuda_array_is_read_only_scalar(struct cuda_array_info *array)
362 return cuda_array_is_scalar(array) && array->read_only;
365 static void declare_device_arrays(struct cuda_gen *gen)
367 int i;
369 for (i = 0; i < gen->n_array; ++i) {
370 if (cuda_array_is_read_only_scalar(&gen->array[i]))
371 continue;
372 fprintf(gen->cuda.host_c, "%s *dev_%s;\n",
373 gen->array[i].type, gen->array[i].name);
375 fprintf(gen->cuda.host_c, "\n");
378 static void print_array_size(struct cuda_gen *gen, FILE *out,
379 struct cuda_array_info *array)
381 int i;
382 isl_printer *prn;
384 prn = isl_printer_to_file(gen->ctx, out);
385 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
386 for (i = 0; i < array->n_index; ++i) {
387 prn = isl_printer_print_str(prn, "(");
388 prn = isl_printer_print_pw_aff(prn, array->bound[i]);
389 prn = isl_printer_print_str(prn, ") * ");
391 prn = isl_printer_print_str(prn, "sizeof(");
392 prn = isl_printer_print_str(prn, array->type);
393 prn = isl_printer_print_str(prn, ")");
394 isl_printer_free(prn);
397 static void allocate_device_arrays(struct cuda_gen *gen)
399 int i;
401 for (i = 0; i < gen->n_array; ++i) {
402 if (cuda_array_is_read_only_scalar(&gen->array[i]))
403 continue;
404 fprintf(gen->cuda.host_c,
405 "cudaCheckReturn(cudaMalloc((void **) &dev_%s, ",
406 gen->array[i].name);
407 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
408 fprintf(gen->cuda.host_c, "));\n");
410 fprintf(gen->cuda.host_c, "\n");
413 static void free_device_arrays(struct cuda_gen *gen)
415 int i;
417 for (i = 0; i < gen->n_array; ++i) {
418 if (cuda_array_is_read_only_scalar(&gen->array[i]))
419 continue;
420 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaFree(dev_%s));\n",
421 gen->array[i].name);
425 static void copy_arrays_to_device(struct cuda_gen *gen)
427 int i;
429 for (i = 0; i < gen->n_array; ++i) {
430 isl_space *dim;
431 isl_set *read_i;
432 int empty;
434 if (cuda_array_is_read_only_scalar(&gen->array[i]))
435 continue;
437 dim = isl_space_copy(gen->array[i].dim);
438 read_i = isl_union_set_extract_set(gen->copy_in, dim);
439 empty = isl_set_fast_is_empty(read_i);
440 isl_set_free(read_i);
441 if (empty)
442 continue;
444 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(dev_%s,",
445 gen->array[i].name);
447 if (cuda_array_is_scalar(&(gen->array[i])))
448 fprintf(gen->cuda.host_c, " &%s, ",
449 gen->array[i].name);
450 else
451 fprintf(gen->cuda.host_c, " %s, ", gen->array[i].name);
453 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
454 fprintf(gen->cuda.host_c, ", cudaMemcpyHostToDevice));\n");
456 fprintf(gen->cuda.host_c, "\n");
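/* As an illustration (array name, bound and element type are hypothetical),
 * for a one-dimensional float array A with bound N this emits
 *
 *     cudaCheckReturn(cudaMemcpy(dev_A, A, (N) * sizeof(float), cudaMemcpyHostToDevice));
 *
 * into the host code.
 */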
459 static void copy_arrays_from_device(struct cuda_gen *gen)
461 int i;
462 isl_union_set *write;
463 write = isl_union_map_range(isl_union_map_copy(gen->write));
465 for (i = 0; i < gen->n_array; ++i) {
466 isl_space *dim;
467 isl_set *write_i;
468 int empty;
470 dim = isl_space_copy(gen->array[i].dim);
471 write_i = isl_union_set_extract_set(write, dim);
472 empty = isl_set_fast_is_empty(write_i);
473 isl_set_free(write_i);
474 if (empty)
475 continue;
477 fprintf(gen->cuda.host_c, "cudaCheckReturn(cudaMemcpy(");
478 if (cuda_array_is_scalar(&gen->array[i]))
479 fprintf(gen->cuda.host_c, "&%s, ", gen->array[i].name);
480 else
481 fprintf(gen->cuda.host_c, "%s, ", gen->array[i].name);
482 fprintf(gen->cuda.host_c, "dev_%s, ", gen->array[i].name);
483 print_array_size(gen, gen->cuda.host_c, &gen->array[i]);
484 fprintf(gen->cuda.host_c, ", cudaMemcpyDeviceToHost));\n");
487 isl_union_set_free(write);
488 fprintf(gen->cuda.host_c, "\n");
491 /* Internal data structure for extract_size_of_type.
492 * "type" specifies the name of the space that we want to extract.
493 * "res" is used to store the subset of that space.
495 struct ppcg_extract_size_data {
496 const char *type;
497 isl_set *res;
500 /* This function is called for each set in a union_set.
501 * If the name of the set matches data->type, we store the
502 * set in data->res.
504 static int extract_size_of_type(__isl_take isl_set *size, void *user)
506 struct ppcg_extract_size_data *data = user;
507 const char *name;
509 name = isl_set_get_tuple_name(size);
510 if (name && !strcmp(name, data->type)) {
511 data->res = size;
512 return -1;
515 isl_set_free(size);
516 return 0;
519 /* Given a union map { kernel[i] -> *[...] },
520 * return the range in the space called "type" for the kernel with
521 * sequence number "id".
523 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
524 const char *type, int id)
526 isl_space *space;
527 isl_set *dom;
528 isl_union_set *local_sizes;
529 struct ppcg_extract_size_data data = { type, NULL };
531 if (!sizes)
532 return NULL;
534 space = isl_union_map_get_space(sizes);
535 space = isl_space_set_from_params(space);
536 space = isl_space_add_dims(space, isl_dim_set, 1);
537 space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
538 dom = isl_set_universe(space);
539 dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
541 local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
542 isl_union_map_copy(sizes));
543 isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
544 isl_union_set_free(local_sizes);
545 return data.res;
548 /* Given a singleton set, extract the first (at most *len) elements
549 * of the single integer tuple into *sizes and update *len if needed.
551 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
553 int i;
554 int dim;
555 isl_int v;
557 if (!set)
558 return;
560 dim = isl_set_dim(set, isl_dim_set);
561 if (dim < *len)
562 *len = dim;
564 isl_int_init(v);
566 for (i = 0; i < *len; ++i) {
567 int ok;
569 ok = isl_set_plain_is_fixed(set, isl_dim_set, i, &v);
570 assert(ok);
572 sizes[i] = isl_int_get_si(v);
575 isl_int_clear(v);
577 isl_set_free(set);
580 /* Extract user specified "tile" sizes from the "sizes" command line option,
581 * defaulting to option->tile_size in each dimension.
583 static void read_tile_sizes(struct cuda_gen *gen)
585 int n;
586 isl_set *size;
588 gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
589 assert(gen->tile_size);
590 for (n = 0; n < gen->tile_len; ++n)
591 gen->tile_size[n] = gen->options->tile_size;
593 size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
594 read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
596 if (gen->n_parallel > gen->tile_len)
597 gen->n_parallel = gen->tile_len;
600 /* Extract user specified "block" sizes from the "sizes" command line option,
601 * after filling in some potentially useful defaults.
603 static void read_block_sizes(struct cuda_gen *gen)
605 int n;
606 isl_set *size;
608 n = gen->n_parallel;
609 gen->n_block = (n <= 3) ? n : 3;
610 switch (gen->n_block) {
611 case 1:
612 gen->block_dim[0] = 512;
613 break;
614 case 2:
615 gen->block_dim[0] = 32;
616 gen->block_dim[1] = 16;
617 break;
618 default:
619 gen->block_dim[0] = 32;
620 gen->block_dim[1] = 4;
621 gen->block_dim[2] = 4;
622 break;
625 size = extract_sizes(gen->sizes, "block", gen->kernel_id);
626 read_sizes_from_set(size, gen->block_dim, &gen->n_block);
629 /* Extract user specified "grid" sizes from the "sizes" command line option,
630 * after filling in some potentially useful defaults.
632 static void read_grid_sizes(struct cuda_gen *gen)
634 int n = gen->n_parallel;
635 isl_set *size;
637 gen->n_grid = (n <= 2) ? n : 2;
638 switch (gen->n_grid) {
639 case 1:
640 gen->grid_dim[0] = 32768;
641 break;
642 default:
643 gen->grid_dim[0] = 256;
644 gen->grid_dim[1] = 256;
645 break;
648 size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
649 read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
652 /* Extract user specified sizes from the "sizes" command line option
653 * after filling in some potentially useful defaults.
655 static void read_sizes(struct cuda_gen *gen)
657 read_tile_sizes(gen);
658 read_block_sizes(gen);
659 read_grid_sizes(gen);
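/* For example (sizes are hypothetical), passing the "sizes" command line
 * option
 *
 *     { kernel[0] -> tile[32, 32]; kernel[0] -> block[16, 16];
 *       kernel[0] -> grid[256, 256] }
 *
 * makes kernel 0 use 32x32 tiles, 16x16 thread blocks and a 256x256 grid,
 * while kernels that are not mentioned keep the defaults filled in above.
 */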
662 static void free_stmts(struct cuda_stmt *stmts, int n)
664 int i;
666 for (i = 0; i < n; ++i) {
667 struct cuda_stmt_access *access, *next;
669 for (access = stmts[i].accesses; access; access = next) {
670 next = access->next;
671 isl_map_free(access->access);
672 free(access);
675 isl_set_free(stmts[i].domain);
677 free(stmts);
680 void clear_cuda_gen(struct cuda_gen *gen)
682 free_stmts(gen->stmts, gen->n_stmts);
683 free_array_info(gen);
684 isl_union_map_free(gen->sizes);
685 isl_set_free(gen->context);
686 isl_union_set_free(gen->copy_in);
687 isl_union_map_free(gen->sched);
688 isl_union_map_free(gen->read);
689 isl_union_map_free(gen->write);
692 static void print_reverse_list(FILE *out, int len, int *list)
694 int i;
696 if (len == 0)
697 return;
699 fprintf(out, "(");
700 for (i = 0; i < len; ++i) {
701 if (i)
702 fprintf(out, ", ");
703 fprintf(out, "%d", list[len - 1 - i]);
705 fprintf(out, ")");
708 static void print_kernel_launch(struct cuda_gen *gen,
709 __isl_keep isl_union_set *arrays)
711 int i;
712 int first = 1;
713 unsigned nparam;
714 isl_space *dim;
716 print_indent(gen->code.dst, gen->code.indent);
717 fprintf(gen->code.dst, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
718 gen->kernel_id, gen->kernel_id, gen->kernel_id);
719 fprintf(gen->cuda.kernel_c, "__global__ void kernel%d(",
720 gen->kernel_id);
721 fprintf(gen->cuda.kernel_h, "__global__ void kernel%d(",
722 gen->kernel_id);
724 for (i = 0; i < gen->n_array; ++i) {
725 isl_space *dim;
726 isl_set *arr;
727 int empty;
729 dim = isl_space_copy(gen->array[i].dim);
730 arr = isl_union_set_extract_set(arrays, dim);
731 empty = isl_set_fast_is_empty(arr);
732 isl_set_free(arr);
733 if (empty)
734 continue;
736 if (!first) {
737 fprintf(gen->code.dst, ", ");
738 fprintf(gen->cuda.kernel_c, ", ");
739 fprintf(gen->cuda.kernel_h, ", ");
742 if (cuda_array_is_read_only_scalar(&gen->array[i])) {
743 fprintf(gen->code.dst, "%s", gen->array[i].name);
744 fprintf(gen->cuda.kernel_c, "%s %s",
745 gen->array[i].type, gen->array[i].name);
746 fprintf(gen->cuda.kernel_h, "%s %s",
747 gen->array[i].type, gen->array[i].name);
748 } else {
749 fprintf(gen->code.dst, "dev_%s", gen->array[i].name);
750 fprintf(gen->cuda.kernel_c, "%s *%s",
751 gen->array[i].type, gen->array[i].name);
752 fprintf(gen->cuda.kernel_h, "%s *%s",
753 gen->array[i].type, gen->array[i].name);
756 first = 0;
759 dim = isl_union_set_get_space(arrays);
760 nparam = isl_space_dim(dim, isl_dim_param);
761 for (i = 0; i < nparam; ++i) {
762 const char *name = isl_space_get_dim_name(dim, isl_dim_param, i);
763 if (!first) {
764 fprintf(gen->code.dst, ", ");
765 fprintf(gen->cuda.kernel_c, ", ");
766 fprintf(gen->cuda.kernel_h, ", ");
768 fprintf(gen->code.dst, "%s", name);
769 fprintf(gen->cuda.kernel_c, "int %s", name);
770 fprintf(gen->cuda.kernel_h, "int %s", name);
771 first = 0;
773 isl_space_free(dim);
775 for (i = 0; i < gen->tile_first; ++i) {
776 if (!first) {
777 fprintf(gen->code.dst, ", ");
778 fprintf(gen->cuda.kernel_c, ", ");
779 fprintf(gen->cuda.kernel_h, ", ");
781 fprintf(gen->code.dst, "h%d", i);
782 fprintf(gen->cuda.kernel_c, "int h%d", i);
783 fprintf(gen->cuda.kernel_h, "int h%d", i);
784 first = 0;
787 fprintf(gen->code.dst, ");\n");
788 fprintf(gen->cuda.kernel_c, ")\n");
789 fprintf(gen->cuda.kernel_h, ");\n");
791 fprintf(gen->code.dst, "cudaCheckKernel();\n");
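/* For a hypothetical kernel 0 that accesses a float array A and uses a
 * parameter N, this prints a launch of the form
 *
 *     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, N);
 *     cudaCheckKernel();
 *
 * in the host code and a matching declaration
 *
 *     __global__ void kernel0(float *A, int N)
 *
 * in the generated kernel source and header.
 */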
794 /* Construct a map from a domain of dimensionality "len"
795 * to a domain of dimensionality "len" + "tile_len" that tiles
796 * the "tile_len" coordinates starting at "first".
797 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
798 * "dim" prescribes the parameters.
800 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
801 int first, int tile_len, int *tile_size)
803 int i;
804 isl_int v;
805 isl_basic_map *bmap;
806 isl_constraint *c;
807 isl_local_space *ls;
809 isl_int_init(v);
811 dim = isl_space_add_dims(dim, isl_dim_in, len);
812 dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
813 bmap = isl_basic_map_universe(isl_space_copy(dim));
814 ls = isl_local_space_from_space(dim);
816 for (i = 0; i < len - tile_len; ++i) {
817 int j = i < first ? i : i + tile_len;
818 int k = i < first ? i : i + 2 * tile_len;
820 c = isl_equality_alloc(isl_local_space_copy(ls));
821 isl_int_set_si(v, -1);
822 isl_constraint_set_coefficient(c, isl_dim_in, j, v);
823 isl_int_set_si(v, 1);
824 isl_constraint_set_coefficient(c, isl_dim_out, k, v);
825 bmap = isl_basic_map_add_constraint(bmap, c);
828 for (i = 0; i < tile_len; ++i) {
829 c = isl_equality_alloc(isl_local_space_copy(ls));
830 isl_int_set_si(v, -1);
831 isl_constraint_set_coefficient(c, isl_dim_in, first + i, v);
832 isl_int_set_si(v, tile_size[i]);
833 isl_constraint_set_coefficient(c, isl_dim_out, first + i, v);
834 isl_int_set_si(v, 1);
835 isl_constraint_set_coefficient(c, isl_dim_out,
836 first + i + tile_len, v);
837 bmap = isl_basic_map_add_constraint(bmap, c);
839 c = isl_inequality_alloc(isl_local_space_copy(ls));
840 isl_int_set_si(v, 1);
841 isl_constraint_set_coefficient(c, isl_dim_out,
842 first + i + tile_len, v);
843 bmap = isl_basic_map_add_constraint(bmap, c);
845 c = isl_inequality_alloc(isl_local_space_copy(ls));
846 isl_int_set_si(v, -1);
847 isl_constraint_set_coefficient(c, isl_dim_out,
848 first + i + tile_len, v);
849 isl_int_set_si(v, tile_size[i] - 1);
850 isl_constraint_set_constant(c, v);
851 bmap = isl_basic_map_add_constraint(bmap, c);
854 isl_local_space_free(ls);
855 isl_int_clear(v);
857 return isl_map_from_basic_map(bmap);
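/* For example (hypothetical sizes), with len = 2, first = 0, tile_len = 2
 * and tile_size = { 32, 16 }, the constructed map is
 *
 *     [i, j] -> [floor(i/32), floor(j/16), i % 32, j % 16]
 *
 * i.e., the tile loops come first, followed by the point loops within a tile.
 */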
860 /* Construct a map from a domain of dimensionality "len"
861 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
862 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
863 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
864 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
865 * that are projected out at the end.
866 * "dim" prescribes the parameters.
868 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
869 int first, int wrap_len, int *wrap_size)
871 int i;
872 isl_basic_map *bmap;
873 isl_constraint *c;
874 isl_local_space *ls;
876 dim = isl_space_add_dims(dim, isl_dim_in, len);
877 dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
878 bmap = isl_basic_map_universe(isl_space_copy(dim));
879 ls = isl_local_space_from_space(dim);
881 for (i = 0; i < len; ++i) {
882 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
884 c = isl_equality_alloc(isl_local_space_copy(ls));
885 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
886 isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
887 bmap = isl_basic_map_add_constraint(bmap, c);
890 for (i = 0; i < wrap_len; ++i) {
891 c = isl_equality_alloc(isl_local_space_copy(ls));
892 isl_constraint_set_coefficient_si(c, isl_dim_out,
893 first + i, -1);
894 isl_constraint_set_coefficient_si(c, isl_dim_out,
895 first + wrap_len + i, 1);
896 isl_constraint_set_coefficient_si(c, isl_dim_out,
897 first + 2 * wrap_len + i, wrap_size[i]);
898 bmap = isl_basic_map_add_constraint(bmap, c);
900 c = isl_inequality_alloc(isl_local_space_copy(ls));
901 isl_constraint_set_coefficient_si(c, isl_dim_out,
902 first + wrap_len + i, 1);
903 bmap = isl_basic_map_add_constraint(bmap, c);
905 c = isl_inequality_alloc(isl_local_space_copy(ls));
906 isl_constraint_set_coefficient_si(c, isl_dim_out,
907 first + wrap_len + i, -1);
908 isl_constraint_set_constant_si(c, wrap_size[i] - 1);
909 bmap = isl_basic_map_add_constraint(bmap, c);
912 isl_local_space_free(ls);
914 bmap = isl_basic_map_project_out(bmap, isl_dim_out,
915 first + 2 * wrap_len, wrap_len);
917 return isl_map_from_basic_map(bmap);
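/* For example (hypothetical sizes), with len = 2, first = 0, wrap_len = 2
 * and wrap_size = { 32, 16 }, the constructed map is
 *
 *     [i, j] -> [i, j, i % 32, j % 16]
 *
 * i.e., the original coordinates are kept and the wrapped copies follow them.
 */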
920 /* Add "n" parameters named prefix%d.
922 static __isl_give isl_set *add_params( __isl_take isl_set *set,
923 int n, const char *prefix)
925 int i;
926 unsigned nparam;
927 char name[20];
929 nparam = isl_set_dim(set, isl_dim_param);
930 set = isl_set_add_dims(set, isl_dim_param, n);
932 for (i = 0; i < n; ++i) {
933 snprintf(name, sizeof(name), "%s%d", prefix, i);
934 set = isl_set_set_dim_name(set, isl_dim_param,
935 nparam + i, name);
938 return set;
941 /* Equate the "n" dimensions of "set" starting at "first" to
942 * freshly created parameters named prefix%d.
944 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
945 int first, int n, const char *prefix)
947 int i;
948 unsigned nparam;
949 isl_int v;
950 isl_space *dim;
951 isl_basic_set *bset;
952 isl_constraint *c;
953 isl_local_space *ls;
955 nparam = isl_set_dim(set, isl_dim_param);
957 set = add_params(set, n, prefix);
959 dim = isl_set_get_space(set);
960 bset = isl_basic_set_universe(isl_space_copy(dim));
961 ls = isl_local_space_from_space(dim);
963 isl_int_init(v);
965 for (i = 0; i < n; ++i) {
966 c = isl_equality_alloc(isl_local_space_copy(ls));
967 isl_int_set_si(v, -1);
968 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
969 isl_int_set_si(v, 1);
970 isl_constraint_set_coefficient(c, isl_dim_set, first + i, v);
971 bset = isl_basic_set_add_constraint(bset, c);
974 isl_int_clear(v);
975 isl_local_space_free(ls);
977 return isl_set_intersect(set, isl_set_from_basic_set(bset));
980 static __isl_give isl_set *parametrization(__isl_take isl_space *dim,
981 int len, int first, int n, const char *prefix)
983 isl_set *set;
985 dim = isl_space_add_dims(dim, isl_dim_set, len);
986 set = isl_set_universe(dim);
988 return parametrize(set, first, n, prefix);
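/* For example (hypothetical arguments), parametrization(dim, 4, 0, 2, "h")
 * returns a 4-dimensional universe set with two extra parameters h0 and h1
 * and with its first two set dimensions equated to h0 and h1.
 */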
991 /* Tile the B loops over the tile sizes and then tile/wrap
992 * the T1 loops over the blocks.
994 static __isl_give isl_union_map *tile_schedule(struct cuda_gen *gen,
995 __isl_take isl_union_map *sched)
997 isl_space *dim;
998 isl_map *tiling, *block_tiling;
1000 dim = isl_union_map_get_space(sched);
1001 tiling = tile(isl_space_copy(dim), gen->untiled_len,
1002 gen->tile_first, gen->tile_len, gen->tile_size);
1004 if (gen->options->wrap)
1005 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1006 gen->tile_first, gen->n_grid, gen->grid_dim);
1007 else
1008 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1009 gen->tile_first, gen->n_grid, gen->grid_dim);
1011 gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1013 tiling = isl_map_apply_range(tiling, block_tiling);
1015 sched = isl_union_map_apply_range(sched,
1016 isl_union_map_from_map(tiling));
1018 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1020 return sched;
1023 static __isl_give isl_union_map *parametrize_tiled_schedule(
1024 struct cuda_gen *gen, __isl_take isl_union_map *sched)
1026 isl_space *dim;
1027 isl_set *par;
1029 dim = isl_union_map_get_space(sched);
1030 par = parametrization(dim, gen->tiled_len, 0, gen->tile_first, "h");
1031 sched = isl_union_map_intersect_range(sched,
1032 isl_union_set_from_set(par));
1034 dim = isl_union_map_get_space(sched);
1035 par = parametrization(dim, gen->tiled_len,
1036 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1037 sched = isl_union_map_intersect_range(sched,
1038 isl_union_set_from_set(par));
1040 return sched;
1043 /* Tile/wrap the P1 loops over the threads.
1045 static __isl_give isl_union_map *thread_tile_schedule(struct cuda_gen *gen,
1046 __isl_take isl_union_map *sched)
1048 isl_space *dim;
1049 isl_map *tiling;
1050 isl_set *par;
1052 dim = isl_union_map_get_space(sched);
1054 if (gen->options->wrap)
1055 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1056 gen->shared_len, gen->n_block, gen->block_dim);
1057 else
1058 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1059 gen->shared_len, gen->n_block, gen->block_dim);
1060 gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1062 sched = isl_union_map_apply_range(sched,
1063 isl_union_map_from_map(tiling));
1065 par = parametrization(dim, gen->thread_tiled_len,
1066 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1067 gen->n_block, "t");
1068 sched = isl_union_map_intersect_range(sched,
1069 isl_union_set_from_set(par));
1071 gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1073 return sched;
1076 /* If the user asked for it, scale the shared memory tile loops
1077 * (T1T and T2) of "sched" by gen->tile_size[i].
1078 * If we are not performing "wrapping", then additionally scale the T1P
1079 * loops by gen->grid_dim[i].
1081 static __isl_give isl_union_map *scale_tile_loops(struct cuda_gen *gen,
1082 __isl_take isl_union_map *sched)
1084 int i;
1085 isl_space *dim;
1086 isl_basic_map *scale;
1087 isl_constraint *c;
1088 isl_local_space *ls;
1090 if (!gen->options->scale_tile_loops)
1091 return sched;
1093 dim = isl_union_map_get_space(sched);
1094 dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1095 dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1096 scale = isl_basic_map_universe(isl_space_copy(dim));
1097 ls = isl_local_space_from_space(dim);
1099 for (i = 0; i < gen->tiled_len; ++i) {
1100 int f = 1;
1102 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1103 f = gen->tile_size[i - gen->tile_first];
1104 if (!gen->options->wrap)
1105 f *= gen->grid_dim[i - gen->tile_first];
1106 } else if (i >= gen->tile_first + gen->n_grid &&
1107 i < gen->tile_first + gen->n_grid + gen->tile_len) {
1108 f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1111 c = isl_equality_alloc(isl_local_space_copy(ls));
1112 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1113 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1114 scale = isl_basic_map_add_constraint(scale, c);
1117 isl_local_space_free(ls);
1119 sched = isl_union_map_apply_range(sched,
1120 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1122 return sched;
1125 /* If we are not performing "wrapping" and if the user asked for it,
1126 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1128 static __isl_give isl_union_map *scale_thread_tile_loops(struct cuda_gen *gen,
1129 __isl_take isl_union_map *sched)
1131 int i;
1132 isl_space *dim;
1133 isl_basic_map *scale;
1134 isl_constraint *c;
1135 isl_local_space *ls;
1137 if (gen->options->wrap)
1138 return sched;
1139 if (!gen->options->scale_tile_loops)
1140 return sched;
1142 dim = isl_union_map_get_space(sched);
1143 dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1144 dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1145 scale = isl_basic_map_universe(isl_space_copy(dim));
1146 ls = isl_local_space_from_space(dim);
1148 for (i = 0; i < gen->thread_tiled_len; ++i) {
1149 int f = 1;
1151 if (i >= gen->shared_len &&
1152 i < gen->shared_len + gen->n_block)
1153 f = gen->block_dim[i - gen->shared_len];
1155 c = isl_equality_alloc(isl_local_space_copy(ls));
1156 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1157 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1158 scale = isl_basic_map_add_constraint(scale, c);
1161 isl_local_space_free(ls);
1163 sched = isl_union_map_apply_range(sched,
1164 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1166 return sched;
1169 /* If we are not performing "wrapping" and if the user asked for it,
1170 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1172 static __isl_give isl_union_map *scale_access_tile_loops(struct cuda_gen *gen,
1173 __isl_take isl_union_map *sched, int len, int first, int n_tile)
1175 int i;
1176 isl_space *dim;
1177 isl_basic_map *scale;
1178 isl_constraint *c;
1179 isl_local_space *ls;
1181 if (gen->options->wrap)
1182 return sched;
1183 if (!gen->options->scale_tile_loops)
1184 return sched;
1186 dim = isl_union_map_get_space(sched);
1187 dim = isl_space_add_dims(dim, isl_dim_in, len);
1188 dim = isl_space_add_dims(dim, isl_dim_out, len);
1189 scale = isl_basic_map_universe(isl_space_copy(dim));
1190 ls = isl_local_space_from_space(dim);
1192 for (i = 0; i < len; ++i) {
1193 int f = 1;
1195 if (i >= first && i < first + n_tile)
1196 f = gen->block_dim[i - first];
1198 c = isl_equality_alloc(isl_local_space_copy(ls));
1199 isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1200 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1201 scale = isl_basic_map_add_constraint(scale, c);
1204 isl_local_space_free(ls);
1206 sched = isl_union_map_apply_range(sched,
1207 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1209 return sched;
1212 /* If print_user_stmt is set, we want to print the statements ourselves,
1213 * instead of relying on the C preprocessor. If so, we need to use
1214 * the stop option so that the domains will be saved on the statement
1215 * nodes.
1217 static void print_cloog_shared_body(struct cuda_gen *gen,
1218 __isl_keep isl_set *context, __isl_keep isl_union_map *sched, int len,
1219 void (*print_user_stmt)(struct clast_printer_info *info,
1220 struct clast_user_stmt *s),
1221 int first_unroll)
1223 int i;
1224 CloogOptions *options;
1225 CloogDomain *cloog_context;
1226 CloogUnionDomain *ud;
1227 CloogInput *input;
1228 struct clast_stmt *stmt;
1229 char name[20];
1231 sched = isl_union_map_copy(sched);
1232 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
1234 options = cloog_options_malloc(gen->state);
1235 options->language = CLOOG_LANGUAGE_C;
1236 options->strides = 1;
1237 options->sh = 1;
1238 options->f = len;
1239 options->l = -1;
1240 options->override = 1;
1241 options->save_domains = 1;
1242 options->noscalars = 1;
1243 options->first_unroll = first_unroll;
1245 ud = cloog_union_domain_from_isl_union_map(sched);
1246 for (i = 0; i < len; ++i) {
1247 snprintf(name, sizeof(name), "c%d", i);
1248 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
1250 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
1251 input = cloog_input_alloc(cloog_context, ud);
1253 stmt = cloog_clast_create_from_input(input, options);
1255 gen->stmt_code.indent = gen->kernel_code.indent;
1256 gen->stmt_code.dst = gen->cuda.kernel_c;
1257 gen->stmt_code.print_user_stmt = print_user_stmt;
1258 gen->stmt_code.print_user_stmt_list = NULL;
1259 gen->stmt_code.print_for_head = NULL;
1260 gen->stmt_code.print_for_foot = NULL;
1261 gen->stmt_code.user = gen;
1262 print_clast(&gen->stmt_code, stmt);
1264 cloog_clast_free(stmt);
1265 cloog_options_free(options);
1268 /* Add "len" parameters p[i] called prefix%d,
1269 * with bounds 0 <= p[i] < size[i].
1271 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1272 int len, int *size, const char *prefix)
1274 int i;
1275 unsigned nparam;
1276 isl_int v;
1277 isl_space *dim;
1278 isl_basic_set *bset;
1279 isl_constraint *c;
1280 isl_local_space *ls;
1281 char name[20];
1283 nparam = isl_set_dim(set, isl_dim_param);
1284 set = isl_set_add_dims(set, isl_dim_param, len);
1286 for (i = 0; i < len; ++i) {
1287 snprintf(name, sizeof(name), "%s%d", prefix, i);
1288 set = isl_set_set_dim_name(set, isl_dim_param,
1289 nparam + i, name);
1292 dim = isl_set_get_space(set);
1293 bset = isl_basic_set_universe(isl_space_copy(dim));
1294 ls = isl_local_space_from_space(dim);
1296 isl_int_init(v);
1298 for (i = 0; i < len; ++i) {
1299 c = isl_inequality_alloc(isl_local_space_copy(ls));
1300 isl_int_set_si(v, 1);
1301 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1302 bset = isl_basic_set_add_constraint(bset, c);
1304 c = isl_inequality_alloc(isl_local_space_copy(ls));
1305 isl_int_set_si(v, -1);
1306 isl_constraint_set_coefficient(c, isl_dim_param, nparam + i, v);
1307 isl_int_set_si(v, size[i] - 1);
1308 isl_constraint_set_constant(c, v);
1309 bset = isl_basic_set_add_constraint(bset, c);
1312 isl_int_clear(v);
1313 isl_local_space_free(ls);
1315 return isl_set_intersect(set, isl_set_from_basic_set(bset));
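/* For example (hypothetical sizes), add_bounded_parameters(set, 2,
 * gen->block_dim, "t") with block_dim = { 16, 8 } adds parameters t0 and t1
 * together with the constraints 0 <= t0 <= 15 and 0 <= t1 <= 7.
 */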
1318 static void print_shared_body(struct cuda_gen *gen,
1319 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched,
1320 int len, void (*print_user_stmt)(struct clast_printer_info *info,
1321 struct clast_user_stmt *s),
1322 int first_unroll)
1324 isl_set *context;
1326 context = isl_set_copy(shared_domain);
1327 context = parametrize(context, 0, gen->shared_len, "g");
1328 context = isl_set_project_out(context, isl_dim_set, 0, gen->shared_len);
1329 context = add_bounded_parameters(context,
1330 gen->n_block, gen->block_dim, "t");
1332 print_cloog_shared_body(gen, context, sched, len, print_user_stmt,
1333 first_unroll);
1335 isl_set_free(context);
1338 /* Given a tile of an array, construct a map that maps each element
1339 * of the tile to a copy of the tile shifted to the origin
1340 * (based on the lower bounds in group->private_bound or group->shared_bound).
1341 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1342 * is applied to the index first.
1343 * The domain of the resulting map is "access",
1344 * while the range space is anonymous.
1346 static __isl_give isl_map *shift_access(__isl_take isl_set *access,
1347 struct cuda_array_ref_group *group)
1349 int i;
1350 isl_space *dim;
1351 isl_basic_set *bset;
1352 isl_basic_map *bmap;
1353 isl_aff *lb;
1354 isl_basic_set *offset;
1355 isl_basic_map *shift;
1356 isl_basic_map *pre_shift;
1357 isl_map *sched;
1358 const char *name;
1359 struct cuda_array_bound *bounds;
1360 int n_index = group->array->n_index;
1362 bounds = group->private_bound;
1363 if (!bounds)
1364 bounds = group->shared_bound;
1366 dim = isl_set_get_space(access);
1367 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1368 offset = isl_basic_set_universe(dim);
1369 for (i = 0; i < n_index; ++i) {
1370 lb = isl_aff_copy(bounds[i].lb);
1371 bmap = isl_basic_map_from_aff(lb);
1372 bset = isl_basic_map_range(bmap);
1373 offset = isl_basic_set_flat_product(offset, bset);
1375 offset = isl_basic_set_neg(offset);
1377 dim = isl_space_map_from_set(isl_set_get_space(access));
1378 shift = isl_basic_map_identity(dim);
1379 shift = isl_basic_map_set_tuple_name(shift, isl_dim_out, NULL);
1381 bset = isl_basic_set_universe(isl_set_get_space(access));
1382 bmap = isl_basic_map_from_domain_and_range(bset, offset);
1384 shift = isl_basic_map_sum(shift, bmap);
1386 dim = isl_set_get_space(access);
1387 dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_index);
1388 dim = isl_space_map_from_set(dim);
1389 pre_shift = isl_basic_map_universe(isl_space_copy(dim));
1390 dim = isl_space_add_dims(dim, isl_dim_in, 1);
1391 dim = isl_space_add_dims(dim, isl_dim_out, 1);
1392 for (i = 0; i < n_index; ++i) {
1393 if (!bounds[i].shift_map)
1394 bmap = isl_basic_map_identity(isl_space_copy(dim));
1395 else
1396 bmap = isl_basic_map_copy(bounds[i].shift_map);
1397 pre_shift = isl_basic_map_flat_product(pre_shift, bmap);
1399 isl_space_free(dim);
1400 name = isl_basic_map_get_tuple_name(shift, isl_dim_in);
1401 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_in, name);
1402 pre_shift = isl_basic_map_set_tuple_name(pre_shift, isl_dim_out, name);
1403 shift = isl_basic_map_apply_range(pre_shift, shift);
1405 sched = isl_map_from_basic_map(shift);
1406 sched = isl_map_intersect_domain(sched, access);
1408 return sched;
1411 /* Construct a schedule for iterating over all elements in the given
1412 * piece of an array. The schedule iterates over a copy of the piece
1413 * that is shifted to the origin.
1414 * We subsequently also perform the tiling/wrapping over the threads.
1416 * In particular, we tile the final iterators so that the final thread
1417 * dimension runs over the final array dimension.
1418 * However, if those final iterators have only a single iteration,
1419 * we try to tile earlier iterators instead.
1421 static __isl_give isl_union_map *access_schedule(struct cuda_gen *gen,
1422 __isl_take isl_set *access, struct cuda_array_ref_group *group)
1424 isl_space *dim;
1425 isl_map *sched;
1426 isl_union_map *usched;
1427 isl_map *tiling;
1428 isl_set *par;
1429 unsigned nvar = isl_set_dim(access, isl_dim_set);
1430 int n_tile;
1431 int first;
1433 sched = shift_access(access, group);
1435 n_tile = gen->n_block;
1436 if (n_tile > nvar) {
1437 int i;
1438 sched = isl_map_insert_dims(sched,
1439 isl_dim_out, 0, n_tile - nvar);
1440 for (i = 0; i < n_tile - nvar; ++i)
1441 sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1442 nvar = n_tile;
1445 first = nvar - n_tile;
1447 for (; first > 0; first --)
1448 if (!isl_map_plain_is_fixed(sched, isl_dim_out,
1449 first + n_tile - 1, NULL))
1450 break;
1452 dim = isl_map_get_space(sched);
1453 dim = isl_space_params(dim);
1454 if (gen->options->wrap)
1455 tiling = wrap(isl_space_copy(dim), nvar, first,
1456 n_tile, gen->block_dim);
1457 else
1458 tiling = tile(isl_space_copy(dim), nvar, first,
1459 n_tile, gen->block_dim);
1460 sched = isl_map_apply_range(sched, tiling);
1462 par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1463 usched = isl_union_map_from_map(sched);
1464 usched = isl_union_map_intersect_range(usched,
1465 isl_union_set_from_set(par));
1467 usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1468 first, n_tile);
1470 return usched;
1473 /* Print an access to the element in the global memory copy of the
1474 * given array that corresponds to the element described by "pma"
1475 * of the original array.
1476 * The copy in global memory has been linearized, so we need to take
1477 * the array size into account.
1479 static void print_global_index(FILE *out,
1480 struct cuda_array_info *array, __isl_keep isl_pw_multi_aff *pma,
1481 __isl_keep isl_set *domain)
1483 int i;
1484 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1485 isl_printer *prn;
1487 if (cuda_array_is_scalar(array)) {
1488 fprintf(out, "*%s", array->name);
1489 return;
1492 fprintf(out, "%s[", array->name);
1493 prn = isl_printer_to_file(ctx, out);
1494 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1495 for (i = 0; i + 1 < array->n_index; ++i)
1496 prn = isl_printer_print_str(prn, "(");
1497 for (i = 0; i < array->n_index; ++i) {
1498 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1499 pa = isl_pw_aff_coalesce(pa);
1500 pa = isl_pw_aff_gist(pa, isl_set_copy(domain));
1501 if (i) {
1502 prn = isl_printer_print_str(prn, ") * (");
1503 prn = isl_printer_print_pw_aff(prn,
1504 array->local_bound[i]);
1505 prn = isl_printer_print_str(prn, ") + ");
1507 prn = isl_printer_print_pw_aff(prn, pa);
1508 isl_pw_aff_free(pa);
1510 isl_printer_free(prn);
1511 fprintf(out, "]");
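/* For a hypothetical two-dimensional array A with bounds N and M, an access
 * to element (i, j) is thus printed as
 *
 *     A[(i) * (M) + j]
 *
 * on the linearized global copy.
 */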
1514 /* Given an index expression into a tile of an array, adjust the expression
1515 * to a shift of the tile to the origin
1516 * (based on the lower bounds in array->shared_bound).
1517 * If the index is strided, then we first add
1518 * bound->shift and divide by bound->stride.
1520 static __isl_give isl_pw_aff *shift_index(__isl_take isl_pw_aff *pa,
1521 struct cuda_array_info *array,
1522 struct cuda_array_bound *bound, __isl_take isl_set *domain)
1524 isl_aff *lb;
1525 isl_pw_aff *tmp;
1527 if (bound->shift) {
1528 isl_aff *shift;
1529 shift = bound->shift;
1530 shift = isl_aff_copy(shift);
1531 shift = isl_aff_project_domain_on_params(shift);
1532 shift = isl_aff_align_params(shift, isl_pw_aff_get_space(pa));
1533 tmp = isl_pw_aff_alloc(isl_set_copy(domain), shift);
1534 pa = isl_pw_aff_add(pa, tmp);
1535 pa = isl_pw_aff_scale_down(pa, bound->stride);
1538 lb = isl_aff_copy(bound->lb);
1539 lb = isl_aff_project_domain_on_params(lb);
1541 lb = isl_aff_align_params(lb, isl_pw_aff_get_space(pa));
1543 tmp = isl_pw_aff_alloc(isl_set_copy(domain), lb);
1544 pa = isl_pw_aff_sub(pa, tmp);
1545 pa = isl_pw_aff_coalesce(pa);
1546 pa = isl_pw_aff_gist(pa, domain);
1548 return pa;
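/* For example (hypothetical values), if bound->lb is 32 * b0, bound->stride
 * is 4 and bound->shift is 1, then an index expression i is rewritten to
 * (i + 1)/4 - 32 * b0.
 */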
1551 /* Print an access to the element in the private/shared memory copy of the
1552 * given array reference group that corresponds to the element described
1553 * by "pma" of the original array.
1554 * Since the array in private/shared memory is just a shifted copy of part
1555 * of the original array, we simply need to subtract the lower bound,
1556 * which was computed in can_tile_for_shared_memory.
1557 * If any of the indices is strided, then we first add
1558 * bounds[i].shift and divide by bounds[i].stride.
1560 static void print_local_index(FILE *out,
1561 struct cuda_array_ref_group *group, struct cuda_array_bound *bounds,
1562 __isl_keep isl_pw_multi_aff *pma, __isl_keep isl_set *domain)
1564 int i;
1565 isl_ctx *ctx = isl_pw_multi_aff_get_ctx(pma);
1566 isl_printer *prn;
1567 struct cuda_array_info *array = group->array;
1569 print_array_name(out, group);
1570 for (i = 0; i < array->n_index; ++i) {
1571 isl_pw_aff *pa = isl_pw_multi_aff_get_pw_aff(pma, i);
1573 pa = shift_index(pa, array, &bounds[i], isl_set_copy(domain));
1575 fprintf(out, "[");
1576 prn = isl_printer_to_file(ctx, out);
1577 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
1578 prn = isl_printer_print_pw_aff(prn, pa);
1579 isl_printer_free(prn);
1580 fprintf(out, "]");
1581 isl_pw_aff_free(pa);
1585 /* This function is called for each leaf in the clast of the code
1586 * for copying to or from shared/private memory.
1587 * The statement name is {read,write}_{shared,private}_<array>.
1589 * The schedule iterates over the array elements, so we can use
1590 * the domain of copy_sched at the current scheduling position
1591 * as the index of the array.
1593 static void print_copy_statement(struct clast_printer_info *code,
1594 struct clast_user_stmt *u)
1596 struct cuda_gen *gen = code->user;
1597 isl_set *domain;
1598 isl_map *sched;
1599 struct cuda_array_ref_group *group = gen->copy_group;
1600 struct cuda_array_bound *bounds = gen->copy_bound;
1601 unsigned n_in;
1602 unsigned n_out;
1603 isl_space *dim;
1604 isl_set *param;
1605 isl_set *index;
1606 isl_pw_multi_aff *pma;
1607 int read;
1609 read = !strncmp(u->statement->name, "read", 4);
1611 domain = extract_host_domain(u);
1612 assert(domain);
1614 sched = isl_map_copy(gen->copy_sched);
1615 sched = isl_map_reverse(sched);
1616 sched = isl_map_intersect_domain(sched, domain);
1617 n_in = isl_map_dim(sched, isl_dim_in);
1618 n_out = isl_map_dim(sched, isl_dim_out);
1619 dim = isl_map_get_space(sched);
1620 dim = isl_space_drop_dims(dim, isl_dim_in, 0, n_in);
1621 dim = isl_space_drop_dims(dim, isl_dim_out, 0, n_out);
1622 param = parametrization(dim, n_in, 0, n_in, "c");
1623 sched = isl_map_align_params(sched, isl_set_get_space(param));
1624 sched = isl_map_intersect_domain(sched, param);
1625 index = isl_map_range(sched);
1626 domain = isl_set_copy(index);
1627 pma = isl_pw_multi_aff_from_set(index);
1628 pma = isl_pw_multi_aff_coalesce(pma);
1629 domain = isl_set_params(domain);
1631 print_indent(code->dst, code->indent);
1632 if (read) {
1633 print_local_index(code->dst, group, bounds, pma, domain);
1634 fprintf(code->dst, " = ");
1635 print_global_index(code->dst, group->array, pma, domain);
1636 } else {
1637 print_global_index(code->dst, group->array, pma, domain);
1638 fprintf(code->dst, " = ");
1639 print_local_index(code->dst, group, bounds, pma, domain);
1641 fprintf(code->dst, ";\n");
1643 isl_pw_multi_aff_free(pma);
1644 isl_set_free(domain);
1647 static void print_shared_access(struct cuda_gen *gen,
1648 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
1649 const char *type, struct cuda_array_ref_group *group)
1651 const char *array_name;
1652 char *name;
1653 isl_ctx *ctx;
1654 isl_union_map *sched;
1655 unsigned nvar = isl_set_dim(access, isl_dim_set);
1656 int n_tile;
1658 ctx = isl_set_get_ctx(access);
1659 array_name = isl_set_get_tuple_name(access);
1660 name = isl_alloc_array(ctx, char,
1661 strlen(type) + sizeof("_shared_") + strlen(array_name) + 20);
1662 if (group->array->n_group > 1)
1663 sprintf(name, "%s_shared_%s_%d", type, array_name, group->nr);
1664 else
1665 sprintf(name, "%s_shared_%s", type, array_name);
1666 access = isl_set_set_tuple_name(access, name);
1667 free(name);
1669 sched = access_schedule(gen, access, group);
1671 n_tile = gen->n_block;
1672 if (n_tile > nvar)
1673 n_tile = nvar;
1675 gen->copy_sched = isl_map_from_union_map(isl_union_map_copy(sched));
1676 gen->copy_group = group;
1677 gen->copy_bound = group->shared_bound;
1679 print_shared_body(gen, shared_domain, sched, nvar + n_tile,
1680 &print_copy_statement, -1);
1682 isl_union_map_free(sched);
1683 isl_map_free(gen->copy_sched);
1686 /* Return the union of all read (read = 1) and/or write (write = 1)
1687 * access relations in the group.
1689 static __isl_give isl_union_map *group_access_relation(
1690 struct cuda_array_ref_group *group, int read, int write)
1692 int i;
1693 isl_union_map *access;
1695 access = isl_union_map_empty(isl_map_get_space(group->access));
1696 for (i = 0; i < group->n_ref; ++i) {
1697 isl_map *map_i;
1699 if (!((read && group->refs[i]->read) ||
1700 (write && group->refs[i]->write)))
1701 continue;
1702 map_i = isl_map_copy(group->refs[i]->access);
1703 access = isl_union_map_union(access,
1704 isl_union_map_from_map(map_i));
1707 return access;
1710 /* Check that none of the shared memory tiles involve any strides.
1712 static int no_strides(struct cuda_array_ref_group *group)
1714 int i;
1715 int n_index = group->array->n_index;
1717 for (i = 0; i < n_index; ++i)
1718 if (group->shared_bound[i].shift)
1719 return 0;
1721 return 1;
1724 /* Return a set containing the values of the given index i
1725 * of the elements in the array tile in global memory that corresponds
1726 * to the shared memory copy.
1727 * In particular, if a is the index, we return a set with constraints
1729 * tile_offset <= a <= tile_offset + tile_size - 1
1731 * and
1733 * 0 <= a <= array_size - 1
1736 static __isl_give isl_set *group_tile_dim(struct cuda_array_ref_group *group,
1737 int i)
1739 isl_basic_set *tile;
1740 isl_aff *aff;
1741 isl_constraint *c;
1742 isl_local_space *ls;
1743 isl_pw_aff *bound;
1744 isl_set *dom;
1745 isl_set *tile_set;
1747 aff = isl_aff_copy(group->shared_bound[i].lb);
1748 aff = isl_aff_add_dims(aff, isl_dim_in, 1);
1749 ls = isl_aff_get_domain_local_space(aff);
1750 aff = isl_aff_neg(aff);
1751 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1752 c = isl_inequality_from_aff(isl_aff_copy(aff));
1753 tile = isl_basic_set_from_constraint(c);
1755 aff = isl_aff_neg(aff);
1756 aff = isl_aff_add_constant(aff, group->shared_bound[i].size);
1757 aff = isl_aff_add_constant_si(aff, -1);
1758 c = isl_inequality_from_aff(aff);
1759 tile = isl_basic_set_add_constraint(tile, c);
1761 aff = isl_aff_zero_on_domain(ls);
1762 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1763 c = isl_inequality_from_aff(aff);
1764 tile = isl_basic_set_add_constraint(tile, c);
1766 bound = isl_pw_aff_copy(group->array->bound[i]);
1767 bound = isl_pw_aff_add_dims(bound, isl_dim_in, 1);
1768 ls = isl_local_space_from_space(isl_pw_aff_get_domain_space(bound));
1769 aff = isl_aff_zero_on_domain(ls);
1770 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1771 aff = isl_aff_add_constant_si(aff, 1);
1772 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
1774 tile_set = isl_pw_aff_ge_set(bound, isl_pw_aff_alloc(dom, aff));
1775 tile_set = isl_set_align_params(tile_set, isl_basic_set_get_space(tile));
1776 tile_set = isl_set_intersect(tile_set, isl_set_from_basic_set(tile));
1778 return tile_set;
1781 /* Return a set containing the elements in the array tile in
1782 * global memory that corresponds to the shared memory copy.
1784 static __isl_give isl_set *group_tile(struct cuda_array_ref_group *group)
1786 int i;
1787 int n_index = group->array->n_index;
1788 isl_set *tile;
1790 tile = group_tile_dim(group, 0);
1791 for (i = 1; i < n_index; ++i) {
1792 isl_set *tile_i;
1794 tile_i = group_tile_dim(group, i);
1795 tile = isl_set_flat_product(tile, tile_i);
1798 tile = isl_set_set_tuple_name(tile, group->array->name);
1800 return tile;
1803 /* Print code for reading into or writing from shared memory
1804 * the given array reference group.
1806 * sched maps the original iteration domains to the shared memory tile loops.
1808 * If we are performing a read from global memory to shared memory,
1809 * if the array involved is not a scalar and if the definition of the
1810 * shared memory tiles does not involve any strides, then we copy
1811 * the entire tile to shared memory. This may result in some extra
1812 * elements getting copied, but it should lead to simpler code
1813 * (which means that fewer registers may be needed) and less divergence.
1815 * Otherwise, we only copy the elements that will be read or have been written
1816 * in the kernel.
1818 * Note that the requirement that there be no strides can easily be lifted.
1819 * We would just need to add constraints of the form
1821 * shift + a = stride * alpha
1823 static int print_group_shared_accesses(struct cuda_gen *gen,
1824 struct cuda_array_ref_group *group, const char *type,
1825 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *sched)
1827 int read;
1828 isl_union_map *access;
1829 isl_union_set *uset;
1830 isl_set *access_set;
1832 if (group->private_bound)
1833 return 0;
1834 if (!group->shared_bound)
1835 return 0;
1837 read = !strcmp(type, "read");
1839 access = group_access_relation(group, read, !read);
1840 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
1841 uset = isl_union_map_range(access);
1843 if (isl_union_set_is_empty(uset)) {
1844 isl_union_set_free(uset);
1845 return 0;
1848 if (read && group->array->n_index > 0 && no_strides(group)) {
1849 isl_union_set_free(uset);
1850 access_set = group_tile(group);
1851 print_shared_access(gen, shared_domain, access_set,
1852 type, group);
1853 return 1;
1856 access_set = isl_set_from_union_set(uset);
1857 access_set = isl_set_coalesce(access_set);
1859 print_shared_access(gen, shared_domain, access_set, type, group);
1861 return 1;
1864 /* Print code for reading into or writing from shared memory at
1865 * the given level (-1 for innermost).
1867 * If we are not printing at the innermost level, then the dimensionality
1868 * of shared_domain may be smaller than gen->shared_len.
1869 * As the rest of the code assumes that the domain of access has
1870 * gen->shared_len dimensions, we therefore may need to embed this domain
1871 * in a higher dimensional space after intersection with shared_domain.
1873 static void print_shared_accesses(struct cuda_gen *gen,
1874 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
1875 const char *type, int level)
1877 int i, j;
1878 isl_space *dim;
1879 isl_map *proj;
1880 isl_set *par;
1881 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
1882 int sync = 0;
1883 isl_union_map *sched;
1885 shared_domain = isl_set_copy(shared_domain);
1886 sched = isl_union_map_copy(gen->tiled_sched);
1887 dim = isl_union_map_get_space(sched);
1888 proj = projection(dim, gen->tiled_len, shared_len);
1889 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1890 sched = isl_union_map_intersect_range(sched,
1891 isl_union_set_from_set(isl_set_copy(shared_domain)));
1892 if (shared_len != gen->shared_len) {
1893 dim = isl_union_map_get_space(sched);
1894 proj = projection(dim, gen->shared_len, shared_len);
1895 proj = isl_map_reverse(proj);
1896 shared_domain = isl_set_apply(shared_domain,
1897 isl_map_copy(proj));
1898 sched = isl_union_map_apply_range(sched,
1899 isl_union_map_from_map(proj));
1902 dim = isl_union_map_get_space(sched);
1903 par = parametrization(dim, gen->shared_len, 0, gen->shared_len, "g");
1904 sched = isl_union_map_intersect_range(sched,
1905 isl_union_set_from_set(par));
1907 for (i = 0; i < gen->n_array; ++i) {
1908 struct cuda_array_info *array = &gen->array[i];
1910 for (j = 0; j < array->n_group; ++j) {
1911 if (array->groups[j]->print_shared_level != level)
1912 continue;
1914 if (print_group_shared_accesses(gen, array->groups[j],
1915 type, shared_domain, sched))
1916 sync = 1;
1920 isl_union_map_free(sched);
1921 isl_set_free(shared_domain);
1923 if (sync) {
1924 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
1925 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
1929 /* This function is called for each access to an array in some statement
1930 * in the original code.
1931 * Replace that access by an access to shared or (linearized) global memory.
1932 * Since the array in shared memory is just
1933 * a shifted copy of part of the original array, we simply need
1934 * to subtract the lower bound, which was computed
1935 * in can_tile_for_shared_memory.
1936 * If any of the indices is strided, then we first add
1937 * shared_bound[i].shift and divide by shared_bound[i].stride.
1939 * If the given array is accessed directly from global memory,
1940 * we don't need to perform any shifting and simply simplify
1941 * the expression in the context of the domain instead.
1943 * If the array space (range of access) has no name, then we are
1944 * accessing an iterator in the original program.
1946 static void print_access(struct cuda_gen *gen, __isl_take isl_map *access,
1947 int group_nr)
1949 int i;
1950 const char *name;
1951 unsigned n_index;
1952 struct cuda_array_info *array = NULL;
1953 isl_printer *prn;
1954 isl_pw_multi_aff *pma;
1955 isl_set *data_set;
1956 isl_set *domain;
1957 struct cuda_array_bound *bounds = NULL;
1959 access = isl_map_align_params(access,
1960 isl_set_get_space(gen->stmt_domain));
1962 data_set = isl_set_apply(isl_set_copy(gen->stmt_domain), access);
1964 name = isl_set_get_tuple_name(data_set);
1966 if (!name)
1967 fprintf(gen->cuda.kernel_c, "(");
1968 else {
1969 struct cuda_array_ref_group *group;
1971 for (i = 0; i < gen->n_array; ++i) {
1972 if (strcmp(name, gen->array[i].name))
1973 continue;
1974 array = &gen->array[i];
1976 assert(array);
1977 group = array->groups[group_nr];
1978 bounds = group->private_bound;
1979 if (!bounds)
1980 bounds = group->shared_bound;
1982 if (!bounds && cuda_array_is_scalar(array) && !array->read_only)
1983 fprintf(gen->cuda.kernel_c, "*");
1984 print_array_name(gen->cuda.kernel_c, group);
1986 if (cuda_array_is_scalar(array)) {
1987 isl_set_free(data_set);
1988 return;
1991 fprintf(gen->cuda.kernel_c, "[");
1995 n_index = isl_set_dim(data_set, isl_dim_set);
1996 pma = isl_pw_multi_aff_from_set(data_set);
1997 pma = isl_pw_multi_aff_coalesce(pma);
1999 prn = isl_printer_to_file(gen->ctx, gen->cuda.kernel_c);
2000 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
2002 if (!bounds)
2003 for (i = 0; i + 1 < n_index; ++i)
2004 prn = isl_printer_print_str(prn, "(");
2006 for (i = 0; i < n_index; ++i) {
2007 isl_pw_aff *index;
2009 index = isl_pw_multi_aff_get_pw_aff(pma, i);
2011 if (!array) {
2012 prn = isl_printer_print_pw_aff(prn, index);
2013 isl_pw_aff_free(index);
2014 continue;
2017 domain = isl_set_copy(gen->stmt_domain);
2018 domain = isl_set_params(domain);
2019 if (!bounds) {
2020 index = isl_pw_aff_coalesce(index);
2021 index = isl_pw_aff_gist(index, domain);
2022 } else
2023 index = shift_index(index, array, &bounds[i], domain);
2025 if (i) {
2026 if (!bounds) {
2027 prn = isl_printer_print_str(prn, ") * (");
2028 prn = isl_printer_print_pw_aff(prn,
2029 array->local_bound[i]);
2030 prn = isl_printer_print_str(prn, ") + ");
2031 } else
2032 prn = isl_printer_print_str(prn, "][");
2034 prn = isl_printer_print_pw_aff(prn, index);
2035 isl_pw_aff_free(index);
2037 if (!name)
2038 prn = isl_printer_print_str(prn, ")");
2039 else
2040 prn = isl_printer_print_str(prn, "]");
2041 isl_printer_free(prn);
2043 isl_pw_multi_aff_free(pma);
2046 struct cuda_access_print_info {
2047 struct cuda_gen *gen;
2048 struct cuda_stmt_access *access;
2051 /* To print the cuda accesses we walk the list of cuda accesses simultaneously
2052 * with the pet printer. This means that whenever the pet printer prints a
2053 * pet access expression we have the corresponding cuda access available and can
2054 * print the modified access.
2056 static void print_cuda_access(struct pet_expr *expr, void *usr)
2058 struct cuda_access_print_info *info =
2059 (struct cuda_access_print_info *) usr;
2060 print_access(info->gen, isl_map_copy(info->access->access),
2061 info->access->group);
2062 info->access = info->access->next;
2065 static void print_stmt_body(struct cuda_gen *gen,
2066 FILE *out, struct cuda_stmt *stmt)
2068 struct cuda_access_print_info info;
2070 info.gen = gen;
2071 info.access = stmt->accesses;
2073 print_pet_expr(out, stmt->body, print_cuda_access, &info);
2074 fprintf(out, ";\n");
2077 /* This function is called for each leaf in the innermost clast,
2078 * i.e., for each statement.
2079 * We print the statement body, simplifying the accesses based
2080 * on the schedule.
2082 static void print_statement(struct clast_printer_info *code,
2083 struct clast_user_stmt *u)
2085 struct cuda_gen *gen = code->user;
2086 isl_space *dim;
2087 isl_set *par;
2088 isl_set *stmt_domain;
2089 isl_union_map *stmt_sched;
2090 isl_union_set *uset;
2091 int nr;
2092 struct cuda_stmt *stmt;
2094 nr = atoi(u->statement->name + 2);
2095 stmt = &gen->stmts[nr];
2097 stmt_domain = extract_host_domain(u);
2099 stmt_sched = isl_union_map_intersect_range(
2100 isl_union_map_copy(gen->local_sched),
2101 isl_union_set_from_set(extend(stmt_domain,
2102 gen->thread_tiled_len)));
2103 dim = isl_union_map_get_space(stmt_sched);
2104 par = parametrization(dim, gen->thread_tiled_len, 0,
2105 gen->thread_tiled_len, "c");
2106 stmt_sched = isl_union_map_intersect_range(stmt_sched,
2107 isl_union_set_from_set(par));
2109 uset = isl_union_map_domain(stmt_sched);
2110 dim = isl_union_set_get_space(uset);
2111 dim = isl_space_add_dims(dim, isl_dim_set,
2112 isl_set_dim(stmt->domain, isl_dim_set));
2113 dim = isl_space_set_tuple_name(dim, isl_dim_set, u->statement->name);
2114 gen->stmt_domain = isl_union_set_extract_set(uset, dim);
2115 isl_union_set_free(uset);
2117 print_indent(code->dst, code->indent);
2118 print_stmt_body(gen, code->dst, stmt);
2120 isl_set_free(gen->stmt_domain);
2123 static void print_private_access(struct cuda_gen *gen,
2124 __isl_keep isl_set *shared_domain, __isl_take isl_set *access,
2125 const char *type, struct cuda_array_ref_group *group)
2127 const char *array_name;
2128 char *name;
2129 isl_ctx *ctx;
2130 unsigned nvar = isl_set_dim(access, isl_dim_set);
2131 isl_union_map *usched;
2133 if (isl_set_fast_is_empty(access)) {
2134 isl_set_free(access);
2135 return;
2138 ctx = isl_set_get_ctx(access);
2139 array_name = isl_set_get_tuple_name(access);
2140 name = isl_alloc_array(ctx, char,
2141 strlen(type) + sizeof("_private_") + strlen(array_name) + 20);
2142 if (group->array->n_group > 1)
2143 sprintf(name, "%s_private_%s_%d", type, array_name, group->nr);
2144 else
2145 sprintf(name, "%s_private_%s", type, array_name);
2146 access = isl_set_set_tuple_name(access, name);
2147 free(name);
2149 gen->copy_sched = shift_access(access, group);
2150 gen->copy_group = group;
2151 gen->copy_bound = group->private_bound;
2153 usched = isl_union_map_from_map(isl_map_copy(gen->copy_sched));
2154 print_shared_body(gen, shared_domain, usched, nvar,
2155 &print_copy_statement, 1);
2156 isl_union_map_free(usched);
2158 isl_map_free(gen->copy_sched);
2161 /* Print code for reading into or writing from private memory
2162 * the given array reference group.
2164 * sched maps the original iteration domains to the shared memory tile loops.
2166 static void print_group_private_accesses(struct cuda_gen *gen,
2167 struct cuda_array_ref_group *group,
2168 const char *type, __isl_keep isl_set *shared_domain,
2169 unsigned first_shared, int shared_len, __isl_keep isl_union_map *sched)
2171 int read;
2172 isl_union_map *access;
2173 isl_union_set *uset;
2174 isl_set *access_set;
2176 if (!group->private_bound)
2177 return;
2179 read = !strcmp(type, "read");
2181 access = group_access_relation(group, read, !read);
2182 access = isl_union_map_apply_domain(access, isl_union_map_copy(sched));
2183 access = isl_union_map_intersect(access,
2184 isl_union_map_copy(gen->private_access));
2185 uset = isl_union_map_range(access);
2187 if (isl_union_set_is_empty(uset)) {
2188 isl_union_set_free(uset);
2189 return;
2192 access_set = isl_set_from_union_set(uset);
2193 access_set = isl_set_coalesce(access_set);
2194 access_set = isl_set_eliminate(access_set, isl_dim_param,
2195 first_shared + shared_len,
2196 gen->shared_len - shared_len);
2198 print_private_access(gen, shared_domain, access_set, type, group);
2201 /* Print code for reading into or writing from private memory at
2202 * the given level (-1 for innermost).
2204 * If we are not printing at the innermost level, then the dimensionality
2205 * of shared_domain may be smaller than gen->shared_len.
2206 * As the rest of the code assumes that the domain of access has
2207 * gen->shared_len dimensions, we therefore may need to embed this domain
2208 * in a higher dimensional space after intersection with shared_domain.
2210 * This code is very similar to print_shared_accesses.
2211 * The main difference is that we need to take into account gen->private_access.
2213 static void print_private_accesses(struct cuda_gen *gen,
2214 __isl_keep isl_set *shared_domain, __isl_keep isl_union_map *access,
2215 const char *type, int level)
2217 int i, j;
2218 isl_space *dim;
2219 isl_map *proj;
2220 int shared_len = isl_set_dim(shared_domain, isl_dim_set);
2221 unsigned first_shared;
2222 isl_union_map *sched;
2224 shared_domain = isl_set_copy(shared_domain);
2225 sched = isl_union_map_copy(gen->tiled_sched);
2226 dim = isl_union_map_get_space(sched);
2227 first_shared = isl_space_dim(dim, isl_dim_param);
2228 proj = projection(dim, gen->tiled_len, shared_len);
2229 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2230 sched = isl_union_map_intersect_range(sched,
2231 isl_union_set_from_set(isl_set_copy(shared_domain)));
2232 if (shared_len != gen->shared_len) {
2233 dim = isl_union_map_get_space(sched);
2234 proj = projection(dim, gen->shared_len, shared_len);
2235 proj = isl_map_reverse(proj);
2236 shared_domain = isl_set_apply(shared_domain,
2237 isl_map_copy(proj));
2238 sched = isl_union_map_apply_range(sched,
2239 isl_union_map_from_map(proj));
2242 for (i = 0; i < gen->n_array; ++i) {
2243 struct cuda_array_info *array = &gen->array[i];
2245 for (j = 0; j < array->n_group; ++j) {
2246 if (array->groups[j]->print_shared_level != level)
2247 continue;
2249 print_group_private_accesses(gen, array->groups[j],
2250 type, shared_domain,
2251 first_shared, shared_len, sched);
2255 isl_union_map_free(sched);
2256 isl_set_free(shared_domain);
2259 /* Set unroll[j] if the input dimension j is involved in
2260 * the index expression represented by bmap.
2262 static int check_unroll(__isl_take isl_basic_map *bmap, void *user)
2264 int i, j;
2265 int n_in = isl_basic_map_dim(bmap, isl_dim_in);
2266 int n_out = isl_basic_map_dim(bmap, isl_dim_out);
2267 int *unroll = user;
2269 for (i = 0; i < n_out; ++i) {
2270 isl_constraint *c;
2271 int ok;
2273 ok = isl_basic_map_has_defining_equality(bmap,
2274 isl_dim_out, i, &c);
2275 assert(ok);
2276 for (j = 0; j < n_in; ++j)
2277 if (isl_constraint_involves_dims(c, isl_dim_in, j, 1))
2278 unroll[j] = 1;
2279 isl_constraint_free(c);
2282 isl_basic_map_free(bmap);
2283 return 0;
2286 /* Given an array pos mapping input dimensions to the corresponding
2287 * output dimension, construct the corresponding map.
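 * For example (illustrative only), len = 3 and pos = {1, 2, 0} produce
 * the map { [i0, i1, i2] -> [o0, o1, o2] : o1 = i0 and o2 = i1 and o0 = i2 },
 * i.e., [i0, i1, i2] is mapped to [i2, i0, i1].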
2289 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
2290 int *pos, int len)
2292 int i;
2293 isl_constraint *c;
2294 isl_basic_map *bmap;
2295 isl_local_space *ls;
2297 dim = isl_space_add_dims(dim, isl_dim_in, len);
2298 dim = isl_space_add_dims(dim, isl_dim_out, len);
2299 bmap = isl_basic_map_universe(isl_space_copy(dim));
2300 ls = isl_local_space_from_space(dim);
2302 for (i = 0; i < len; ++i) {
2303 c = isl_equality_alloc(isl_local_space_copy(ls));
2304 isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
2305 isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i], 1);
2306 bmap = isl_basic_map_add_constraint(bmap, c);
2308 isl_local_space_free(ls);
2310 return isl_map_from_basic_map(bmap);
2313 /* Find all loops involved in any of the index expressions for any of
2314 * the private accesses, move them innermost and then mark them as
2315 * requiring unrolling by setting gen->first_unroll.
2316 * The loops involved should all be parallel because of the checks
2317 * we performed in check_private_group_access. Moving them innermost
2318 * is therefore a valid transformation.
2320 static __isl_give isl_union_map *interchange_for_unroll(struct cuda_gen *gen,
2321 __isl_take isl_union_map *sched)
2323 int i, j;
2324 int unroll[gen->thread_tiled_len];
2325 int perm[gen->thread_tiled_len];
2326 isl_space *dim;
2327 isl_map *permute;
2328 int len = gen->shared_len + gen->n_parallel + gen->n_block;
2330 gen->first_unroll = -1;
2332 for (i = 0; i < gen->thread_tiled_len; ++i)
2333 unroll[i] = 0;
2334 for (i = 0; i < gen->n_array; ++i) {
2335 struct cuda_array_info *array = &gen->array[i];
2337 for (j = 0; j < array->n_group; ++j) {
2338 isl_union_map *access;
2339 isl_map *acc;
2341 if (!array->groups[j]->private_bound)
2342 continue;
2344 access = group_access_relation(array->groups[j], 1, 1);
2345 access = isl_union_map_apply_domain(access,
2346 isl_union_map_copy(sched));
2348 acc = isl_map_from_union_map(access);
2349 isl_map_foreach_basic_map(acc, &check_unroll, unroll);
2351 isl_map_free(acc);
2355 for (i = 0; i < gen->shared_len; ++i)
2356 if (unroll[i])
2357 return sched;
2359 for (i = gen->shared_len; i < len; ++i)
2360 if (unroll[i])
2361 break;
2363 if (i >= len)
2364 return sched;
2366 for (i = len; i < gen->thread_tiled_len; ++i)
2367 if (unroll[i])
2368 return sched;
2370 j = 0;
2371 for (i = 0; i < gen->thread_tiled_len; ++i)
2372 if (!unroll[i])
2373 perm[i] = j++;
2374 gen->first_unroll = 1 + j;
2375 for (i = 0; i < len; ++i)
2376 if (unroll[i])
2377 perm[i] = j++;
2379 dim = isl_union_map_get_space(sched);
2380 permute = permutation(dim, perm, gen->thread_tiled_len);
2381 sched = isl_union_map_apply_range(sched,
2382 isl_union_map_from_map(permute));
2384 return sched;
2387 /* This function is called for each leaf in the clast of the kernel code.
2388 * We first specialize the schedule to the site of the leaf and
2389 * print code for reading into shared memory, performing the actual
2390 * computations and writing from shared memory, with the required
2391 * synchronizations.
2393 static void print_kernel_user(struct clast_printer_info *code,
2394 struct clast_user_stmt *u)
2396 struct cuda_gen *gen = code->user;
2397 isl_set *shared_domain;
2399 shared_domain = extract_entire_host_domain(&u->stmt);
2401 print_shared_accesses(gen, shared_domain, gen->read, "read", -1);
2403 print_private_accesses(gen, shared_domain, gen->read, "read", -1);
2405 print_shared_body(gen, shared_domain, gen->local_sched,
2406 gen->thread_tiled_len, &print_statement,
2407 gen->first_unroll);
2409 print_private_accesses(gen, shared_domain, gen->write, "write", -1);
2411 print_indent(gen->cuda.kernel_c, gen->kernel_code.indent);
2412 fprintf(gen->cuda.kernel_c, "__syncthreads();\n");
2414 print_shared_accesses(gen, shared_domain, gen->write, "write", -1);
2416 isl_set_free(shared_domain);
2419 /* Check if we need to perform any copying to shared memory at this level
2420 * and if so, print the copying instructions.
2421 * The copying is printed for any array group for which we are allowed
2422 * to print copying instructions at this level but have not done so already.
2424 static void copy_to_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2426 int i, j;
2427 int level;
2428 int print = 0;
2430 level = isl_set_dim(domain, isl_dim_set);
2432 for (i = 0; i < gen->n_array; ++i) {
2433 struct cuda_array_info *array = &gen->array[i];
2435 for (j = 0; j < array->n_group; ++j) {
2436 if (array->groups[j]->print_shared_level >= 0)
2437 continue;
2438 if (array->groups[j]->last_shared >= level)
2439 continue;
2440 array->groups[j]->print_shared_level = level;
2441 print = 1;
2445 if (print) {
2446 print_shared_accesses(gen, domain, gen->read, "read", level);
2447 print_private_accesses(gen, domain, gen->read, "read", level);
2452 /* This function is called for each for loop in the clast,
2453 * right after the opening brace has been printed.
2455 * Print copying instructions to shared or private memory if needed.
2457 static void print_kernel_for_head(struct clast_printer_info *code,
2458 struct clast_for *f)
2460 struct cuda_gen *gen = code->user;
2461 isl_set *domain;
2463 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2464 copy_to_local(gen, domain);
2466 isl_set_free(domain);
2469 /* Print instructions for copying from shared memory for each array
2470 * for which print_kernel_for_head has added copying instructions
2471 * to shared memory.
2473 static void copy_from_local(struct cuda_gen *gen, __isl_keep isl_set *domain)
2475 int i, j;
2476 int level;
2477 int print = 0;
2479 level = isl_set_dim(domain, isl_dim_set);
2481 for (i = 0; i < gen->n_array; ++i) {
2482 struct cuda_array_info *array = &gen->array[i];
2484 for (j = 0; j < array->n_group; ++j) {
2485 if (array->groups[j]->print_shared_level != level)
2486 continue;
2487 print = 1;
2488 break;
2490 if (print)
2491 break;
2494 if (print) {
2495 print_private_accesses(gen, domain, gen->write, "write", level);
2496 print_shared_accesses(gen, domain, gen->write, "write", level);
2500 /* This function is called for each for loop in the clast,
2501 * right before the closing brace is printed.
2503 * Print copying instructions from shared or private memory if needed.
2505 static void print_kernel_for_foot(struct clast_printer_info *code,
2506 struct clast_for *f)
2508 struct cuda_gen *gen = code->user;
2509 isl_set *domain;
2511 domain = isl_set_from_cloog_domain(cloog_domain_copy(f->domain));
2512 copy_from_local(gen, domain);
2514 isl_set_free(domain);
2517 /* Use CLooG to generate code for the outer gen->shared_len loops
2518 * of the local schedule "sched".
2519 * The pretty printing of this code is handled by print_clast,
2520 * which calls print_kernel_user for each iteration of the shared tile loops.
2522 static void print_cloog_kernel_body(struct cuda_gen *gen,
2523 __isl_keep isl_set *context, __isl_keep isl_union_map *sched)
2525 int i;
2526 CloogOptions *options;
2527 CloogDomain *cloog_context;
2528 CloogUnionDomain *ud;
2529 CloogInput *input;
2530 struct clast_stmt *stmt;
2531 char name[20];
2533 sched = isl_union_map_copy(sched);
2534 sched = isl_union_map_align_params(sched, isl_set_get_space(context));
2536 options = cloog_options_malloc(gen->state);
2537 options->language = CLOOG_LANGUAGE_C;
2538 options->strides = 1;
2539 options->sh = 1;
2540 options->stop = gen->shared_len;
2541 options->f = gen->tiled_len;
2542 options->l = gen->tiled_len;
2543 options->save_domains = 1;
2544 options->noscalars = 1;
2546 ud = cloog_union_domain_from_isl_union_map(sched);
2547 for (i = 0; i < gen->shared_len; ++i) {
2548 snprintf(name, sizeof(name), "g%d", i);
2549 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
2551 cloog_context = cloog_domain_from_isl_set(isl_set_copy(context));
2552 input = cloog_input_alloc(cloog_context, ud);
2554 stmt = cloog_clast_create_from_input(input, options);
2556 gen->kernel_code.indent = 4;
2557 gen->kernel_code.dst = gen->cuda.kernel_c;
2558 gen->kernel_code.print_user_stmt = NULL;
2559 gen->kernel_code.print_user_stmt_list = &print_kernel_user;
2560 gen->kernel_code.print_for_head = &print_kernel_for_head;
2561 gen->kernel_code.print_for_foot = &print_kernel_for_foot;
2562 gen->kernel_code.user = gen;
2563 copy_to_local(gen, context);
2564 print_clast(&gen->kernel_code, stmt);
2565 copy_from_local(gen, context);
2567 cloog_clast_free(stmt);
2568 cloog_options_free(options);
2571 static void print_kernel_iterators(struct cuda_gen *gen)
2573 int i;
2574 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
2575 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
2576 "threadIdx.z" };
2578 if (gen->n_grid > 0) {
2579 print_indent(gen->cuda.kernel_c, 4);
2580 fprintf(gen->cuda.kernel_c, "int ");
2581 for (i = 0; i < gen->n_grid; ++i) {
2582 if (i)
2583 fprintf(gen->cuda.kernel_c, ", ");
2584 fprintf(gen->cuda.kernel_c, "b%d = %s",
2585 i, block_dims[gen->n_grid - 1 - i]);
2587 fprintf(gen->cuda.kernel_c, ";\n");
2590 if (gen->n_block > 0) {
2591 print_indent(gen->cuda.kernel_c, 4);
2592 fprintf(gen->cuda.kernel_c, "int ");
2593 for (i = 0; i < gen->n_block; ++i) {
2594 if (i)
2595 fprintf(gen->cuda.kernel_c, ", ");
2596 fprintf(gen->cuda.kernel_c, "t%d = %s",
2597 i, thread_dims[gen->n_block - 1 - i]);
2599 fprintf(gen->cuda.kernel_c, ";\n");
2603 static void print_group_shared_array(struct cuda_gen *gen,
2604 struct cuda_array_ref_group *group)
2606 int j;
2607 struct cuda_array_bound *bounds;
2609 bounds = group->private_bound;
2610 if (!bounds)
2611 bounds = group->shared_bound;
2612 if (!bounds)
2613 return;
2615 print_indent(gen->cuda.kernel_c, 4);
2616 fprintf(gen->cuda.kernel_c, "%s%s ",
2617 group->private_bound ? "" : "__shared__ ", group->array->type);
2618 print_array_name(gen->cuda.kernel_c, group);
2619 for (j = 0; j < group->array->n_index; ++j) {
2620 fprintf(gen->cuda.kernel_c, "[");
2621 isl_int_print(gen->cuda.kernel_c, bounds[j].size, 0);
2622 fprintf(gen->cuda.kernel_c, "]");
2624 fprintf(gen->cuda.kernel_c, ";\n");
2627 static void print_shared_arrays(struct cuda_gen *gen)
2629 int i, j;
2631 for (i = 0; i < gen->n_array; ++i) {
2632 struct cuda_array_info *array = &gen->array[i];
2634 for (j = 0; j < array->n_group; ++j)
2635 print_group_shared_array(gen, array->groups[j]);
2639 static void print_kernel_body(struct cuda_gen *gen,
2640 __isl_keep isl_set *host_domain, __isl_keep isl_union_map *sched)
2642 isl_set *context;
2644 context = isl_set_copy(host_domain);
2645 context = parametrize(context, 0, gen->tile_first, "h");
2646 context = isl_set_project_out(context, isl_dim_set, 0, gen->tile_first);
2647 context = add_bounded_parameters(context,
2648 gen->n_grid, gen->grid_dim, "b");
2650 print_kernel_iterators(gen);
2651 print_shared_arrays(gen);
2653 fprintf(gen->cuda.kernel_c, "\n");
2655 print_cloog_kernel_body(gen, context, sched);
2657 isl_set_free(context);
2660 /* Given a constraint
2662 * a(p,i) + j = g f(e)
2664 * or -a(p,i) - j = g f(e) if sign < 0,
2665 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2666 * a(p,i) is assumed to be an expression in only the parameters.
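 * A hypothetical example: the constraint n + j = 4 alpha (with sign > 0)
 * results in bound->shift = n and bound->stride = 4, expressing that
 * j + n is always a multiple of 4.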
2668 static void extract_stride(__isl_keep isl_constraint *c,
2669 struct cuda_array_bound *bound, isl_int stride, int sign)
2671 int i;
2672 isl_int v;
2673 isl_space *dim;
2674 unsigned nparam;
2675 isl_aff *aff;
2677 isl_int_set(bound->stride, stride);
2679 dim = isl_constraint_get_space(c);
2680 dim = isl_space_params(dim);
2682 nparam = isl_space_dim(dim, isl_dim_param);
2684 isl_int_init(v);
2686 isl_constraint_get_constant(c, &v);
2687 if (sign < 0)
2688 isl_int_neg(v, v);
2689 aff = isl_aff_zero_on_domain(isl_local_space_from_space(dim));
2690 aff = isl_aff_set_constant(aff, v);
2692 for (i = 0; i < nparam; ++i) {
2693 isl_constraint_get_coefficient(c, isl_dim_param, i, &v);
2694 if (isl_int_is_zero(v))
2695 continue;
2696 if (sign < 0)
2697 isl_int_neg(v, v);
2698 aff = isl_aff_add_coefficient(aff, isl_dim_param, i, v);
2701 isl_int_clear(v);
2703 bound->shift = aff;
2706 /* Given an equality constraint of a map with a single output dimension j,
2707 * check if the constraint is of the form
2709 * a(p,i) + j = g f(e)
2711 * with a(p,i) an expression in the parameters and input dimensions
2712 * and f(e) an expression in the existentially quantified variables.
2713 * If so, and if g is larger than any such g from a previously considered
2714 * constraint, then call extract_stride to record the stride information
2715 * in bound.
2717 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
2719 int i;
2720 isl_int v, stride;
2721 unsigned n_div;
2722 struct cuda_array_bound *bound = user;
2724 isl_int_init(v);
2725 isl_int_init(stride);
2727 n_div = isl_constraint_dim(c, isl_dim_div);
2728 isl_constraint_get_coefficient(c, isl_dim_out, 0, &v);
2730 if (n_div && (isl_int_is_one(v) || isl_int_is_negone(v))) {
2731 int s = isl_int_sgn(v);
2732 isl_int_set_si(stride, 0);
2733 for (i = 0; i < n_div; ++i) {
2734 isl_constraint_get_coefficient(c, isl_dim_div, i, &v);
2735 isl_int_gcd(stride, stride, v);
2737 if (!isl_int_is_zero(stride) &&
2738 isl_int_gt(stride, bound->stride))
2739 extract_stride(c, bound, stride, s);
2742 isl_int_clear(stride);
2743 isl_int_clear(v);
2745 isl_constraint_free(c);
2746 return 0;
2749 /* Given constraints on an array index i, check if we can find
2750 * a shift a(p) and a stride g such that
2752 * a(p) + i = 0 mod g
2754 * If so, record the information in bound and apply the mapping
2755 * i -> (i + a(p))/g to the array index in bounds and return
2756 * the new constraints.
2757 * If not, simply return the original constraints.
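 * As an illustration only: if the accessed indices are of the form
 * i = n + 2 alpha, then a(p) = -n and g = 2, and applying the mapping
 * i -> (i - n)/2 turns the accessed indices into the consecutive
 * values 0, 1, 2, ...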
2759 static __isl_give isl_basic_map *check_stride(struct cuda_gen *gen,
2760 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2762 isl_basic_map *aff;
2763 isl_basic_map *shift;
2764 isl_aff *aff_shift;
2766 isl_int_set_si(bound->stride, -1);
2768 aff = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
2770 isl_basic_map_foreach_constraint(aff, &check_stride_constraint, bound);
2772 isl_basic_map_free(aff);
2774 if (isl_int_is_neg(bound->stride))
2775 return bounds;
2777 aff_shift = isl_aff_copy(bound->shift);
2778 aff_shift = isl_aff_add_dims(aff_shift, isl_dim_in, 1);
2779 aff_shift = isl_aff_add_coefficient_si(aff_shift, isl_dim_in, 0, 1);
2780 aff_shift = isl_aff_scale_down(aff_shift, bound->stride);
2781 shift = isl_basic_map_from_aff(aff_shift);
2783 bound->shift_map = isl_basic_map_copy(shift);
2784 bounds = isl_basic_map_apply_range(bounds, shift);
2786 return bounds;
2789 struct cuda_size_info {
2790 isl_basic_set *bset;
2791 struct cuda_array_bound *bound;
2792 int pos;
2795 /* Given a constraint from the basic set describing the bounds on
2796 * an array index, check if it is a lower bound, say m i >= b(x), and,
2797 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2798 * upper bound. If so, and if this bound is smaller than any bound
2799 * derived from earlier constraints, set the size to this bound on
2800 * the expression and the lower bound to ceil(b(x)/m).
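 * For example (purely illustrative), if the constraints include
 * i >= n and i <= n + 31, then the lower bound i >= n gives
 * ceil(b(x)/m) = n, the expression i - n + 1 has constant upper
 * bound 32, and we record a size of 32 with lower bound n.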
2802 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2804 struct cuda_size_info *size = user;
2805 unsigned nparam;
2806 unsigned n_div;
2807 isl_int v;
2809 nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2810 n_div = isl_constraint_dim(c, isl_dim_div);
2812 if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div)) {
2813 isl_constraint_free(c);
2814 return 0;
2817 isl_int_init(v);
2819 isl_constraint_get_coefficient(c, isl_dim_set, size->pos, &v);
2821 if (isl_int_is_pos(v)) {
2822 isl_aff *aff;
2823 isl_aff *lb;
2824 enum isl_lp_result res;
2826 aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2827 aff = isl_aff_ceil(aff);
2829 lb = isl_aff_copy(aff);
2831 aff = isl_aff_neg(aff);
2832 aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2834 res = isl_basic_set_max(size->bset, aff, &v);
2835 isl_aff_free(aff);
2837 if (res == isl_lp_ok) {
2838 isl_int_add_ui(v, v, 1);
2839 if (isl_int_is_neg(size->bound->size) ||
2840 isl_int_lt(v, size->bound->size)) {
2841 isl_int_set(size->bound->size, v);
2842 lb = isl_aff_drop_dims(lb, isl_dim_in,
2843 0, size->pos + 1);
2844 isl_aff_free(size->bound->lb);
2845 size->bound->lb = isl_aff_copy(lb);
2848 isl_aff_free(lb);
2851 isl_int_clear(v);
2852 isl_constraint_free(c);
2854 return 0;
2857 /* Given a basic map "bounds" that maps parameters and input dimensions
2858 * to a single output dimension, look for an expression in the parameters
2859 * and input dimensions such that the range of the output dimension shifted
2860 * by this expression is a constant.
2862 * In particular, we currently only consider lower bounds on the output
2863 * dimension as candidate expressions.
2865 static int compute_array_dim_size(struct cuda_gen *gen,
2866 struct cuda_array_bound *bound, __isl_take isl_basic_map *bounds)
2868 struct cuda_size_info size;
2870 bounds = isl_basic_map_detect_equalities(bounds);
2871 bounds = check_stride(gen, bound, bounds);
2873 isl_int_set_si(bound->size, -1);
2874 bound->lb = NULL;
2876 size.bound = bound;
2877 size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2878 size.bset = isl_basic_map_wrap(bounds);
2879 size.bset = isl_basic_set_flatten(size.bset);
2880 size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2881 isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2882 &size);
2883 isl_basic_set_free(size.bset);
2885 return isl_int_is_nonneg(bound->size) ? 0 : -1;
2888 /* Check if we can find a shared memory tile for the given array
2889 * based on the given accesses, and if so, put the results
2890 * in "bounds".
2892 * We project the accesses on each index in turn and look for a parametric
2893 * offset such that the size is constant.
2895 static int can_tile_for_shared_memory(struct cuda_gen *gen,
2896 struct cuda_array_info *array, __isl_keep isl_map *access,
2897 struct cuda_array_bound *bounds)
2899 int i;
2901 for (i = 0; i < array->n_index; ++i) {
2902 isl_map *access_i;
2903 isl_basic_map *hull;
2905 access_i = isl_map_copy(access);
2906 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2907 access_i = isl_map_project_out(access_i, isl_dim_out,
2908 1, array->n_index - (i + 1));
2909 access_i = isl_map_compute_divs(access_i);
2910 hull = isl_map_simple_hull(access_i);
2911 if (compute_array_dim_size(gen, &bounds[i], hull) < 0)
2912 return 0;
2915 return 1;
2918 /* Construct a map whose input are the shared tile loops and the loops that
2919 * will be wrapped around the threads, which relates these latter loops
2920 * to the thread indices and then projects them out.
2922 static __isl_give isl_map *compute_privatization(struct cuda_gen *gen)
2924 isl_map *priv;
2925 isl_map *tiling;
2926 isl_map *proj;
2927 isl_set *par;
2928 isl_space *dim;
2930 dim = isl_union_map_get_space(gen->shared_sched);
2932 if (gen->options->wrap)
2933 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2934 gen->shared_len, gen->n_block, gen->block_dim);
2935 else
2936 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2937 gen->shared_len, gen->n_block, gen->block_dim);
2939 priv = tiling;
2941 par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2942 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2943 gen->n_block, "t");
2945 priv = isl_map_align_params(priv, isl_set_get_space(par));
2946 priv = isl_map_intersect_range(priv, par);
2948 dim = isl_map_get_space(priv);
2949 dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2950 dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2951 proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2952 gen->shared_len);
2954 priv = isl_map_apply_range(priv, proj);
2956 return priv;
2959 /* Construct a map from domain_dim to domain_dim that increments
2960 * the dimension at position "pos" and leaves all other dimensions
2961 * constant.
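 * For example, for a two-dimensional domain and pos = 1, the result
 * is the map { [i, j] -> [i, j + 1] } (illustrative only).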
2963 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2965 int i;
2966 int len = isl_space_dim(domain_dim, isl_dim_set);
2967 isl_space *dim;
2968 isl_basic_map *next;
2969 isl_local_space *ls;
2971 dim = isl_space_map_from_set(domain_dim);
2972 next = isl_basic_map_universe(isl_space_copy(dim));
2973 ls = isl_local_space_from_space(dim);
2975 for (i = 0; i < len; ++i) {
2976 isl_constraint *c;
2978 c = isl_equality_alloc(isl_local_space_copy(ls));
2979 isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2980 isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2981 if (i == pos)
2982 isl_constraint_set_constant_si(c, 1);
2983 next = isl_basic_map_add_constraint(next, c);
2986 isl_local_space_free(ls);
2988 return isl_map_from_basic_map(next);
2991 /* Check if the given access is coalesced.
2992 * That is, check whether incrementing the dimension that will get
2993 * wrapped over the last thread index results in incrementing
2994 * the last array index.
2996 * This function is only called for access relations without reuse.
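 * A hypothetical illustration: if the iterator wrapped over threadIdx.x
 * appears as the innermost array index, as in A[i][j] with j wrapped
 * over the threads, then incrementing that iterator increments the last
 * array index, so the access is coalesced; an access A[j][i] under the
 * same mapping would not be.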
2998 static int access_is_coalesced(struct cuda_gen *gen,
2999 __isl_keep isl_union_map *access)
3001 isl_space *dim;
3002 isl_map *access_map;
3003 isl_map *next_thread_x;
3004 isl_map *next_element;
3005 isl_map *map;
3006 int coalesced;
3008 access = isl_union_map_copy(access);
3009 access = isl_union_map_apply_domain(access,
3010 isl_union_map_copy(gen->tiled_sched));
3011 access_map = isl_map_from_union_map(access);
3013 dim = isl_map_get_space(access_map);
3014 dim = isl_space_domain(dim);
3015 next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
3017 dim = isl_map_get_space(access_map);
3018 dim = isl_space_range(dim);
3019 next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
3021 map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
3022 map = isl_map_apply_range(map, access_map);
3024 coalesced = isl_map_is_subset(map, next_element);
3026 isl_map_free(next_element);
3027 isl_map_free(map);
3029 return coalesced;
3032 /* For the given array reference group, check whether the access is private
3033 * to the thread. That is, check that any given array element
3034 * is only accessed by a single thread.
3035 * We compute an access relation that maps the shared tile loop iterators
3036 * and the shared point loop iterators that will be wrapped over the
3037 * threads to the array elements.
3038 * We actually check that those iterators that will be wrapped
3039 * partition the array space. This check is stricter than necessary
3040 * since several iterations may be mapped onto the same thread
3041 * and then they could be allowed to access the same memory elements,
3042 * but our check does not allow this situation.
3044 * We also check that the index expression only depends on parallel
3045 * loops. That way, we can move those loops innermost and unroll them.
3046 * Again, we use a test that is stricter than necessary.
3047 * We actually check whether the index expression only depends
3048 * on the iterators that are wrapped over the threads.
3049 * These are necessarily parallel, but there may be more parallel loops.
3051 * Combining the injectivity of the first test with the single-valuedness
3052 * of the second test, we simply test for bijectivity.
3054 * If it turns out we can use registers, we compute the private memory
3055 * tile size using can_tile_for_shared_memory, after introducing a dependence
3056 * on the thread indices.
3058 * Before performing any of the above computations, we first check
3059 * if there is any reuse on the reference group. If not, we simply
3060 * return. If, moreover, the access is coalesced then we also remove
3061 * the shared memory tiling since we should just use global memory instead.
3063 static void check_private_group_access(struct cuda_gen *gen,
3064 struct cuda_array_ref_group *group)
3066 isl_map *acc;
3067 isl_union_map *access;
3068 int n_index = group->array->n_index;
3070 access = group_access_relation(group, 1, 1);
3071 if (isl_union_map_is_injective(access)) {
3072 if (group->shared_bound && access_is_coalesced(gen, access)) {
3073 free_bound_list(group->shared_bound, n_index);
3074 group->shared_bound = NULL;
3076 isl_union_map_free(access);
3077 return;
3079 access = isl_union_map_apply_domain(access,
3080 isl_union_map_copy(gen->shared_sched));
3082 acc = isl_map_from_union_map(access);
3084 if (!isl_map_is_bijective(acc)) {
3085 isl_map_free(acc);
3086 return;
3089 group->private_bound = create_bound_list(gen->ctx, n_index);
3090 acc = isl_map_align_params(acc, isl_map_get_space(gen->privatization));
3091 acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
3092 if (!can_tile_for_shared_memory(gen, group->array, acc,
3093 group->private_bound)) {
3094 free_bound_list(group->private_bound, n_index);
3095 group->private_bound = NULL;
3098 isl_map_free(acc);
3101 /* Look for the last shared tile loop that affects the offset of the
3102 * shared or private tile and store the result in group->last_shared.
3104 static void set_last_shared(struct cuda_gen *gen,
3105 struct cuda_array_ref_group *group)
3107 int i, j;
3108 struct cuda_array_bound *bounds;
3109 unsigned first_shared = gen->first_shared;
3110 int n_index = group->array->n_index;
3112 bounds = group->private_bound;
3113 if (!bounds)
3114 bounds = group->shared_bound;
3115 if (!bounds)
3116 return;
3118 for (j = gen->shared_len - 1; j >= 0; --j) {
3119 for (i = 0; i < n_index; ++i) {
3120 isl_aff *lb;
3121 isl_aff *shift;
3123 lb = bounds[i].lb;
3124 if (isl_aff_involves_dims(lb, isl_dim_param,
3125 first_shared + j, 1))
3126 break;
3128 shift = bounds[i].shift;
3129 if (!shift)
3130 continue;
3131 if (isl_aff_involves_dims(shift, isl_dim_param,
3132 first_shared + j, 1))
3133 break;
3135 if (i < n_index)
3136 break;
3138 group->last_shared = j;
3141 /* Compute the sizes of all private arrays for the current kernel,
3142 * as well as the offsets of the private pieces in the original arrays.
3143 * If we cannot or don't want to privatize a given array group,
3144 * we use the shared memory tile sizes computed in
3145 * compute_group_shared_bound instead.
3147 * If we have been able to find a private or shared tile,
3148 * we also look for the last shared tile loop that affects the offset
3149 * (and therefore the group tile) and store the result in group->last_shared.
3151 * A privatized copy of all access relations from reference groups that
3152 * are mapped to private memory is stored in gen->private_access.
3154 static void compute_private_size(struct cuda_gen *gen)
3156 int i, j;
3157 isl_union_map *private;
3159 if (!gen->options->use_private_memory)
3160 return;
3162 private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
3164 for (i = 0; i < gen->n_array; ++i) {
3165 struct cuda_array_info *array = &gen->array[i];
3167 for (j = 0; j < array->n_group; ++j) {
3168 check_private_group_access(gen, array->groups[j]);
3170 if (!array->groups[j]->private_bound)
3171 continue;
3173 private = isl_union_map_union(private,
3174 group_access_relation(array->groups[j], 1, 1));
3177 for (j = 0; j < array->n_group; ++j) {
3178 array->groups[j]->last_shared = gen->shared_len - 1;
3179 array->groups[j]->print_shared_level = -1;
3180 set_last_shared(gen, array->groups[j]);
3184 if (isl_union_map_is_empty(private))
3185 isl_union_map_free(private);
3186 else {
3187 isl_union_map *priv;
3189 private = isl_union_map_apply_domain(private,
3190 isl_union_map_copy(gen->shared_sched));
3191 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3192 private = isl_union_map_apply_domain(private, priv);
3193 gen->private_access = private;
3197 /* Compute the size of the tile specified by the list "bound" of n_index
3198 * cuda_array_bounds in number of elements and put the result in *size.
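 * For example, a two-dimensional tile with bounds of size 4 and 32
 * contains 4 * 32 = 128 elements.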
3200 static void tile_size(unsigned n_index, struct cuda_array_bound *bound,
3201 isl_int *size)
3203 int i;
3205 isl_int_set_si(*size, 1);
3207 for (i = 0; i < n_index; ++i)
3208 isl_int_mul(*size, *size, bound[i].size);
3211 /* If max_shared_memory is not set to infinity (-1), then make
3212 * sure that the total amount of shared memory required by the
3213 * array reference groups mapped to shared memory is no larger
3214 * than this maximum.
3216 * We apply a greedy approach and discard (keep in global memory)
3217 * those groups that would result in a total memory size that
3218 * is larger than the maximum.
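 * As an illustration only: with max_shared_memory = 49152 bytes,
 * a 32 x 32 tile of 4-byte elements needs 4096 bytes; groups are
 * accepted in order as long as they fit in the remaining budget,
 * while any group that no longer fits keeps its data in global memory.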
3220 static void check_shared_memory_bound(struct cuda_gen *gen)
3222 int i, j;
3223 isl_int left, size;
3225 if (gen->options->max_shared_memory < 0)
3226 return;
3228 isl_int_init(left);
3229 isl_int_init(size);
3230 isl_int_set_si(left, gen->options->max_shared_memory);
3232 for (i = 0; i < gen->n_array; ++i) {
3233 struct cuda_array_info *array = &gen->array[i];
3235 for (j = 0; j < array->n_group; ++j) {
3236 struct cuda_array_ref_group *group;
3238 group = array->groups[j];
3239 if (!group->shared_bound)
3240 continue;
3242 tile_size(array->n_index, group->shared_bound, &size);
3243 isl_int_mul_ui(size, size, array->size);
3245 if (isl_int_le(size, left)) {
3246 isl_int_sub(left, left, size);
3247 continue;
3250 free_bound_list(group->shared_bound, array->n_index);
3251 group->shared_bound = NULL;
3255 isl_int_clear(size);
3256 isl_int_clear(left);
3259 /* Fill up the groups array with singleton groups, i.e., one group
3260 * per reference, initializing the array, access, write and refs fields.
3261 * In particular the access field is initialized to the scheduled
3262 * access relation of the array reference.
3264 * Return the number of elements initialized, i.e., the number of
3265 * active references in the current kernel.
3267 static int populate_array_references(struct cuda_gen *gen,
3268 struct cuda_array_info *array, __isl_keep isl_union_map *sched,
3269 struct cuda_array_ref_group **groups)
3271 int i;
3272 int n;
3273 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3275 n = 0;
3276 for (i = 0; i < array->n_ref; ++i) {
3277 isl_union_map *umap;
3278 isl_map *map;
3279 struct cuda_array_ref_group *group;
3280 struct cuda_stmt_access *access = array->refs[i];
3282 map = isl_map_copy(access->access);
3283 umap = isl_union_map_from_map(map);
3284 umap = isl_union_map_apply_domain(umap,
3285 isl_union_map_copy(sched));
3287 if (isl_union_map_is_empty(umap)) {
3288 isl_union_map_free(umap);
3289 continue;
3292 map = isl_map_from_union_map(umap);
3293 map = isl_map_detect_equalities(map);
3295 group = isl_calloc_type(ctx, struct cuda_array_ref_group);
3296 assert(group);
3297 group->array = array;
3298 group->access = map;
3299 group->write = access->write;
3300 group->refs = &array->refs[i];
3302 groups[n++] = group;
3305 return n;
3308 static void free_array_ref_group(struct cuda_array_ref_group *group,
3309 int n_index)
3311 if (!group)
3312 return;
3313 free_bound_list(group->shared_bound, n_index);
3314 free_bound_list(group->private_bound, n_index);
3315 isl_map_free(group->access);
3316 free(group->refs);
3317 free(group);
3320 /* Given a set where the parameters gen->first_shared up to
3321 * gen->first_shared + gen->shared_len represent the tile loops,
3322 * eliminate the innermost of those that have a fixed value
3323 * until we reach one that does not (obviously) have a fixed value.
3325 static __isl_give isl_set *eliminate_fixed_inner_loops(struct cuda_gen *gen,
3326 __isl_take isl_set *access)
3328 int i;
3330 for (i = gen->shared_len - 1; i >= 0; --i) {
3331 int pos = gen->first_shared + i;
3332 if (!isl_set_plain_is_fixed(access, isl_dim_param, pos, NULL))
3333 break;
3334 access = isl_set_eliminate(access, isl_dim_param, pos, 1);
3336 return access;
3339 /* Check if the accessed set of group1 and group2 overlap within
3340 * the innermost loop. In particular, ignore any inner dimension
3341 * with a fixed value.
3342 * The copying to and from shared memory will be performed within
3343 * the innermost actual loop so we are only allowed to consider
3344 * the dimensions up to that innermost loop while checking whether
3345 * two access sets overlap.
3347 static int accesses_overlap(struct cuda_gen *gen,
3348 struct cuda_array_ref_group *group1,
3349 struct cuda_array_ref_group *group2)
3351 int empty;
3352 isl_set *access1, *access2;
3354 access1 = isl_map_range(isl_map_copy(group1->access));
3355 access1 = eliminate_fixed_inner_loops(gen, access1);
3356 access2 = isl_map_range(isl_map_copy(group2->access));
3357 access2 = eliminate_fixed_inner_loops(gen, access2);
3358 access1 = isl_set_intersect(access1, access2);
3359 empty = isl_set_is_empty(access1);
3360 isl_set_free(access1);
3362 return !empty;
3365 /* If two groups have overlapping access relations (within the innermost
3366 * loop) and if one of them involves a write, then merge the two groups
3367 * into one.
3369 * We keep track of the grouping in "leader". leader[j] points to
3370 * an earlier group array element that belongs to the same group,
3371 * or the array element j itself if this element is the first in the group.
3373 * Return the number of group leaders.
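 * For example (purely illustrative), if reference 2 is a write that
 * overlaps reference 0 while reference 1 overlaps neither, then the
 * leader array becomes {0, 1, 0}: references 0 and 2 form a single
 * group led by 0, reference 1 stays on its own and 2 group leaders
 * are returned.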
3375 static int group_overlapping_writes(struct cuda_gen *gen, int n,
3376 struct cuda_array_ref_group **groups, int *leader)
3378 int i, j;
3379 int n_group = n;
3381 for (i = 0; i < n; ++i) {
3382 int l = i;
3383 groups[l]->n_ref = 1;
3384 for (j = i - 1; j >= 0; --j) {
3385 if (leader[j] != j)
3386 continue;
3387 if (!groups[l]->write && !groups[j]->write)
3388 continue;
3390 if (!accesses_overlap(gen, groups[l], groups[j]))
3391 continue;
3393 groups[j]->access = isl_map_union(groups[j]->access,
3394 groups[l]->access);
3395 groups[j]->write = 1;
3396 groups[l]->access = NULL;
3397 groups[j]->n_ref += groups[l]->n_ref;
3398 l = leader[l] = j;
3399 n_group--;
3401 leader[i] = l;
3404 return n_group;
3407 /* Compute the size of the shared array corresponding to the given array
3408 * reference group, based on the accesses from the current kernel,
3409 * as well as the offset of the shared piece in the original array.
3411 static void compute_group_shared_bound(struct cuda_gen *gen,
3412 struct cuda_array_info *array, struct cuda_array_ref_group *group)
3414 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3416 if (!gen->options->use_shared_memory)
3417 return;
3418 if (cuda_array_is_read_only_scalar(array))
3419 return;
3421 group->shared_bound = create_bound_list(ctx, array->n_index);
3422 if (!can_tile_for_shared_memory(gen, array, group->access,
3423 group->shared_bound)) {
3424 free_bound_list(group->shared_bound, array->n_index);
3425 group->shared_bound = NULL;
3429 /* Is the size of the tile specified by "bound" smaller than the sum of
3430 * the sizes of the tiles specified by "bound1" and "bound2"?
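 * For example, given two 16 x 32 tiles (512 elements each, 1024 in
 * total), a combined 32 x 32 tile of exactly 1024 elements is not
 * smaller, while a combined 16 x 32 tile of 512 elements is.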
3432 static int smaller_tile(unsigned n_index, struct cuda_array_bound *bound,
3433 struct cuda_array_bound *bound1, struct cuda_array_bound *bound2)
3435 int smaller;
3436 isl_int size, size1, size2;
3438 isl_int_init(size);
3439 isl_int_init(size1);
3440 isl_int_init(size2);
3442 tile_size(n_index, bound, &size);
3443 tile_size(n_index, bound1, &size1);
3444 tile_size(n_index, bound2, &size2);
3446 isl_int_sub(size, size, size1);
3447 isl_int_sub(size, size, size2);
3448 smaller = isl_int_is_neg(size);
3450 isl_int_clear(size2);
3451 isl_int_clear(size1);
3452 isl_int_clear(size);
3454 return smaller;
3457 /* Given an initial grouping of array references and shared memory tiles
3458 * for each group that allows for a shared memory tile, merge two groups
3459 * if both have a shared memory tile, the merged group also has
3460 * a shared memory tile and the size of the tile for the merged group
3461 * is smaller than the sum of the tile sizes of the individual groups.
3463 * Return the number of group leaders after merging.
3465 static int group_common_shared_memory_tile(struct cuda_gen *gen,
3466 struct cuda_array_info *array, int n,
3467 struct cuda_array_ref_group **groups, int *leader, int n_group)
3469 int i, j;
3470 isl_ctx *ctx = isl_space_get_ctx(array->dim);
3472 for (i = 0; n_group > 1 && i < n; ++i) {
3473 int l = i;
3474 if (leader[i] != i)
3475 continue;
3476 if (!groups[i]->shared_bound)
3477 continue;
3478 for (j = i - 1; j >= 0; --j) {
3479 isl_map *map;
3480 int empty;
3481 struct cuda_array_bound *shared_bound;
3483 if (leader[j] != j)
3484 continue;
3485 if (!groups[j]->shared_bound)
3486 continue;
3488 map = isl_map_intersect(isl_map_copy(groups[l]->access),
3489 isl_map_copy(groups[j]->access));
3490 empty = isl_map_is_empty(map);
3491 isl_map_free(map);
3493 if (empty)
3494 continue;
3496 map = isl_map_union(isl_map_copy(groups[l]->access),
3497 isl_map_copy(groups[j]->access));
3498 shared_bound = create_bound_list(ctx, array->n_index);
3499 if (!can_tile_for_shared_memory(gen, array, map,
3500 shared_bound) ||
3501 !smaller_tile(array->n_index, shared_bound,
3502 groups[l]->shared_bound,
3503 groups[j]->shared_bound)) {
3504 isl_map_free(map);
3505 free_bound_list(shared_bound, array->n_index);
3506 continue;
3509 free_bound_list(groups[j]->shared_bound,
3510 array->n_index);
3511 groups[j]->shared_bound = shared_bound;
3512 isl_map_free(groups[j]->access);
3513 groups[j]->access = map;
3514 groups[j]->n_ref += groups[l]->n_ref;
3515 l = leader[l] = j;
3516 n_group--;
3520 return n_group;
3523 /* Extract an array of array reference groups from the array of references
3524 * and the grouping information in "leader".
3526 * Store the results in array->n_group and array->groups.
3528 static void extract_array_groups(isl_ctx *ctx, struct cuda_array_info *array,
3529 int n, struct cuda_array_ref_group **groups, int *leader, int n_group)
3531 int i, j;
3533 for (i = 2; i < n; ++i)
3534 leader[i] = leader[leader[i]];
3536 array->n_group = n_group;
3537 array->groups = isl_alloc_array(ctx, struct cuda_array_ref_group *,
3538 n_group);
3539 assert(array->groups);
3541 j = 0;
3542 for (i = 0; i < n; ++i) {
3543 int k, l;
3544 struct cuda_stmt_access **refs;
3546 if (leader[i] != i) {
3547 groups[i]->refs = NULL;
3548 free_array_ref_group(groups[i], array->n_index);
3549 continue;
3552 refs = isl_alloc_array(ctx, struct cuda_stmt_access *,
3553 groups[i]->n_ref);
3554 assert(refs);
3555 l = 0;
3556 for (k = i; k < n; ++k)
3557 if (leader[k] == i) {
3558 refs[l++] = *groups[k]->refs;
3559 (*groups[k]->refs)->group = j;
3562 groups[i]->refs = refs;
3563 groups[i]->nr = j;
3564 array->groups[j++] = groups[i];
3568 /* Group array references that should be considered together when
3569 * deciding whether to access them from private, shared or global memory.
3571 * In particular, if two array references overlap and if one of them
3572 * is a write, then the two references are grouped together.
3573 * Furthermore, if two groups admit a shared memory tile and if the
3574 * combination of the two also admits a shared memory tile, we merge
3575 * the two groups.
3577 * During the construction the group->refs field points to a single
3578 * array reference inside the array of array references, while
3579 * group->n_ref contains the number of elements in leader that
3580 * (directly or indirectly) point to this group, provided the group
3581 * is a leader.
3583 static void group_array_references(struct cuda_gen *gen,
3584 struct cuda_array_info *array, __isl_keep isl_union_map *sched)
3586 int i;
3587 int n, n_group;
3588 isl_ctx *ctx = isl_union_map_get_ctx(sched);
3589 struct cuda_array_ref_group **groups;
3590 int *leader;
3592 groups = isl_calloc_array(ctx, struct cuda_array_ref_group *,
3593 array->n_ref);
3594 assert(groups);
3596 n = populate_array_references(gen, array, sched, groups);
3598 leader = isl_alloc_array(ctx, int, n);
3599 assert(leader);
3601 n_group = group_overlapping_writes(gen, n, groups, leader);
3603 for (i = 0; i < n; ++i)
3604 if (leader[i] == i)
3605 compute_group_shared_bound(gen, array, groups[i]);
3607 n_group = group_common_shared_memory_tile(gen, array, n, groups,
3608 leader, n_group);
3610 extract_array_groups(ctx, array, n, groups, leader, n_group);
3612 free(leader);
3613 free(groups);
3616 /* Take tiled_sched, project it onto the shared tile loops and
3617 * the loops that will be wrapped over the threads,
3618 * parametrize the shared tile loops and store the result in gen->shared_sched.
3619 * The position of the first of these parameters is stored in gen->first_shared.
3620 * Also compute a projection that projects out the loops that will be
3621 * wrapped over the threads and store this projection in gen->shared_proj.
3623 static void compute_shared_sched(struct cuda_gen *gen)
3625 isl_space *dim;
3626 isl_map *proj;
3627 isl_set *par;
3628 isl_union_map *sched;
3630 sched = isl_union_map_copy(gen->tiled_sched);
3632 dim = isl_union_map_get_space(sched);
3633 gen->first_shared = isl_space_dim(dim, isl_dim_param);
3634 proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3635 sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3637 dim = isl_union_map_get_space(sched);
3638 par = parametrization(dim, gen->shared_len + gen->n_block,
3639 0, gen->shared_len, "g");
3640 sched = isl_union_map_intersect_range(sched,
3641 isl_union_set_from_set(par));
3643 dim = isl_union_map_get_space(sched);
3644 proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3646 gen->shared_sched = sched;
3647 gen->shared_proj = isl_union_map_from_map(proj);
3650 /* Group references of all arrays in the program.
3652 static void group_references(struct cuda_gen *gen)
3654 int i;
3655 isl_union_map *sched;
3657 sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3658 isl_union_map_copy(gen->shared_proj));
3660 for (i = 0; i < gen->n_array; ++i)
3661 group_array_references(gen, &gen->array[i], sched);
3663 isl_union_map_free(sched);
3666 /* Free all array information that is local to the current kernel.
3668 static void free_local_array_info(struct cuda_gen *gen)
3670 int i, j;
3672 for (i = 0; i < gen->n_array; ++i) {
3673 struct cuda_array_info *array = &gen->array[i];
3675 for (j = 0; j < array->n_group; ++j)
3676 free_array_ref_group(array->groups[j], array->n_index);
3677 free(array->groups);
3679 if (array->n_group == 0)
3680 continue;
3681 for (j = 0; j < gen->array[i].n_index; ++j) {
3682 isl_pw_aff_free(gen->array[i].local_bound[j]);
3683 gen->array[i].local_bound[j] = NULL;
3688 /* The sizes of the arrays on the host that have been computed by
3689 * extract_array_info may depend on the parameters. Use the extra
3690 * constraints on the parameters that are valid at "host_domain"
3691 * to simplify these expressions.
3693 static void localize_bounds(struct cuda_gen *gen,
3694 __isl_keep isl_set *host_domain)
3696 int i, j;
3697 isl_set *context;
3699 context = isl_set_copy(host_domain);
3700 context = isl_set_params(context);
3702 for (i = 0; i < gen->n_array; ++i) {
3703 struct cuda_array_info *array = &gen->array[i];
3705 if (array->n_group == 0)
3706 continue;
3708 for (j = 0; j < array->n_index; ++j) {
3709 isl_pw_aff *pwaff;
3711 pwaff = isl_pw_aff_copy(array->bound[j]);
3712 pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3713 array->local_bound[j] = pwaff;
3716 isl_set_free(context);
3719 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3720 * in the statement list u.
3721 * Because of the way the schedule is constructed, the other statements
3722 * in the list, if any, should have the same values for these properties.
3724 static void set_tile_len(struct cuda_gen *gen, struct clast_user_stmt *u)
3726 int nr;
3727 struct cuda_stmt *stmt;
3729 nr = atoi(u->statement->name + 2);
3730 stmt = &gen->stmts[nr];
3732 gen->tile_len = stmt->tile_len;
3733 gen->n_parallel = stmt->n_parallel;
3736 /* Extract a description of the grid, i.e., the possible values
3737 * of the block ids, from gen->tiled_sched.
3738 * The block ids are parameters in gen->tiled_sched.
3739 * We simply need to change them into set dimensions.
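 * For example (illustrative only), parameter constraints
 * 0 <= b0 <= 7 and 0 <= b1 <= 3 turn into the two-dimensional set
 * { [b0, b1] : 0 <= b0 <= 7 and 0 <= b1 <= 3 }, with the original
 * parameters projected out.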
3741 static __isl_give isl_set *extract_grid(struct cuda_gen *gen)
3743 int i;
3744 isl_set *grid;
3746 grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3747 grid = isl_set_from_params(grid);
3748 grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3749 for (i = 0; i < gen->n_grid; ++i) {
3750 int pos;
3751 char name[20];
3753 snprintf(name, sizeof(name), "b%d", i);
3754 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3755 assert(pos >= 0);
3756 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3757 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3760 return grid;
3763 /* Print the effective grid size as a list of the sizes in each
3764 * dimension, from innermost to outermost.
3766 * The grid size specified by the user or set by default
3767 * in read_grid_sizes() and applied in tile_schedule(),
3768 * may be too large for the given code in the sense that
3769 * it may contain blocks that don't need to execute anything.
3770 * We therefore don't print this grid size, but instead the
3771 * smallest grid size that ensures that all blocks that actually
3772 * execute code are included in the grid.
3774 * For each block dimension, we compute the maximal value of the block id
3775 * and add one.
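* For example (hypothetical grid), if extract_grid returns
* { [b0, b1] : 0 <= b0 <= 15 and 0 <= b1 <= 7 }, then "(8, 16)" is
* printed, i.e., max(b1) + 1 followed by max(b0) + 1.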
3777 static void print_grid_size(struct cuda_gen *gen, __isl_take isl_set *context)
3779 int i;
3780 isl_printer *prn;
3781 isl_set *grid;
3783 if (gen->n_grid == 0) {
3784 isl_set_free(context);
3785 return;
3788 grid = extract_grid(gen);
3790 prn = isl_printer_to_file(gen->ctx, gen->cuda.host_c);
3791 prn = isl_printer_set_output_format(prn, ISL_FORMAT_C);
3793 prn = isl_printer_print_str(prn, "(");
3794 for (i = gen->n_grid - 1; i >= 0; --i) {
3795 isl_space *space;
3796 isl_aff *one;
3797 isl_pw_aff *bound = isl_set_dim_max(isl_set_copy(grid), i);
3799 bound = isl_pw_aff_coalesce(bound);
3800 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3802 space = isl_pw_aff_get_domain_space(bound);
3803 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3804 one = isl_aff_add_constant_si(one, 1);
3805 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3806 prn = isl_printer_print_pw_aff(prn, bound);
3807 isl_pw_aff_free(bound);
3809 if (i > 0)
3810 prn = isl_printer_print_str(prn, ", ");
3812 prn = isl_printer_print_str(prn, ")");
3814 isl_printer_free(prn);
3815 isl_set_free(grid);
3816 isl_set_free(context);
3819 /* This function is called for each leaf in the clast of the host code.
3820 * We first specialize the schedule to the site of the leaf, compute
3821 * the size of shared memory and then print the body of host code
3822 * and the associated kernel (through a call to print_kernel_body).
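* The host code printed here looks roughly as follows (illustrative only;
* the actual grid/block sizes and argument list depend on the kernel):
*
*	dim3 k0_dimBlock(16, 32);
*	dim3 k0_dimGrid(8, 8);
*	kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_A, N);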
3824 static void print_host_user(struct clast_printer_info *code,
3825 struct clast_user_stmt *u)
3827 struct cuda_gen *gen = code->user;
3828 isl_space *dim;
3829 isl_set *par;
3830 isl_set *host_domain;
3831 isl_union_map *access;
3832 isl_union_map *local_sched;
3833 isl_union_set *arrays;
3835 set_tile_len(gen, u);
3836 read_sizes(gen);
3838 host_domain = extract_entire_host_domain(&u->stmt);
3840 local_sched = isl_union_map_intersect_range(
3841 isl_union_map_copy(gen->sched),
3842 isl_union_set_from_set(extend(isl_set_copy(host_domain),
3843 gen->untiled_len)));
3844 access = isl_union_map_union(isl_union_map_copy(gen->read),
3845 isl_union_map_copy(gen->write));
3846 access = isl_union_map_apply_domain(access,
3847 isl_union_map_copy(local_sched));
3848 arrays = isl_union_map_range(access);
3850 print_indent(code->dst, code->indent);
3851 fprintf(code->dst, "dim3 k%d_dimBlock", gen->kernel_id);
3852 print_reverse_list(code->dst, gen->n_block, gen->block_dim);
3853 fprintf(code->dst, ";\n");
3855 gen->tiled_sched = tile_schedule(gen, local_sched);
3856 gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3857 gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3859 print_indent(code->dst, code->indent);
3860 fprintf(code->dst, "dim3 k%d_dimGrid", gen->kernel_id);
3861 print_grid_size(gen, isl_set_params(isl_set_copy(host_domain)));
3862 fprintf(code->dst, ";\n");
3864 gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3866 dim = isl_union_map_get_space(gen->local_sched);
3867 par = parametrization(dim, gen->tiled_len, 0, gen->shared_len, "g");
3868 gen->local_sched = isl_union_map_intersect_range(gen->local_sched,
3869 isl_union_set_from_set(par));
3871 gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3872 gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3874 gen->private_access = NULL;
3875 compute_shared_sched(gen);
3876 gen->privatization = compute_privatization(gen);
3877 group_references(gen);
3878 compute_private_size(gen);
3879 check_shared_memory_bound(gen);
3880 localize_bounds(gen, host_domain);
3882 gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3884 print_kernel_launch(gen, arrays);
3886 fprintf(gen->cuda.kernel_c, "{\n");
3888 print_kernel_body(gen, host_domain, gen->tiled_sched);
3890 fprintf(gen->cuda.kernel_c, "}\n");
3892 free_local_array_info(gen);
3893 isl_map_free(gen->privatization);
3894 isl_union_map_free(gen->private_access);
3895 isl_union_map_free(gen->local_sched);
3896 isl_union_map_free(gen->tiled_sched);
3897 isl_union_map_free(gen->shared_sched);
3898 isl_union_map_free(gen->shared_proj);
3899 isl_union_set_free(arrays);
3900 isl_set_free(host_domain);
3902 free(gen->tile_size);
3903 gen->kernel_id++;
3906 /* Use CLooG to generate code for the outer gen->tile_first loops
3907 * of the global schedule in gen->sched.
3908 * The pretty printing of this code is handled by print_clast,
3909 * which calls print_host_user for each kernel invocation location.
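* The scattering dimensions of these outer loops are given the names
* "h0", "h1", ... (see the snprintf below), so the generated host code
* consists of loops over the h%d iterators enclosing, at each leaf, the
* kernel launch code printed by print_host_user.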
3911 static void print_cloog_host_code(struct cuda_gen *gen)
3913 int i;
3914 isl_set *context;
3915 isl_union_map *sched;
3916 CloogOptions *options;
3917 CloogDomain *cloog_context;
3918 CloogUnionDomain *ud;
3919 CloogInput *input;
3920 struct clast_stmt *stmt;
3921 char name[20];
3923 options = cloog_options_malloc(gen->state);
3924 options->language = CLOOG_LANGUAGE_C;
3925 options->otl = 0;
3926 options->strides = 1;
3927 options->stop = gen->tile_first;
3928 options->f = gen->untiled_len;
3929 options->l = gen->untiled_len;
3930 options->save_domains = 1;
3931 options->noscalars = 1;
3933 sched = isl_union_map_copy(gen->sched);
3934 ud = cloog_union_domain_from_isl_union_map(sched);
3935 for (i = 0; i < options->stop; ++i) {
3936 snprintf(name, sizeof(name), "h%d", i);
3937 ud = cloog_union_domain_set_name(ud, CLOOG_SCAT, i, name);
3939 context = isl_set_copy(gen->context);
3940 cloog_context = cloog_domain_from_isl_set(context);
3941 input = cloog_input_alloc(cloog_context, ud);
3943 stmt = cloog_clast_create_from_input(input, options);
3945 gen->code.indent = 0;
3946 gen->code.dst = gen->cuda.host_c;
3947 gen->code.print_user_stmt = NULL;
3948 gen->code.print_user_stmt_list = &print_host_user;
3949 gen->code.print_for_head = NULL;
3950 gen->code.print_for_foot = NULL;
3951 gen->code.user = gen;
3952 print_clast(&gen->code, stmt);
3954 cloog_clast_free(stmt);
3955 cloog_options_free(options);
3956 fprintf(gen->cuda.host_c, "\n");
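/* Print, to the generated host code, a macro that checks the return value
 * of a CUDA runtime call and a macro that checks for errors after a kernel
 * launch. In the generated code they are used along the lines of
 * (illustrative)
 *
 *	cudaCheckReturn(cudaMalloc((void **) &dev_A, size));
 *	cudaCheckKernel();
 */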
3959 void print_cuda_macros(struct cuda_gen *gen)
3961 const char *macros =
3962 "#define cudaCheckReturn(ret) assert((ret) == cudaSuccess)\n"
3963 "#define cudaCheckKernel()"
3964 " assert(cudaGetLastError() == cudaSuccess)\n\n";
3965 fputs(macros, gen->cuda.host_c);
3968 void print_host_code(struct cuda_gen *gen)
3970 fprintf(gen->cuda.host_c, "{\n");
3971 print_cloog_macros(gen->cuda.host_c);
3972 print_cloog_macros(gen->cuda.kernel_c);
3974 print_cuda_macros(gen);
3976 declare_device_arrays(gen);
3978 allocate_device_arrays(gen);
3979 copy_arrays_to_device(gen);
3981 gen->kernel_id = 0;
3982 print_cloog_host_code(gen);
3984 copy_arrays_from_device(gen);
3985 free_device_arrays(gen);
3987 fprintf(gen->cuda.host_c, "}\n");
3990 __isl_give isl_set *add_context_from_str(__isl_take isl_set *set,
3991 const char *str)
3993 isl_ctx *ctx;
3994 isl_set *context;
3996 if (!str)
3997 return set;
3999 ctx = isl_set_get_ctx(set);
4000 context = isl_set_read_from_str(ctx, str);
4001 context = isl_set_align_params(context, isl_set_get_space(set));
4002 set = isl_set_intersect(set, context);
4004 return set;
4007 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4009 if (!str)
4010 return NULL;
4011 return isl_union_map_read_from_str(ctx, str);
4014 /* Return the union of the iteration domains of all the gen->stmts[i].
4016 static __isl_give isl_union_set *extract_domain(struct cuda_gen *gen)
4018 int i;
4019 isl_union_set *domain;
4021 domain = isl_union_set_empty(isl_set_get_space(gen->context));
4022 for (i = 0; i < gen->n_stmts; ++i) {
4023 isl_set *domain_i;
4025 domain_i = isl_set_copy(gen->stmts[i].domain);
4026 domain = isl_union_set_union(domain,
4027 isl_union_set_from_set(domain_i));
4030 return domain;
4033 /* Information about the outermost tilable bands in the forest of bands.
4035 * tile_len and n_parallel are only set on band_info structures
4036 * that correspond to outermost bands. For other bands (in particular,
4037 * ancestors of the outermost bands), n_parallel is set to 0.
4039 * prefix is the (padded) schedule leading up to the outermost tilable bands.
4041 * tile_first is the number of schedule dimensions in prefix.
4043 * suffix is the schedule of the outermost tilable bands and their descendants.
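* For example (hypothetical schedule), if two sequential loops enclose a
* tilable band of three parallel loops, then tile_first = 2, tile_len = 3
* and n_parallel = 3; prefix maps each iteration to the two outer schedule
* dimensions and suffix to the schedule of the band and its descendants.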
4045 struct band_info {
4046 struct cuda_gen *gen;
4047 int tile_first;
4048 int tile_len;
4049 int n_parallel;
4050 isl_union_map *prefix;
4051 isl_union_map *suffix;
4054 /* Set tile_len and n_parallel of the statement to those of
4055 * its outermost band, as recorded in the band_info.
4057 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4059 struct band_info *info = user;
4060 int nr;
4061 struct cuda_stmt *stmt;
4063 nr = atoi(isl_map_get_tuple_name(map, isl_dim_in) + 2);
4064 stmt = &info->gen->stmts[nr];
4066 stmt->tile_len = info->tile_len;
4067 stmt->n_parallel = info->n_parallel;
4069 isl_map_free(map);
4071 return 0;
4074 static void list_select_outer_band(struct cuda_gen *gen,
4075 __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4077 /* Check if this band has any parallel loops. If so, take it as
4078 * the outermost tilable band. If not, continue looking for the
4079 * outermost tilable band in the children of the current band.
4081 static void band_select_outer_band(struct cuda_gen *gen,
4082 __isl_take isl_band *band, int pos, struct band_info *info)
4084 int n = isl_band_n_member(band);
4085 int n_parallel;
4087 for (n_parallel = 0; n_parallel < n; ++n_parallel)
4088 if (!isl_band_member_is_zero_distance(band, n_parallel))
4089 break;
4091 info->n_parallel = n_parallel;
4092 if (n_parallel) {
4093 info->gen = gen;
4094 info->tile_first = pos;
4095 info->tile_len = n;
4096 info->prefix = isl_band_get_prefix_schedule(band);
4097 info->suffix = isl_union_map_flat_range_product(
4098 isl_band_get_partial_schedule(band),
4099 isl_band_get_suffix_schedule(band));
4100 isl_union_map_foreach_map(info->prefix,
4101 &set_stmt_tile_len, info);
4102 } else if (isl_band_has_children(band)) {
4103 isl_band_list *children;
4104 children = isl_band_get_children(band);
4105 list_select_outer_band(gen, children, pos + n, info);
4106 } else {
4107 info->gen = gen;
4108 info->tile_first = pos + n;
4109 info->tile_len = 0;
4110 info->prefix = isl_union_map_flat_range_product(
4111 isl_band_get_prefix_schedule(band),
4112 isl_band_get_partial_schedule(band));
4113 info->suffix = isl_band_get_suffix_schedule(band);
4114 isl_union_map_foreach_map(info->prefix,
4115 &set_stmt_tile_len, info);
4118 isl_band_free(band);
4121 /* Comparison function that returns a non-zero value for band_infos
4122 * with different tile_len fields or different n_parallel fields.
4124 static int cmp_band(const void *p1, const void *p2)
4126 const struct band_info *info1 = p1;
4127 const struct band_info *info2 = p2;
4129 if (info1->tile_len != info2->tile_len)
4130 return info1->tile_len - info2->tile_len;
4132 return info1->n_parallel - info2->n_parallel;
4135 /* Extend "umap" with coordinates with fixed value "val"
4136 * to a total length of "dst_len", assuming the original dimension is "src_len".
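* For example (hypothetical relation), with src_len = 2, dst_len = 4 and
* val = 0, a range element [i, j] is extended to [i, j, 0, 0].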
4138 static __isl_give isl_union_map *extend_range(__isl_take isl_union_map *umap,
4139 int src_len, int dst_len, int val)
4141 isl_space *dim;
4142 isl_map *map;
4143 int i;
4145 dim = isl_union_map_get_space(umap);
4146 map = isl_map_reverse(projection(dim, dst_len, src_len));
4147 for (i = src_len; i < dst_len; ++i)
4148 map = isl_map_fix_si(map, isl_dim_out, i, val);
4150 umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4152 return umap;
4155 /* Group bands with the same values for tile_len and n_parallel.
4156 * The prefix schedule is then extended with a fixed coordinate that
4157 * is different for each such group.
4158 * Note that the actual values for this coordinate are not important.
4159 * The bands have already been effectively separated at a higher level
4160 * or they are independent and may be executed in parallel.
4161 * The list of band_info has been sorted before this function is called.
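* For example (hypothetical input), if the sorted list contains three
* entries with (tile_len, n_parallel) equal to (2, 1), (2, 1) and (3, 2),
* then the prefixes of the first two are extended with coordinate 0 and
* that of the third with coordinate 1.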
4163 static void separate_bands(struct band_info *info, int n)
4165 int i;
4166 int j = 0;
4168 for (i = 0; i < n; ++i) {
4169 int l = info[i].tile_first;
4171 if (i &&
4172 (info[i].tile_len != info[i - 1].tile_len ||
4173 info[i].n_parallel != info[i - 1].n_parallel))
4174 j++;
4176 info[i].prefix = extend_range(info[i].prefix,
4177 l, l + 1, j);
4178 info[i].tile_first = l + 1;
4182 /* Select the outermost bands in the elements of the list, align
4183 * their prefix schedules, separate bands with different values
4184 * for tile_len and/or n_parallel and then combine the resulting
4185 * prefix and suffix schedules into a single pair of prefix and
4186 * suffix schedules for the entire list.
4188 static void list_select_outer_band(struct cuda_gen *gen,
4189 __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4191 isl_band *band;
4192 int i;
4193 int n = isl_band_list_n_band(list);
4194 isl_ctx *ctx = isl_band_list_get_ctx(list);
4195 struct band_info *info;
4196 int max_tile_first;
4197 isl_union_map *prefix;
4198 isl_union_map *suffix;
4200 assert(n >= 1);
4201 info = isl_calloc_array(ctx, struct band_info, n);
4202 assert(info);
4204 max_tile_first = 0;
4205 for (i = 0; i < n; ++i) {
4206 band = isl_band_list_get_band(list, i);
4207 band_select_outer_band(gen, band, pos, &info[i]);
4208 if (info[i].tile_first > max_tile_first)
4209 max_tile_first = info[i].tile_first;
4212 for (i = 0; i < n; ++i) {
4213 if (info[i].tile_first == max_tile_first)
4214 continue;
4215 info[i].prefix = extend_range(info[i].prefix,
4216 info[i].tile_first, max_tile_first, 0);
4217 info[i].tile_first = max_tile_first;
4220 qsort(info, n, sizeof(struct band_info), &cmp_band);
4222 for (i = 0; i < n - 1; ++i)
4223 if (info[i].tile_len != info[i + 1].tile_len ||
4224 info[i].n_parallel != info[i + 1].n_parallel)
4225 break;
4227 if (i < n - 1)
4228 separate_bands(info, n);
4230 prefix = info[0].prefix;
4231 suffix = info[0].suffix;
4233 for (i = 1; i < n; ++i) {
4234 prefix = isl_union_map_union(prefix, info[i].prefix);
4235 suffix = isl_union_map_union(suffix, info[i].suffix);
4238 list_info->tile_first = info[0].tile_first;
4239 list_info->tile_len = -1;
4240 list_info->prefix = prefix;
4241 list_info->suffix = suffix;
4243 isl_band_list_free(list);
4244 free(info);
4247 /* Set max_out to the maximal number of output dimensions over
4248 * all maps.
4250 static int update_max_out(__isl_take isl_map *map, void *user)
4252 int *max_out = user;
4253 int n_out = isl_map_dim(map, isl_dim_out);
4255 if (n_out > *max_out)
4256 *max_out = n_out;
4258 isl_map_free(map);
4259 return 0;
4262 struct align_range_data {
4263 int max_out;
4264 isl_union_map *res;
4267 /* Extend the dimension of the range of the given map to data->max_out and
4268 * then add the result to data->res.
4270 static int map_align_range(__isl_take isl_map *map, void *user)
4272 struct align_range_data *data = user;
4273 int i;
4274 isl_space *dim;
4275 isl_map *proj;
4276 int n_out = isl_map_dim(map, isl_dim_out);
4278 dim = isl_union_map_get_space(data->res);
4279 proj = isl_map_reverse(projection(dim, data->max_out, n_out));
4280 for (i = n_out; i < data->max_out; ++i)
4281 proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
4283 map = isl_map_apply_range(map, proj);
4285 data->res = isl_union_map_add_map(data->res, map);
4287 return 0;
4290 /* Extend the ranges of the maps in the union map such that they all have
4291 * the same dimension.
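* For example (hypothetical input), { A[i] -> [i]; B[i, j] -> [i, j, j] }
* becomes { A[i] -> [i, 0, 0]; B[i, j] -> [i, j, j] }, padding the shorter
* ranges with zeros.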
4293 static __isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
4295 struct align_range_data data;
4297 data.max_out = 0;
4298 isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
4300 data.res = isl_union_map_empty(isl_union_map_get_space(umap));
4301 isl_union_map_foreach_map(umap, &map_align_range, &data);
4303 isl_union_map_free(umap);
4304 return data.res;
4307 /* Select the outermost tilable band that (by construction)
4308 * has at least one parallel loop.
4309 * The starting position of the aligned band is stored in
4310 * gen->tile_first.
4311 * The sizes and number of parallel loops may be different in different
4312 * parts of the band forest and are therefore stored in the cuda_stmts.
4314 * Return the complete schedule, with the tilable bands aligned
4315 * at gen->tile_first and padded with zero, if needed.
4317 static __isl_give isl_union_map *select_outer_tilable_band(struct cuda_gen *gen,
4318 __isl_keep isl_schedule *schedule)
4320 isl_band_list *list;
4321 struct band_info info;
4323 gen->n_parallel = 0;
4324 gen->tile_len = -1;
4326 list = isl_schedule_get_band_forest(schedule);
4328 list_select_outer_band(gen, list, 0, &info);
4330 gen->tile_first = info.tile_first;
4331 info.suffix = align_range(info.suffix);
4333 return isl_union_map_flat_range_product(info.prefix, info.suffix);
4336 /* Set gen->untiled_len to the number of scheduling dimensions
4337 * for the schedule of the first domain.
4338 * We assume here that this number is the same for all domains.
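* The callback returns -1 so that isl_union_map_foreach_map stops after
* the first map; the resulting negative return value of the foreach call
* is deliberately ignored by the caller.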
4340 static int set_untiled_len(__isl_take isl_map *map, void *user)
4342 unsigned *untiled_len = user;
4344 *untiled_len = isl_map_dim(map, isl_dim_out);
4346 isl_map_free(map);
4347 return -1;
4350 /* Compute an appropriate schedule based on the accesses in
4351 * gen->read and gen->write.
4353 * We first compute dependences and then use those to compute
4354 * a schedule that has a parallel loop in each tilable band.
4355 * Finally, we select the outermost tilable band.
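* In particular, the first call to isl_union_map_compute_flow below
* computes the read-after-write (flow) dependences and the reads that have
* no corresponding write inside the scop; the array elements accessed by
* the latter end up in gen->copy_in. The second call computes the
* write-after-write and write-after-read dependences.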
4357 static void compute_schedule(struct cuda_gen *gen,
4358 __isl_take isl_union_map *sched)
4360 isl_union_set *domain;
4361 isl_union_map *empty;
4362 isl_union_map *dep_raw, *dep2, *dep3, *dep;
4363 isl_union_map *uninitialized;
4364 isl_schedule *schedule;
4366 empty = isl_union_map_empty(isl_union_map_get_space(sched));
4368 isl_union_map_compute_flow(isl_union_map_copy(gen->read),
4369 isl_union_map_copy(gen->write), empty,
4370 isl_union_map_copy(sched),
4371 &dep_raw, NULL, &uninitialized, NULL);
4372 isl_union_map_compute_flow(isl_union_map_copy(gen->write),
4373 isl_union_map_copy(gen->write),
4374 isl_union_map_copy(gen->read),
4375 isl_union_map_copy(sched),
4376 &dep2, &dep3, NULL, NULL);
4377 isl_union_map_free(sched);
4379 gen->copy_in = isl_union_map_range(uninitialized);
4381 dep = isl_union_map_union(dep2, dep3);
4382 dep = isl_union_map_union(dep, dep_raw);
4383 dep = isl_union_map_coalesce(dep);
4385 domain = extract_domain(gen);
4386 schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4387 isl_union_map_copy(dep), dep);
4389 sched = select_outer_tilable_band(gen, schedule);
4391 isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4392 sched = isl_union_map_intersect_domain(sched, domain);
4393 gen->sched = sched;
4395 isl_schedule_free(schedule);
4398 static struct cuda_stmt_access **expr_extract_access(struct pet_expr *expr,
4399 struct cuda_stmt_access **next_access)
4401 struct cuda_stmt_access *access;
4402 isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4404 access = isl_alloc_type(ctx, struct cuda_stmt_access);
4405 assert(access);
4406 access->next = NULL;
4407 access->read = expr->acc.read;
4408 access->write = expr->acc.write;
4409 access->access = isl_map_copy(expr->acc.access);
4411 *next_access = access;
4412 next_access = &(*next_access)->next;
4413 return next_access;
4416 static struct cuda_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4417 struct cuda_stmt_access **next_access)
4419 int i;
4421 for (i = 0; i < expr->n_arg; ++i)
4422 next_access = expr_extract_accesses(expr->args[i],
4423 next_access);
4425 if (expr->type == pet_expr_access)
4426 next_access = expr_extract_access(expr, next_access);
4428 return next_access;
4431 static void pet_stmt_extract_accesses(struct cuda_stmt *stmt)
4433 struct cuda_stmt_access **next_access = &stmt->accesses;
4435 stmt->accesses = NULL;
4436 expr_extract_accesses(stmt->body, next_access);
4439 /* Return an array of cuda_stmt representing the statements in "scop".
4441 static struct cuda_stmt *extract_stmts(isl_ctx *ctx, struct pet_scop *scop,
4442 __isl_keep isl_set *context)
4444 int i;
4445 struct cuda_stmt *stmts;
4447 stmts = isl_calloc_array(ctx, struct cuda_stmt, scop->n_stmt);
4448 assert(stmts);
4450 for (i = 0; i < scop->n_stmt; ++i) {
4451 struct cuda_stmt *s = &stmts[i];
4453 s->domain = isl_set_copy(scop->stmts[i]->domain);
4454 s->domain = isl_set_intersect_params(s->domain,
4455 isl_set_copy(context));
4456 s->body = scop->stmts[i]->body;
4457 pet_stmt_extract_accesses(s);
4460 return stmts;
4463 /* Replace the scop in the "input" file by equivalent code
4464 * that uses the GPU. "scop" is assumed to correspond to this scop.
4466 * We first compute a schedule that respects the dependences
4467 * of the original program and select the outermost band
4468 * of tilable dimensions that has at least one parallel loop.
4469 * We then have three blocks of dimensions
4471 * H B G
4473 * The tilable band "B" is first tiled according to "tile" sizes, resulting
4474 * in
4476 * H T P G
4478 * For each iteration of the T loop and for each array, we compute
4479 * the array elements accessed by that iteration, construct a rectangular
4480 * box around it and shift it to the origin. The result is used
4481 * as shared memory for the array.
4483 * We then split off at most 2 parallel loops from the T loops and
4484 * at most 3 parallel loops from the P loops
4486 * H T1 T2 P1 P2 G
4488 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4489 * according to "grid"/"block" sizes.
4491 * H T1T T1P T2 P1T P1P P2 G
4493 * Finally, the T1P and P1P iterators are equated to the block and
4494 * thread dimensions respectively and so are effectively removed.
4495 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4496 * are run on the GPU.
4498 * Code is generated in three stages. We first generate code for the
4499 * host (the H loops), with iterators h%d. Then, for each leaf node
4500 * of the resulting AST, we generate code for the shared loops (up to
4501 * and including T2), with iterators g%d and after equating the H loops
4502 * to h%d parameters and the T1P loops to the block dimensions.
4503 * Finally, we generate code for the remaining loops in a similar fashion.
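* For example (hypothetical input), for a tilable band of three parallel
* loops, T consists of three tile loops and P of three point loops;
* T1 is then formed by the first two tile loops (wrapped over the blocks),
* T2 by the remaining tile loop, P1 by all three point loops (wrapped over
* the threads) and P2 is empty.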
4505 int generate_cuda(isl_ctx *ctx, struct pet_scop *scop,
4506 struct ppcg_options *options, const char *input)
4508 isl_union_map *sched;
4509 struct cuda_gen gen;
4511 if (!scop)
4512 return -1;
4514 scop = pet_scop_align_params(scop);
4516 gen.ctx = ctx;
4517 gen.context = isl_set_copy(scop->context);
4518 gen.context = add_context_from_str(gen.context, options->ctx);
4519 gen.sizes = extract_sizes_from_str(ctx, options->sizes);
4520 gen.n_stmts = scop->n_stmt;
4521 gen.stmts = extract_stmts(ctx, scop, gen.context);
4522 gen.read = pet_scop_collect_reads(scop);
4523 gen.write = pet_scop_collect_writes(scop);
4524 gen.options = options;
4525 gen.state = cloog_isl_state_malloc(gen.ctx);
4526 gen.scop = scop;
4528 cuda_open_files(&gen.cuda, input);
4530 collect_array_info(&gen);
4532 sched = pet_scop_collect_schedule(scop);
4534 compute_schedule(&gen, sched);
4536 print_host_code(&gen);
4538 cloog_state_free(gen.state);
4539 clear_cuda_gen(&gen);
4541 cuda_close_files(&gen.cuda);
4543 return 0;