2 * Copyright 2010-2011 INRIA Saclay
4 * Use of this software is governed by the GNU LGPLv2.1 license
6 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
7 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
14 #include <isl/polynomial.h>
15 #include <isl/union_set.h>
20 #include <isl/schedule.h>
21 #include <isl/options.h>
22 #include <cloog/isl/cloog.h>
25 #include "cuda_common.h"
28 #include "scoplib_isl.h"
29 #include "ppcg_options.h"
31 /* The fields stride, shift and shift_map only contain valid information
33 * If so, they express that current index is such that if you add shift,
34 * then the result is always a multiple of stride.
35 * shift_map contains the mapping
37 * i -> (i + shift)/stride
39 struct cuda_array_bound
{
44 isl_qpolynomial
*shift
;
45 isl_basic_map
*shift_map
;
48 struct cuda_array_info
;
50 /* A group of array references in a kernel that should be handled together.
51 * If private_bound is not NULL, then it is mapped to registers.
52 * Otherwise, if shared_bound is not NULL, it is mapped to shared memory.
53 * Otherwise, it is accesses from global memory.
55 struct cuda_array_ref_group
{
56 /* The references in this group access this array. */
57 struct cuda_array_info
*array
;
58 /* Position of this group in the list of reference groups of array. */
61 /* The following fields are use during the construction of the groups.
62 * access is the combined access relation relative to the shared
64 * write is set if any access in the group is a write.
69 /* For each index, size and offset of piece in shared memory. */
70 struct cuda_array_bound
*shared_bound
;
72 /* For each index, size and offset of piece in private memory. */
73 struct cuda_array_bound
*private_bound
;
75 /* References in this group; point to elements of a linked list. */
77 struct cuda_stmt_access
**refs
;
80 struct cuda_array_info
{
82 /* Name of the array. */
84 /* Number of indices. */
86 /* For each index, a bound on the array in that direction. */
88 /* For each index, bound[i] specialized to the current kernel. */
89 isl_pw_aff
**local_bound
;
91 /* All references to this array; point to elements of a linked list. */
93 struct cuda_stmt_access
**refs
;
95 /* The reference groups associated to this array. */
97 struct cuda_array_ref_group
**groups
;
99 /* Last shared memory tile dimension that affects tile of this array. */
101 /* Dimension at which copying to/from shared memory is printed.
102 * if >= 0, then the value is >= last_shared
103 * if -1, then the copying is done at the leaf level.
105 int print_shared_level
;
108 /* Print the name of the local copy of a given group of array references.
110 static void print_array_name(FILE *out
, struct cuda_array_ref_group
*group
)
114 if (group
->private_bound
)
115 fprintf(out
, "private_");
116 else if (group
->shared_bound
)
117 fprintf(out
, "shared_");
120 fprintf(out
, "%s", group
->array
->name
);
121 if (!global
&& group
->array
->n_group
> 1)
122 fprintf(out
, "_%d", group
->nr
);
125 /* Collect all references to the given array and store pointers to them
128 static void collect_references(struct cuda_gen
*gen
,
129 struct cuda_array_info
*array
)
135 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
136 struct cuda_stmt
*stmt
= &gen
->stmts
[i
];
137 struct cuda_stmt_access
*access
;
139 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
141 name
= isl_map_get_tuple_name(access
->access
,
143 if (name
&& !strcmp(array
->name
, name
))
149 array
->refs
= isl_alloc_array(gen
->ctx
, struct cuda_stmt_access
*, n
);
153 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
154 struct cuda_stmt
*stmt
= &gen
->stmts
[i
];
155 struct cuda_stmt_access
*access
;
157 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
159 name
= isl_map_get_tuple_name(access
->access
,
161 if (!name
|| strcmp(array
->name
, name
))
164 array
->refs
[n
++] = access
;
169 static struct cuda_array_bound
*create_bound_list(isl_ctx
*ctx
, int n_index
)
172 struct cuda_array_bound
*bound
;
174 bound
= isl_alloc_array(ctx
, struct cuda_array_bound
, n_index
);
177 for (i
= 0; i
< n_index
; ++i
) {
178 isl_int_init(bound
[i
].size
);
180 isl_int_init(bound
[i
].stride
);
181 bound
[i
].shift
= NULL
;
182 bound
[i
].shift_map
= NULL
;
188 static void free_bound_list(struct cuda_array_bound
*bound
, int n_index
)
195 for (j
= 0; j
< n_index
; ++j
) {
196 isl_int_clear(bound
[j
].size
);
197 isl_int_clear(bound
[j
].stride
);
198 isl_aff_free(bound
[j
].lb
);
199 isl_qpolynomial_free(bound
[j
].shift
);
200 isl_basic_map_free(bound
[j
].shift_map
);
205 /* Compute bounds on the host arrays based on the accessed elements
206 * and collect all references to the array.
208 static int extract_array_info(__isl_take isl_set
*array
, void *user
)
211 struct cuda_gen
*gen
= (struct cuda_gen
*)user
;
215 isl_pw_aff
**local_bounds
;
217 n_index
= isl_set_dim(array
, isl_dim_set
);
218 name
= isl_set_get_tuple_name(array
);
219 bounds
= isl_alloc_array(isl_set_get_ctx(array
),
220 isl_pw_aff
*, n_index
);
222 local_bounds
= isl_calloc_array(isl_set_get_ctx(array
),
223 isl_pw_aff
*, n_index
);
224 assert(local_bounds
);
225 gen
->array
[gen
->n_array
].dim
= isl_set_get_dim(array
);
226 gen
->array
[gen
->n_array
].name
= strdup(name
);
227 gen
->array
[gen
->n_array
].n_index
= n_index
;
228 gen
->array
[gen
->n_array
].bound
= bounds
;
229 gen
->array
[gen
->n_array
].local_bound
= local_bounds
;
231 for (i
= 0; i
< n_index
; ++i
) {
237 bound
= isl_set_dim_max(isl_set_copy(array
), i
);
238 dom
= isl_pw_aff_domain(isl_pw_aff_copy(bound
));
239 ls
= isl_local_space_from_dim(isl_set_get_dim(dom
));
240 one
= isl_aff_zero(ls
);
241 one
= isl_aff_add_constant_si(one
, 1);
242 bound
= isl_pw_aff_add(bound
, isl_pw_aff_alloc(dom
, one
));
243 bound
= isl_pw_aff_gist(bound
, isl_set_copy(gen
->context
));
248 collect_references(gen
, &gen
->array
[gen
->n_array
]);
256 void collect_array_info(struct cuda_gen
*gen
)
258 isl_union_set
*arrays
;
260 arrays
= isl_union_map_range(isl_union_map_copy(gen
->read
));
261 arrays
= isl_union_set_union(arrays
,
262 isl_union_map_range(isl_union_map_copy(gen
->write
)));
263 arrays
= isl_union_set_coalesce(arrays
);
265 gen
->n_array
= isl_union_set_n_set(arrays
);
266 gen
->array
= isl_alloc_array(gen
->ctx
,
267 struct cuda_array_info
, gen
->n_array
);
270 isl_union_set_foreach_set(arrays
, &extract_array_info
, gen
);
271 isl_union_set_free(arrays
);
274 static void free_array_info(struct cuda_gen
*gen
)
278 for (i
= 0; i
< gen
->n_array
; ++i
) {
279 int n_index
= gen
->array
[i
].n_index
;
280 free(gen
->array
[i
].name
);
281 for (j
= 0; j
< n_index
; ++j
) {
282 isl_pw_aff_free(gen
->array
[i
].bound
[j
]);
283 isl_pw_aff_free(gen
->array
[i
].local_bound
[j
]);
285 isl_dim_free(gen
->array
[i
].dim
);
286 free(gen
->array
[i
].bound
);
287 free(gen
->array
[i
].local_bound
);
288 free(gen
->array
[i
].refs
);
293 static void declare_device_arrays(struct cuda_gen
*gen
)
297 for (i
= 0; i
< gen
->n_array
; ++i
)
298 fprintf(gen
->cuda
.host_c
, "%s *dev_%s;\n",
299 gen
->options
->type
, gen
->array
[i
].name
);
302 static void print_array_size(struct cuda_gen
*gen
, FILE *out
,
303 struct cuda_array_info
*array
)
308 prn
= isl_printer_to_file(gen
->ctx
, out
);
309 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
310 for (i
= 0; i
< array
->n_index
; ++i
) {
311 prn
= isl_printer_print_str(prn
, "(");
312 prn
= isl_printer_print_pw_aff(prn
, array
->bound
[i
]);
313 prn
= isl_printer_print_str(prn
, ") * ");
315 prn
= isl_printer_print_str(prn
, "sizeof(");
316 prn
= isl_printer_print_str(prn
, gen
->options
->type
);
317 prn
= isl_printer_print_str(prn
, ")");
318 isl_printer_free(prn
);
321 static void allocate_device_arrays(struct cuda_gen
*gen
)
325 for (i
= 0; i
< gen
->n_array
; ++i
) {
326 fprintf(gen
->cuda
.host_c
, "cudaMalloc(&dev_%s, ",
328 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
329 fprintf(gen
->cuda
.host_c
, ");\n");
333 static void free_device_arrays(struct cuda_gen
*gen
)
337 for (i
= 0; i
< gen
->n_array
; ++i
)
338 fprintf(gen
->cuda
.host_c
, "cudaFree(dev_%s);\n",
342 static void copy_arrays_to_device(struct cuda_gen
*gen
)
346 for (i
= 0; i
< gen
->n_array
; ++i
) {
351 dim
= isl_dim_copy(gen
->array
[i
].dim
);
352 read_i
= isl_union_set_extract_set(gen
->copy_in
, dim
);
353 empty
= isl_set_fast_is_empty(read_i
);
354 isl_set_free(read_i
);
358 fprintf(gen
->cuda
.host_c
, "assert(sizeof(%s) == ",
360 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
361 fprintf(gen
->cuda
.host_c
, ");\n");
362 fprintf(gen
->cuda
.host_c
, "cudaMemcpy(dev_%s, %s, ",
363 gen
->array
[i
].name
, gen
->array
[i
].name
);
364 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
365 fprintf(gen
->cuda
.host_c
, ", cudaMemcpyHostToDevice);\n");
369 static void copy_arrays_from_device(struct cuda_gen
*gen
)
372 isl_union_set
*write
;
373 write
= isl_union_map_range(isl_union_map_copy(gen
->write
));
375 for (i
= 0; i
< gen
->n_array
; ++i
) {
380 dim
= isl_dim_copy(gen
->array
[i
].dim
);
381 write_i
= isl_union_set_extract_set(write
, dim
);
382 empty
= isl_set_fast_is_empty(write_i
);
383 isl_set_free(write_i
);
387 fprintf(gen
->cuda
.host_c
, "cudaMemcpy(%s, dev_%s, ",
388 gen
->array
[i
].name
, gen
->array
[i
].name
);
389 print_array_size(gen
, gen
->cuda
.host_c
, &gen
->array
[i
]);
390 fprintf(gen
->cuda
.host_c
, ", cudaMemcpyDeviceToHost);\n");
393 isl_union_set_free(write
);
396 static void read_sizes_from_file(struct cuda_gen
*gen
, const char *filename
,
402 file
= fopen(filename
, "r");
406 for (i
= 0; i
< len
; ++i
)
407 if (fscanf(file
, "%d", &sizes
[i
]) < 1)
413 static void reverse_list(int *list
, int len
)
418 for (i
= 0; 2 * i
< len
; ++i
) {
420 list
[i
] = list
[len
- 1 - i
];
421 list
[len
- 1 - i
] = t
;
425 /* Read user specified sizes from "tile.sizes", "block.sizes" and "grid.sizes"
426 * after filling in some potentially useful defaults.
428 static void read_sizes(struct cuda_gen
*gen
)
432 gen
->tile_size
= isl_alloc_array(gen
->ctx
, int, gen
->tile_len
);
433 assert(gen
->tile_size
);
434 for (n
= 0; n
< gen
->tile_len
; ++n
)
435 gen
->tile_size
[n
] = gen
->options
->tile_size
;
436 read_sizes_from_file(gen
, "tile.sizes", gen
->tile_size
, gen
->tile_len
);
439 gen
->n_block
= (n
<= 3) ? n
: 3;
440 switch (gen
->n_block
) {
442 gen
->block_dim
[0] = 512;
445 gen
->block_dim
[0] = 32;
446 gen
->block_dim
[1] = 16;
449 gen
->block_dim
[0] = 32;
450 gen
->block_dim
[1] = 4;
451 gen
->block_dim
[2] = 4;
454 read_sizes_from_file(gen
, "block.sizes", gen
->block_dim
, gen
->n_block
);
455 reverse_list(gen
->block_dim
, gen
->n_block
);
457 gen
->n_grid
= (n
<= 2) ? n
: 2;
458 switch (gen
->n_grid
) {
460 gen
->grid_dim
[0] = 65536;
463 gen
->grid_dim
[0] = 256;
464 gen
->grid_dim
[1] = 256;
467 read_sizes_from_file(gen
, "grid.sizes", gen
->grid_dim
, gen
->n_grid
);
468 reverse_list(gen
->grid_dim
, gen
->n_grid
);
471 static void free_stmts(struct cuda_stmt
*stmts
, int n
)
475 for (i
= 0; i
< n
; ++i
) {
476 struct cuda_stmt_access
*access
, *next
;
478 for (access
= stmts
[i
].accesses
; access
; access
= next
) {
480 isl_map_free(access
->access
);
484 isl_set_free(stmts
[i
].domain
);
490 void clear_cuda_gen(struct cuda_gen
*gen
)
492 free_stmts(gen
->stmts
, gen
->n_stmts
);
493 free_array_info(gen
);
494 isl_set_free(gen
->context
);
495 isl_union_set_free(gen
->copy_in
);
496 isl_union_map_free(gen
->sched
);
497 isl_union_map_free(gen
->read
);
498 isl_union_map_free(gen
->write
);
501 static void print_reverse_list(FILE *out
, int len
, int *list
)
505 for (i
= 0; i
< len
; ++i
) {
508 fprintf(out
, "%d", list
[len
- 1 - i
]);
512 static void print_kernel_launch(struct cuda_gen
*gen
,
513 __isl_keep isl_union_set
*arrays
)
520 print_indent(gen
->code
.dst
, gen
->code
.indent
);
521 fprintf(gen
->code
.dst
, "kernel%d <<<k%d_dimGrid, k%d_dimBlock>>> (",
522 gen
->kernel_id
, gen
->kernel_id
, gen
->kernel_id
);
523 fprintf(gen
->cuda
.kernel_c
, "__global__ void kernel%d(",
525 fprintf(gen
->cuda
.kernel_h
, "__global__ void kernel%d(",
528 for (i
= 0; i
< gen
->n_array
; ++i
) {
533 dim
= isl_dim_copy(gen
->array
[i
].dim
);
534 arr
= isl_union_set_extract_set(arrays
, dim
);
535 empty
= isl_set_fast_is_empty(arr
);
541 fprintf(gen
->code
.dst
, ", ");
542 fprintf(gen
->cuda
.kernel_c
, ", ");
543 fprintf(gen
->cuda
.kernel_h
, ", ");
546 fprintf(gen
->code
.dst
, "dev_%s", gen
->array
[i
].name
);
547 fprintf(gen
->cuda
.kernel_c
, "%s *%s",
548 gen
->options
->type
, gen
->array
[i
].name
);
549 fprintf(gen
->cuda
.kernel_h
, "%s *%s",
550 gen
->options
->type
, gen
->array
[i
].name
);
555 dim
= isl_union_set_get_dim(arrays
);
556 nparam
= isl_dim_size(dim
, isl_dim_param
);
557 for (i
= 0; i
< nparam
; ++i
) {
558 const char *name
= isl_dim_get_name(dim
, isl_dim_param
, i
);
560 fprintf(gen
->code
.dst
, ", ");
561 fprintf(gen
->cuda
.kernel_c
, ", ");
562 fprintf(gen
->cuda
.kernel_h
, ", ");
564 fprintf(gen
->code
.dst
, "%s", name
);
565 fprintf(gen
->cuda
.kernel_c
, "int %s", name
);
566 fprintf(gen
->cuda
.kernel_h
, "int %s", name
);
571 for (i
= 0; i
< gen
->tile_first
; ++i
) {
573 fprintf(gen
->code
.dst
, ", ");
574 fprintf(gen
->cuda
.kernel_c
, ", ");
575 fprintf(gen
->cuda
.kernel_h
, ", ");
577 fprintf(gen
->code
.dst
, "h%d", i
);
578 fprintf(gen
->cuda
.kernel_c
, "int h%d", i
);
579 fprintf(gen
->cuda
.kernel_h
, "int h%d", i
);
583 fprintf(gen
->code
.dst
, ");\n");
584 fprintf(gen
->cuda
.kernel_c
, ")\n");
585 fprintf(gen
->cuda
.kernel_h
, ");\n");
588 /* Construct a map from a domain of dimensionality "len"
589 * to a domain of dimensionality "len" + "tile_len" that tiles
590 * the "tile_len" coordinates starting at "first".
591 * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
592 * "dim" prescribes the parameters.
594 static __isl_give isl_map
*tile(__isl_take isl_dim
*dim
, int len
,
595 int first
, int tile_len
, int *tile_size
)
604 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
605 dim
= isl_dim_add(dim
, isl_dim_out
, len
+ tile_len
);
606 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
608 for (i
= 0; i
< len
- tile_len
; ++i
) {
609 int j
= i
< first
? i
: i
+ tile_len
;
610 int k
= i
< first
? i
: i
+ 2 * tile_len
;
612 c
= isl_equality_alloc(isl_dim_copy(dim
));
613 isl_int_set_si(v
, -1);
614 isl_constraint_set_coefficient(c
, isl_dim_in
, j
, v
);
615 isl_int_set_si(v
, 1);
616 isl_constraint_set_coefficient(c
, isl_dim_out
, k
, v
);
617 bmap
= isl_basic_map_add_constraint(bmap
, c
);
620 for (i
= 0; i
< tile_len
; ++i
) {
621 c
= isl_equality_alloc(isl_dim_copy(dim
));
622 isl_int_set_si(v
, -1);
623 isl_constraint_set_coefficient(c
, isl_dim_in
, first
+ i
, v
);
624 isl_int_set_si(v
, tile_size
[i
]);
625 isl_constraint_set_coefficient(c
, isl_dim_out
, first
+ i
, v
);
626 isl_int_set_si(v
, 1);
627 isl_constraint_set_coefficient(c
, isl_dim_out
,
628 first
+ i
+ tile_len
, v
);
629 bmap
= isl_basic_map_add_constraint(bmap
, c
);
631 c
= isl_inequality_alloc(isl_dim_copy(dim
));
632 isl_int_set_si(v
, 1);
633 isl_constraint_set_coefficient(c
, isl_dim_out
,
634 first
+ i
+ tile_len
, v
);
635 bmap
= isl_basic_map_add_constraint(bmap
, c
);
637 c
= isl_inequality_alloc(isl_dim_copy(dim
));
638 isl_int_set_si(v
, -1);
639 isl_constraint_set_coefficient(c
, isl_dim_out
,
640 first
+ i
+ tile_len
, v
);
641 isl_int_set_si(v
, tile_size
[i
] - 1);
642 isl_constraint_set_constant(c
, v
);
643 bmap
= isl_basic_map_add_constraint(bmap
, c
);
649 return isl_map_from_basic_map(bmap
);
652 /* Construct a map from a domain of dimensionality "len"
653 * to a domain of dimensionality "len" + "wrap_len" that "wraps"
654 * the "wrap_len" coordinates starting at "first" according to "wrap_size".
655 * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
656 * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
657 * that are projected out at the end.
658 * "dim" prescribes the parameters.
660 static __isl_give isl_map
*wrap(__isl_take isl_dim
*dim
, int len
,
661 int first
, int wrap_len
, int *wrap_size
)
667 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
668 dim
= isl_dim_add(dim
, isl_dim_out
, len
+ 2 * wrap_len
);
669 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
671 for (i
= 0; i
< len
; ++i
) {
672 int k
= i
< first
+ wrap_len
? i
: i
+ 2 * wrap_len
;
674 c
= isl_equality_alloc(isl_dim_copy(dim
));
675 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, -1);
676 isl_constraint_set_coefficient_si(c
, isl_dim_out
, k
, 1);
677 bmap
= isl_basic_map_add_constraint(bmap
, c
);
680 for (i
= 0; i
< wrap_len
; ++i
) {
681 c
= isl_equality_alloc(isl_dim_copy(dim
));
682 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
684 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
685 first
+ wrap_len
+ i
, 1);
686 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
687 first
+ 2 * wrap_len
+ i
, wrap_size
[i
]);
688 bmap
= isl_basic_map_add_constraint(bmap
, c
);
690 c
= isl_inequality_alloc(isl_dim_copy(dim
));
691 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
692 first
+ wrap_len
+ i
, 1);
693 bmap
= isl_basic_map_add_constraint(bmap
, c
);
695 c
= isl_inequality_alloc(isl_dim_copy(dim
));
696 isl_constraint_set_coefficient_si(c
, isl_dim_out
,
697 first
+ wrap_len
+ i
, -1);
698 isl_constraint_set_constant_si(c
, wrap_size
[i
] - 1);
699 bmap
= isl_basic_map_add_constraint(bmap
, c
);
704 bmap
= isl_basic_map_project_out(bmap
, isl_dim_out
,
705 first
+ 2 * wrap_len
, wrap_len
);
707 return isl_map_from_basic_map(bmap
);
710 /* Add "n" parameters named prefix%d.
712 static __isl_give isl_set
*add_params( __isl_take isl_set
*set
,
713 int n
, const char *prefix
)
719 nparam
= isl_set_dim(set
, isl_dim_param
);
720 set
= isl_set_add_dims(set
, isl_dim_param
, n
);
722 for (i
= 0; i
< n
; ++i
) {
723 snprintf(name
, sizeof(name
), "%s%d", prefix
, i
);
724 set
= isl_set_set_dim_name(set
, isl_dim_param
,
731 /* Equate the "n" dimensions of "set" starting at "first" to
732 * freshly created parameters named prefix%d.
734 static __isl_give isl_set
*parametrize(__isl_take isl_set
*set
,
735 int first
, int n
, const char *prefix
)
744 nparam
= isl_set_dim(set
, isl_dim_param
);
746 set
= add_params(set
, n
, prefix
);
748 dim
= isl_set_get_dim(set
);
749 bset
= isl_basic_set_universe(isl_dim_copy(dim
));
753 for (i
= 0; i
< n
; ++i
) {
754 c
= isl_equality_alloc(isl_dim_copy(dim
));
755 isl_int_set_si(v
, -1);
756 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
757 isl_int_set_si(v
, 1);
758 isl_constraint_set_coefficient(c
, isl_dim_set
, first
+ i
, v
);
759 bset
= isl_basic_set_add_constraint(bset
, c
);
765 return isl_set_intersect(set
, isl_set_from_basic_set(bset
));
768 static __isl_give isl_set
*parametrization(__isl_take isl_dim
*dim
,
769 int len
, int first
, int n
, const char *prefix
)
773 dim
= isl_dim_add(dim
, isl_dim_set
, len
);
774 set
= isl_set_universe(dim
);
776 return parametrize(set
, first
, n
, prefix
);
779 /* Tile the B loops over the tile sizes and then tile/wrap
780 * the T1 loops over the blocks.
782 static __isl_give isl_union_map
*tile_schedule(struct cuda_gen
*gen
,
783 __isl_take isl_union_map
*sched
)
786 isl_map
*tiling
, *block_tiling
;
788 dim
= isl_union_map_get_dim(sched
);
789 tiling
= tile(isl_dim_copy(dim
), gen
->untiled_len
,
790 gen
->tile_first
, gen
->tile_len
, gen
->tile_size
);
792 if (gen
->options
->wrap
)
793 block_tiling
= wrap(dim
, gen
->untiled_len
+ gen
->tile_len
,
794 gen
->tile_first
, gen
->n_grid
, gen
->grid_dim
);
796 block_tiling
= tile(dim
, gen
->untiled_len
+ gen
->tile_len
,
797 gen
->tile_first
, gen
->n_grid
, gen
->grid_dim
);
799 gen
->tiled_len
= gen
->untiled_len
+ gen
->tile_len
+ gen
->n_grid
;
801 tiling
= isl_map_apply_range(tiling
, block_tiling
);
803 sched
= isl_union_map_apply_range(sched
,
804 isl_union_map_from_map(tiling
));
806 gen
->shared_len
= gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
;
811 static __isl_give isl_union_map
*parametrize_tiled_schedule(
812 struct cuda_gen
*gen
, __isl_take isl_union_map
*sched
)
817 dim
= isl_union_map_get_dim(sched
);
818 par
= parametrization(dim
, gen
->tiled_len
, 0, gen
->tile_first
, "h");
819 sched
= isl_union_map_intersect_range(sched
,
820 isl_union_set_from_set(par
));
822 dim
= isl_union_map_get_dim(sched
);
823 par
= parametrization(dim
, gen
->tiled_len
,
824 gen
->tile_first
+ gen
->n_grid
, gen
->n_grid
, "b");
825 sched
= isl_union_map_intersect_range(sched
,
826 isl_union_set_from_set(par
));
831 /* Tile/wrap the P1 loops over the threads.
833 static __isl_give isl_union_map
*thread_tile_schedule(struct cuda_gen
*gen
,
834 __isl_take isl_union_map
*sched
)
840 dim
= isl_union_map_get_dim(sched
);
842 if (gen
->options
->wrap
)
843 tiling
= wrap(isl_dim_copy(dim
), gen
->tiled_len
,
844 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
846 tiling
= tile(isl_dim_copy(dim
), gen
->tiled_len
,
847 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
848 gen
->thread_tiled_len
= gen
->tiled_len
+ gen
->n_block
;
850 sched
= isl_union_map_apply_range(sched
,
851 isl_union_map_from_map(tiling
));
853 par
= parametrization(dim
, gen
->thread_tiled_len
,
854 gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
+ gen
->n_block
,
856 sched
= isl_union_map_intersect_range(sched
,
857 isl_union_set_from_set(par
));
859 gen
->shared_len
= gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
;
864 /* If the user asked for it, scale the shared memory tile loops
865 * (T1P and T2) of "sched" by gen->tile_size[i].
866 * If we are not performing "wrapping", then additionally scale the T1P
867 * loops by gen->grid_dim[i].
869 static __isl_give isl_union_map
*scale_tile_loops(struct cuda_gen
*gen
,
870 __isl_take isl_union_map
*sched
)
874 isl_basic_map
*scale
;
877 if (!gen
->options
->scale_tile_loops
)
880 dim
= isl_union_map_get_dim(sched
);
881 dim
= isl_dim_add(dim
, isl_dim_in
, gen
->tiled_len
);
882 dim
= isl_dim_add(dim
, isl_dim_out
, gen
->tiled_len
);
883 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
885 for (i
= 0; i
< gen
->tiled_len
; ++i
) {
888 if (i
>= gen
->tile_first
&& i
< gen
->tile_first
+ gen
->n_grid
) {
889 f
= gen
->tile_size
[i
- gen
->tile_first
];
890 if (!gen
->options
->wrap
)
891 f
*= gen
->grid_dim
[i
- gen
->tile_first
];
892 } else if (i
>= gen
->tile_first
+ gen
->n_grid
&&
893 i
< gen
->tile_first
+ gen
->n_grid
+ gen
->tile_len
) {
894 f
= gen
->tile_size
[i
- (gen
->tile_first
+ gen
->n_grid
)];
897 c
= isl_equality_alloc(isl_dim_copy(dim
));
898 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
899 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
900 scale
= isl_basic_map_add_constraint(scale
, c
);
905 sched
= isl_union_map_apply_range(sched
,
906 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
911 /* If we are not performing "wrapping" and if the user asked for it,
912 * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
914 static __isl_give isl_union_map
*scale_thread_tile_loops(struct cuda_gen
*gen
,
915 __isl_take isl_union_map
*sched
)
919 isl_basic_map
*scale
;
922 if (gen
->options
->wrap
)
924 if (!gen
->options
->scale_tile_loops
)
927 dim
= isl_union_map_get_dim(sched
);
928 dim
= isl_dim_add(dim
, isl_dim_in
, gen
->thread_tiled_len
);
929 dim
= isl_dim_add(dim
, isl_dim_out
, gen
->thread_tiled_len
);
930 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
932 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
) {
935 if (i
>= gen
->shared_len
&&
936 i
< gen
->shared_len
+ gen
->n_block
)
937 f
= gen
->block_dim
[i
- gen
->shared_len
];
939 c
= isl_equality_alloc(isl_dim_copy(dim
));
940 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
941 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
942 scale
= isl_basic_map_add_constraint(scale
, c
);
947 sched
= isl_union_map_apply_range(sched
,
948 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
953 /* If we are not performing "wrapping" and if the user asked for it,
954 * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
956 static __isl_give isl_union_map
*scale_access_tile_loops(struct cuda_gen
*gen
,
957 __isl_take isl_union_map
*sched
, int len
, int first
, int n_tile
)
961 isl_basic_map
*scale
;
964 if (gen
->options
->wrap
)
966 if (!gen
->options
->scale_tile_loops
)
969 dim
= isl_union_map_get_dim(sched
);
970 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
971 dim
= isl_dim_add(dim
, isl_dim_out
, len
);
972 scale
= isl_basic_map_universe(isl_dim_copy(dim
));
974 for (i
= 0; i
< len
; ++i
) {
977 if (i
>= first
&& i
< first
+ n_tile
)
978 f
= gen
->block_dim
[i
- first
];
980 c
= isl_equality_alloc(isl_dim_copy(dim
));
981 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, f
);
982 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
983 scale
= isl_basic_map_add_constraint(scale
, c
);
988 sched
= isl_union_map_apply_range(sched
,
989 isl_union_map_from_map(isl_map_from_basic_map(scale
)));
994 /* If print_user_stmt is set, we want to print the statements ourselves,
995 * instead of relying on the C preprocessor. If so, we need to use
996 * the stop option so that the domains will be saved on the statement
999 static void print_cloog_shared_body(struct cuda_gen
*gen
,
1000 __isl_keep isl_set
*context
, __isl_keep isl_union_map
*sched
, int len
,
1001 void (*print_user_stmt
)(struct gpucode_info
*info
,
1002 struct clast_user_stmt
*s
),
1006 CloogOptions
*options
;
1007 CloogDomain
*cloog_context
;
1008 CloogUnionDomain
*ud
;
1010 struct clast_stmt
*stmt
;
1013 sched
= isl_union_map_copy(sched
);
1014 sched
= isl_union_map_align_params(sched
, isl_set_get_dim(context
));
1016 options
= cloog_options_malloc(gen
->state
);
1017 options
->language
= LANGUAGE_C
;
1018 options
->strides
= 1;
1022 options
->override
= 1;
1023 options
->save_domains
= 1;
1024 options
->noscalars
= 1;
1025 options
->first_unroll
= first_unroll
;
1027 ud
= cloog_union_domain_from_isl_union_map(sched
);
1028 for (i
= 0; i
< len
; ++i
) {
1029 snprintf(name
, sizeof(name
), "c%d", i
);
1030 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
1032 cloog_context
= cloog_domain_from_isl_set(isl_set_copy(context
));
1033 input
= cloog_input_alloc(cloog_context
, ud
);
1035 stmt
= cloog_clast_create_from_input(input
, options
);
1037 gen
->stmt_code
.indent
= gen
->kernel_code
.indent
;
1038 gen
->stmt_code
.dst
= gen
->cuda
.kernel_c
;
1039 gen
->stmt_code
.print_user_stmt
= print_user_stmt
;
1040 gen
->stmt_code
.print_user_stmt_list
= NULL
;
1041 gen
->stmt_code
.print_for_head
= NULL
;
1042 gen
->stmt_code
.print_for_foot
= NULL
;
1043 gen
->stmt_code
.user
= gen
;
1044 gpu_print_host_stmt(&gen
->stmt_code
, stmt
);
1046 cloog_clast_free(stmt
);
1047 cloog_options_free(options
);
1050 /* Add "len" parameters p[i] called prefix%d,
1051 * with bounds to 0 <= p[i] < size[i].
1053 __isl_give isl_set
*add_bounded_parameters(__isl_take isl_set
*set
,
1054 int len
, int *size
, const char *prefix
)
1060 isl_basic_set
*bset
;
1064 nparam
= isl_set_dim(set
, isl_dim_param
);
1065 set
= isl_set_add_dims(set
, isl_dim_param
, len
);
1067 for (i
= 0; i
< len
; ++i
) {
1068 snprintf(name
, sizeof(name
), "%s%d", prefix
, i
);
1069 set
= isl_set_set_dim_name(set
, isl_dim_param
,
1073 dim
= isl_set_get_dim(set
);
1074 bset
= isl_basic_set_universe(isl_dim_copy(dim
));
1078 for (i
= 0; i
< len
; ++i
) {
1079 c
= isl_inequality_alloc(isl_dim_copy(dim
));
1080 isl_int_set_si(v
, 1);
1081 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
1082 bset
= isl_basic_set_add_constraint(bset
, c
);
1084 c
= isl_inequality_alloc(isl_dim_copy(dim
));
1085 isl_int_set_si(v
, -1);
1086 isl_constraint_set_coefficient(c
, isl_dim_param
, nparam
+ i
, v
);
1087 isl_int_set_si(v
, size
[i
] - 1);
1088 isl_constraint_set_constant(c
, v
);
1089 bset
= isl_basic_set_add_constraint(bset
, c
);
1095 return isl_set_intersect(set
, isl_set_from_basic_set(bset
));
1098 static void print_shared_body(struct cuda_gen
*gen
,
1099 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*sched
,
1100 int len
, void (*print_user_stmt
)(struct gpucode_info
*info
,
1101 struct clast_user_stmt
*s
),
1106 context
= isl_set_copy(shared_domain
);
1107 context
= parametrize(context
, 0, gen
->shared_len
, "g");
1108 context
= isl_set_project_out(context
, isl_dim_set
, 0, gen
->shared_len
);
1109 context
= add_bounded_parameters(context
,
1110 gen
->n_block
, gen
->block_dim
, "t");
1112 print_cloog_shared_body(gen
, context
, sched
, len
, print_user_stmt
,
1115 isl_set_free(context
);
1118 /* Given a tile of an array, construct a map that maps each element
1119 * of the tile to a copy of the tile shifted to the origin
1120 * (based on the lower bounds in group->private_bound or group->shared_bound).
1121 * If any of the indices is strided, then {private,shared}_bound[i].shift_map
1122 * is applied to the index first.
1123 * The domain of the resulting map is "access",
1124 * while the range space is anonymous.
1126 static __isl_give isl_map
*shift_access(__isl_take isl_set
*access
,
1127 struct cuda_array_ref_group
*group
)
1131 isl_basic_set
*bset
;
1132 isl_basic_map
*bmap
;
1134 isl_basic_set
*offset
;
1135 isl_basic_map
*shift
;
1136 isl_basic_map
*pre_shift
;
1139 struct cuda_array_bound
*bounds
;
1140 int n_index
= group
->array
->n_index
;
1142 bounds
= group
->private_bound
;
1144 bounds
= group
->shared_bound
;
1146 dim
= isl_set_get_dim(access
);
1147 dim
= isl_dim_drop(dim
, isl_dim_set
, 0, n_index
);
1148 offset
= isl_basic_set_universe(dim
);
1149 for (i
= 0; i
< n_index
; ++i
) {
1150 lb
= isl_aff_copy(bounds
[i
].lb
);
1151 bmap
= isl_basic_map_from_aff(lb
);
1152 bset
= isl_basic_map_range(bmap
);
1153 offset
= isl_basic_set_flat_product(offset
, bset
);
1155 offset
= isl_basic_set_neg(offset
);
1157 dim
= isl_dim_map_from_set(isl_set_get_dim(access
));
1158 shift
= isl_basic_map_identity(dim
);
1159 shift
= isl_basic_map_set_tuple_name(shift
, isl_dim_out
, NULL
);
1161 bset
= isl_basic_set_universe(isl_set_get_dim(access
));
1162 bmap
= isl_basic_map_from_domain_and_range(bset
, offset
);
1164 shift
= isl_basic_map_sum(shift
, bmap
);
1166 dim
= isl_set_get_dim(access
);
1167 dim
= isl_dim_drop(dim
, isl_dim_set
, 0, n_index
);
1168 dim
= isl_dim_map_from_set(dim
);
1169 pre_shift
= isl_basic_map_universe(isl_dim_copy(dim
));
1170 dim
= isl_dim_add(dim
, isl_dim_in
, 1);
1171 dim
= isl_dim_add(dim
, isl_dim_out
, 1);
1172 for (i
= 0; i
< n_index
; ++i
) {
1173 if (!bounds
[i
].shift_map
)
1174 bmap
= isl_basic_map_identity(isl_dim_copy(dim
));
1176 bmap
= isl_basic_map_copy(bounds
[i
].shift_map
);
1177 pre_shift
= isl_basic_map_flat_product(pre_shift
, bmap
);
1180 name
= isl_basic_map_get_tuple_name(shift
, isl_dim_in
);
1181 pre_shift
= isl_basic_map_set_tuple_name(pre_shift
, isl_dim_in
, name
);
1182 pre_shift
= isl_basic_map_set_tuple_name(pre_shift
, isl_dim_out
, name
);
1183 shift
= isl_basic_map_apply_range(pre_shift
, shift
);
1185 sched
= isl_map_from_basic_map(shift
);
1186 sched
= isl_map_intersect_domain(sched
, access
);
1191 /* Construct a schedule for iterating over all elements in the given
1192 * piece of an array. The schedule iterates over a copy of the piece
1193 * that is shifted to the origin.
1194 * We subsequently also perform the tiling/wrapping over the threads.
1196 * In particular, we tile the final iterators so that the final thread
1197 * dimension runs over the final array dimension.
1198 * However, if those final iterators have only a single iteration,
1199 * we try to tile earlier iterators instead.
1201 static __isl_give isl_union_map
*access_schedule(struct cuda_gen
*gen
,
1202 __isl_take isl_set
*access
, struct cuda_array_ref_group
*group
)
1206 isl_union_map
*usched
;
1209 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1213 sched
= shift_access(access
, group
);
1215 n_tile
= gen
->n_block
;
1216 if (n_tile
> nvar
) {
1218 sched
= isl_map_insert(sched
, isl_dim_out
, 0, n_tile
- nvar
);
1219 for (i
= 0; i
< n_tile
- nvar
; ++i
)
1220 sched
= isl_map_fix_si(sched
, isl_dim_out
, i
, 0);
1224 first
= nvar
- n_tile
;
1226 for (; first
> 0; first
--)
1227 if (!isl_map_plain_is_fixed(sched
, isl_dim_out
,
1228 first
+ n_tile
- 1, NULL
))
1231 dim
= isl_map_get_dim(sched
);
1232 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
1233 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, nvar
);
1234 if (gen
->options
->wrap
)
1235 tiling
= wrap(isl_dim_copy(dim
), nvar
, first
,
1236 n_tile
, gen
->block_dim
);
1238 tiling
= tile(isl_dim_copy(dim
), nvar
, first
,
1239 n_tile
, gen
->block_dim
);
1240 sched
= isl_map_apply_range(sched
, tiling
);
1242 par
= parametrization(dim
, nvar
+ n_tile
, first
+ n_tile
, n_tile
, "t");
1243 usched
= isl_union_map_from_map(sched
);
1244 usched
= isl_union_map_intersect_range(usched
,
1245 isl_union_set_from_set(par
));
1247 usched
= scale_access_tile_loops(gen
, usched
, nvar
+ n_tile
,
1253 static void print_shared_access(struct cuda_gen
*gen
,
1254 __isl_keep isl_set
*shared_domain
, __isl_take isl_set
*access
,
1255 const char *type
, struct cuda_array_ref_group
*group
)
1257 const char *array_name
;
1260 isl_union_map
*sched
;
1261 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1264 ctx
= isl_set_get_ctx(access
);
1265 array_name
= isl_set_get_tuple_name(access
);
1266 name
= isl_alloc_array(ctx
, char,
1267 strlen(type
) + sizeof("_shared_") + strlen(array_name
) + 20);
1268 if (group
->array
->n_group
> 1)
1269 sprintf(name
, "%s_shared_%s_%d", type
, array_name
, group
->nr
);
1271 sprintf(name
, "%s_shared_%s", type
, array_name
);
1272 access
= isl_set_set_tuple_name(access
, name
);
1275 sched
= access_schedule(gen
, access
, group
);
1277 n_tile
= gen
->n_block
;
1281 print_shared_body(gen
, shared_domain
, sched
, nvar
+ n_tile
, NULL
, -1);
1283 isl_union_map_free(sched
);
1286 /* Return the union of all read (read = 1) and/or write (write = 1)
1287 * access relations in the group.
1289 static __isl_give isl_union_map
*group_access_relation(
1290 struct cuda_array_ref_group
*group
, int read
, int write
)
1293 isl_union_map
*access
;
1295 access
= isl_union_map_empty(isl_map_get_dim(group
->access
));
1296 for (i
= 0; i
< group
->n_ref
; ++i
) {
1299 if (!((read
&& group
->refs
[i
]->read
) ||
1300 (write
&& group
->refs
[i
]->write
)))
1302 map_i
= isl_map_copy(group
->refs
[i
]->access
);
1303 access
= isl_union_map_union(access
,
1304 isl_union_map_from_map(map_i
));
1310 /* Check that none of the shared memory tiles involve any strides.
1312 static int no_strides(struct cuda_array_ref_group
*group
)
1315 int n_index
= group
->array
->n_index
;
1317 for (i
= 0; i
< n_index
; ++i
)
1318 if (group
->shared_bound
[i
].shift
)
1324 /* Return a set containing the values of the given index i
1325 * of the elements in the array tile in global memory that corresponds
1326 * to the shared memory copy.
1327 * In particular, if a is the index, we return a set with constraints
1329 * tile_offset <= a <= tile_offset + tile_size - 1
1333 * 0 <= a <= array_size - 1
1336 static __isl_give isl_set
*group_tile_dim(struct cuda_array_ref_group
*group
,
1339 isl_basic_set
*tile
;
1342 isl_local_space
*ls
;
1347 aff
= isl_aff_copy(group
->shared_bound
[i
].lb
);
1348 aff
= isl_aff_add_dims(aff
, isl_dim_set
, 1);
1349 ls
= isl_aff_get_local_space(aff
);
1350 aff
= isl_aff_neg(aff
);
1351 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1352 c
= isl_inequality_from_aff(isl_aff_copy(aff
));
1353 tile
= isl_basic_set_from_constraint(c
);
1355 aff
= isl_aff_neg(aff
);
1356 aff
= isl_aff_add_constant(aff
, group
->shared_bound
[i
].size
);
1357 aff
= isl_aff_add_constant_si(aff
, -1);
1358 c
= isl_inequality_from_aff(aff
);
1359 tile
= isl_basic_set_add_constraint(tile
, c
);
1361 aff
= isl_aff_zero(ls
);
1362 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1363 c
= isl_inequality_from_aff(aff
);
1364 tile
= isl_basic_set_add_constraint(tile
, c
);
1366 bound
= isl_pw_aff_copy(group
->array
->bound
[i
]);
1367 bound
= isl_pw_aff_add_dims(bound
, isl_dim_set
, 1);
1368 ls
= isl_local_space_from_dim(isl_pw_aff_get_dim(bound
));
1369 aff
= isl_aff_zero(ls
);
1370 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, 0, 1);
1371 aff
= isl_aff_add_constant_si(aff
, 1);
1372 dom
= isl_pw_aff_domain(isl_pw_aff_copy(bound
));
1374 tile_set
= isl_pw_aff_ge_set(bound
, isl_pw_aff_alloc(dom
, aff
));
1375 tile_set
= isl_set_align_params(tile_set
, isl_basic_set_get_dim(tile
));
1376 tile_set
= isl_set_intersect(tile_set
, isl_set_from_basic_set(tile
));
1381 /* Return a set containing the elements in the array tile in
1382 * global memory that corresponds to the shared memory copy.
1384 static __isl_give isl_set
*group_tile(struct cuda_array_ref_group
*group
)
1387 int n_index
= group
->array
->n_index
;
1390 tile
= group_tile_dim(group
, 0);
1391 for (i
= 1; i
< n_index
; ++i
) {
1394 tile_i
= group_tile_dim(group
, i
);
1395 tile
= isl_set_flat_product(tile
, tile_i
);
1398 tile
= isl_set_set_tuple_name(tile
, group
->array
->name
);
1403 /* Print code for reading into or writing from shared memory
1404 * the given array reference group.
1406 * sched maps the original iteration domains to the shared memory tile loops.
1408 * If we are performing a read from global memory to shared memory,
1409 * if the array involved is not a scalar and if the definition of the
1410 * shared memory tiles does not involve any strides, then we copy
1411 * the entire tile to shared memory. This may result in some extra
1412 * elements getting copied, but it should lead to simpler code
1413 * (which means that fewer registers may be needed) and less divergence.
1415 * Otherwise, we only copy the elements that will be read or have been written
1418 * Note that the absence of stride requirement can easily be lifted.
1419 * We would just need to add constraints of the form
1421 * shift + a = stride * alpha
1423 static int print_group_shared_accesses(struct cuda_gen
*gen
,
1424 struct cuda_array_ref_group
*group
, const char *type
,
1425 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*sched
)
1428 isl_union_map
*access
;
1429 isl_union_set
*uset
;
1430 isl_set
*access_set
;
1432 if (group
->private_bound
)
1434 if (!group
->shared_bound
)
1437 read
= !strcmp(type
, "read");
1439 access
= group_access_relation(group
, read
, !read
);
1440 access
= isl_union_map_apply_domain(access
, isl_union_map_copy(sched
));
1441 uset
= isl_union_map_range(access
);
1443 if (isl_union_set_is_empty(uset
)) {
1444 isl_union_set_free(uset
);
1448 if (read
&& group
->array
->n_index
> 0 && no_strides(group
)) {
1449 isl_union_set_free(uset
);
1450 access_set
= group_tile(group
);
1451 print_shared_access(gen
, shared_domain
, access_set
,
1456 access_set
= isl_set_from_union_set(uset
);
1457 access_set
= isl_set_coalesce(access_set
);
1459 print_shared_access(gen
, shared_domain
, access_set
, type
, group
);
1464 /* Print code for reading into or writing from shared memory at
1465 * the given level (-1 for innermost).
1467 * If we are not printing at the innermost level, then the dimensionality
1468 * of shared_domain may be smaller than gen->shared_len.
1469 * As the rest of the code assumes that the domain of access has
1470 * gen->shared_len dimensions, we therefore may need to embed this domain
1471 * in a higher dimensional space after intersection with shared_domain.
1473 static void print_shared_accesses(struct cuda_gen
*gen
,
1474 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*access
,
1475 const char *type
, int level
)
1481 int shared_len
= isl_set_dim(shared_domain
, isl_dim_set
);
1483 isl_union_map
*sched
;
1485 shared_domain
= isl_set_copy(shared_domain
);
1486 sched
= isl_union_map_copy(gen
->tiled_sched
);
1487 dim
= isl_union_map_get_dim(sched
);
1488 proj
= projection(dim
, gen
->tiled_len
, shared_len
);
1489 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
1490 sched
= isl_union_map_intersect_range(sched
,
1491 isl_union_set_from_set(isl_set_copy(shared_domain
)));
1492 if (shared_len
!= gen
->shared_len
) {
1493 dim
= isl_union_map_get_dim(sched
);
1494 proj
= projection(dim
, gen
->shared_len
, shared_len
);
1495 proj
= isl_map_reverse(proj
);
1496 shared_domain
= isl_set_apply(shared_domain
,
1497 isl_map_copy(proj
));
1498 sched
= isl_union_map_apply_range(sched
,
1499 isl_union_map_from_map(proj
));
1502 dim
= isl_union_map_get_dim(sched
);
1503 par
= parametrization(dim
, gen
->shared_len
, 0, gen
->shared_len
, "g");
1504 sched
= isl_union_map_intersect_range(sched
,
1505 isl_union_set_from_set(par
));
1507 for (i
= 0; i
< gen
->n_array
; ++i
) {
1508 struct cuda_array_info
*array
= &gen
->array
[i
];
1510 if (gen
->array
[i
].print_shared_level
!= level
)
1513 for (j
= 0; j
< array
->n_group
; ++j
) {
1514 if (print_group_shared_accesses(gen
, array
->groups
[j
],
1515 type
, shared_domain
, sched
))
1520 isl_union_map_free(sched
);
1521 isl_set_free(shared_domain
);
1524 print_indent(gen
->cuda
.kernel_c
, gen
->kernel_code
.indent
);
1525 fprintf(gen
->cuda
.kernel_c
, "__syncthreads();\n");
1529 /* Given an index expression into a tile of an array, adjust the expression
1530 * to a shift of the tile to the origin
1531 * (based on the lower bounds in array->shared_bound).
1532 * If the index is strided, then we first add
1533 * bound->shift and divide by bound->stride.
1535 static __isl_give isl_qpolynomial
*shift_index(__isl_take isl_qpolynomial
*qp
,
1536 struct cuda_array_info
*array
,
1537 struct cuda_array_bound
*bound
, __isl_take isl_set
*domain
)
1539 isl_qpolynomial
*lb
;
1542 isl_qpolynomial
*shift
, *t
;
1545 shift
= bound
->shift
;
1546 shift
= isl_qpolynomial_copy(shift
);
1547 shift
= isl_qpolynomial_drop_dims(shift
, isl_dim_set
, 0,
1548 isl_qpolynomial_dim(shift
, isl_dim_set
));
1549 shift
= isl_qpolynomial_align_params(shift
,
1550 isl_qpolynomial_get_dim(qp
));
1551 qp
= isl_qpolynomial_add(qp
, shift
);
1552 dim
= isl_qpolynomial_get_dim(qp
);
1554 isl_int_set_si(one
, 1);
1555 t
= isl_qpolynomial_rat_cst(dim
, one
, bound
->stride
);
1557 qp
= isl_qpolynomial_mul(qp
, t
);
1560 lb
= isl_qpolynomial_from_aff(isl_aff_copy(bound
->lb
));
1561 lb
= isl_qpolynomial_drop_dims(lb
, isl_dim_set
, 0,
1562 isl_qpolynomial_dim(lb
, isl_dim_set
));
1564 lb
= isl_qpolynomial_align_params(lb
, isl_qpolynomial_get_dim(qp
));
1566 qp
= isl_qpolynomial_sub(qp
, lb
);
1567 qp
= isl_qpolynomial_gist(qp
, domain
);
1572 /* This function is called for each access to an array in some statement
1573 * in the original code.
1574 * Replace that access by an access to shared or (linearized) global memory.
1575 * Since the array in shared memory is just
1576 * a shifted copy of part of the original array, we simply need
1577 * to subtract the lower bound, which was computed
1578 * in can_tile_for_shared_memory.
1579 * If any of the indices is strided, then we first add
1580 * shared_bound[i].shift and divide by shared_bound[i].stride.
1582 * If the given array is accessed directly from global memory,
1583 * we don't need to perform any shifting and simply simplify
1584 * expression in the context of the domain instead.
1586 * If the array space (range of access) has no name, then we are
1587 * accessing an iterator in the original program.
1589 static void print_access(struct cuda_gen
*gen
, __isl_take isl_map
*access
,
1595 struct cuda_array_info
*array
= NULL
;
1600 struct cuda_array_bound
*bounds
= NULL
;
1602 access
= isl_map_align_params(access
,
1603 isl_set_get_dim(gen
->stmt_domain
));
1605 data_set
= isl_set_apply(isl_set_copy(gen
->stmt_domain
), access
);
1607 name
= isl_set_get_tuple_name(data_set
);
1610 fprintf(gen
->cuda
.kernel_c
, "(");
1612 struct cuda_array_ref_group
*group
;
1614 for (i
= 0; i
< gen
->n_array
; ++i
) {
1615 if (strcmp(name
, gen
->array
[i
].name
))
1617 array
= &gen
->array
[i
];
1620 group
= array
->groups
[group_nr
];
1621 bounds
= group
->private_bound
;
1623 bounds
= group
->shared_bound
;
1625 print_array_name(gen
->cuda
.kernel_c
, group
);
1626 fprintf(gen
->cuda
.kernel_c
, "[");
1630 n_index
= isl_set_dim(data_set
, isl_dim_set
);
1631 aff
= isl_set_affine_hull(data_set
);
1633 prn
= isl_printer_to_file(gen
->ctx
, gen
->cuda
.kernel_c
);
1634 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1637 for (i
= 0; i
+ 1 < n_index
; ++i
)
1638 prn
= isl_printer_print_str(prn
, "(");
1640 for (i
= 0; i
< n_index
; ++i
) {
1642 isl_qpolynomial
*qp
;
1645 ok
= isl_basic_set_has_defining_equality(aff
,
1646 isl_dim_out
, i
, &c
);
1648 qp
= isl_qpolynomial_from_constraint(c
, isl_dim_out
, i
);
1649 qp
= isl_qpolynomial_drop_dims(qp
, isl_dim_set
, 0,
1650 isl_qpolynomial_dim(qp
, isl_dim_set
));
1653 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1654 isl_qpolynomial_free(qp
);
1658 domain
= isl_set_copy(gen
->stmt_domain
);
1659 domain
= isl_set_project_out(domain
, isl_dim_set
, 0,
1660 isl_set_dim(domain
, isl_dim_set
));
1662 qp
= isl_qpolynomial_gist(qp
, domain
);
1664 qp
= shift_index(qp
, array
, &bounds
[i
], domain
);
1668 prn
= isl_printer_print_str(prn
, ") * (");
1669 prn
= isl_printer_print_pw_aff(prn
,
1670 array
->local_bound
[i
]);
1671 prn
= isl_printer_print_str(prn
, ") + ");
1673 prn
= isl_printer_print_str(prn
, "][");
1675 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1676 isl_qpolynomial_free(qp
);
1679 prn
= isl_printer_print_str(prn
, ")");
1681 prn
= isl_printer_print_str(prn
, "]");
1682 isl_printer_free(prn
);
1684 isl_basic_set_free(aff
);
1687 static void print_stmt_body(struct cuda_gen
*gen
,
1688 FILE *out
, struct cuda_stmt
*stmt
)
1691 struct cuda_stmt_access
*access
;
1693 for (access
= stmt
->accesses
; access
; access
= access
->next
) {
1694 fwrite(stmt
->text
+ last
, 1, access
->text_offset
- last
, out
);
1695 last
= access
->text_offset
+ access
->text_len
;
1697 print_access(gen
, isl_map_copy(access
->access
),
1701 fprintf(out
, "%s\n", stmt
->text
+ last
);
1704 /* This function is called for each leaf in the innermost clast,
1705 * i.e., for each statemetn.
1706 * We print the statement body, simplifying the accesses based
1709 static void print_statement(struct gpucode_info
*code
,
1710 struct clast_user_stmt
*u
)
1712 struct cuda_gen
*gen
= code
->user
;
1715 isl_set
*stmt_domain
;
1716 isl_union_map
*stmt_sched
;
1717 isl_union_set
*uset
;
1719 struct cuda_stmt
*stmt
;
1721 nr
= atoi(u
->statement
->name
+ 2);
1722 stmt
= &gen
->stmts
[nr
];
1724 stmt_domain
= extract_host_domain(u
);
1726 stmt_sched
= isl_union_map_intersect_range(
1727 isl_union_map_copy(gen
->local_sched
),
1728 isl_union_set_from_set(extend(stmt_domain
,
1729 gen
->thread_tiled_len
)));
1730 dim
= isl_union_map_get_dim(stmt_sched
);
1731 par
= parametrization(dim
, gen
->thread_tiled_len
, 0,
1732 gen
->thread_tiled_len
, "c");
1733 stmt_sched
= isl_union_map_intersect_range(stmt_sched
,
1734 isl_union_set_from_set(par
));
1736 uset
= isl_union_map_domain(stmt_sched
);
1737 dim
= isl_union_set_get_dim(uset
);
1738 dim
= isl_dim_add(dim
, isl_dim_set
,
1739 isl_set_dim(stmt
->domain
, isl_dim_set
));
1740 dim
= isl_dim_set_tuple_name(dim
, isl_dim_set
, u
->statement
->name
);
1741 gen
->stmt_domain
= isl_union_set_extract_set(uset
, dim
);
1742 isl_union_set_free(uset
);
1744 print_indent(code
->dst
, code
->indent
);
1745 print_stmt_body(gen
, code
->dst
, stmt
);
1747 isl_set_free(gen
->stmt_domain
);
1750 /* Print an access to the element in the global memory copy of the
1751 * given array that corresponds to element [qp[0]][qp[1]]...
1752 * of the original array.
1753 * The copy in global memory has been linearized, so we need to take
1754 * the array size into account.
1756 static void print_private_global_index(isl_ctx
*ctx
, FILE *out
,
1757 struct cuda_array_info
*array
, __isl_keep isl_qpolynomial
**qp
)
1762 fprintf(out
, "%s[", array
->name
);
1763 prn
= isl_printer_to_file(ctx
, out
);
1764 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1765 for (i
= 0; i
+ 1 < array
->n_index
; ++i
)
1766 prn
= isl_printer_print_str(prn
, "(");
1767 for (i
= 0; i
< array
->n_index
; ++i
) {
1769 prn
= isl_printer_print_str(prn
, ") * (");
1770 prn
= isl_printer_print_pw_aff(prn
,
1771 array
->local_bound
[i
]);
1772 prn
= isl_printer_print_str(prn
, ") + ");
1774 prn
= isl_printer_print_qpolynomial(prn
, qp
[i
]);
1776 isl_printer_free(prn
);
1780 /* Print an access to the element in the shared memory copy of the
1781 * given array reference group that corresponds to element [qps[0]][qps[1]]...
1782 * of the original array.
1783 * Since the array in shared memory is just a shifted copy of part
1784 * of the original array, we simply need to subtract the lower bound,
1785 * which was computed in can_tile_for_shared_memory.
1786 * If any of the indices is strided, then we first add
1787 * shared_bound[i].shift and divide by shared_bound[i].stride.
1789 static void print_private_local_index(isl_ctx
*ctx
, FILE *out
,
1790 struct cuda_array_ref_group
*group
,
1791 __isl_keep isl_qpolynomial
**qps
, __isl_keep isl_set
*domain
)
1795 struct cuda_array_info
*array
= group
->array
;
1796 struct cuda_array_bound
*bounds
= group
->private_bound
;
1798 print_array_name(out
, group
);
1799 for (i
= 0; i
< array
->n_index
; ++i
) {
1800 isl_qpolynomial
*qp
= isl_qpolynomial_copy(qps
[i
]);
1802 qp
= shift_index(qp
, array
, &bounds
[i
], isl_set_copy(domain
));
1805 prn
= isl_printer_to_file(ctx
, out
);
1806 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
1807 prn
= isl_printer_print_qpolynomial(prn
, qp
);
1808 isl_printer_free(prn
);
1810 isl_qpolynomial_free(qp
);
1814 /* This function is called for each leaf in the clast of the code
1815 * for copying to or from private memory.
1816 * The statement name is read_private_<array> or write_private_<array>.
1818 * The schedule iterates over the array elements, so we can use
1819 * the domain of private_sched at the current scheduling position
1820 * as the index of the array.
1822 static void print_private_copy_statement(struct gpucode_info
*code
,
1823 struct clast_user_stmt
*u
)
1825 struct cuda_gen
*gen
= code
->user
;
1828 struct cuda_array_ref_group
*group
= gen
->private_group
;
1837 isl_qpolynomial
**qp
;
1840 read
= !strncmp(u
->statement
->name
, "read", 4);
1842 domain
= extract_host_domain(u
);
1845 sched
= isl_map_copy(gen
->private_sched
);
1846 sched
= isl_map_reverse(sched
);
1847 sched
= isl_map_intersect_domain(sched
, domain
);
1848 n_in
= isl_map_dim(sched
, isl_dim_in
);
1849 n_out
= isl_map_dim(sched
, isl_dim_out
);
1850 dim
= isl_map_get_dim(sched
);
1851 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, n_in
);
1852 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, n_out
);
1853 param
= parametrization(dim
, n_in
, 0, n_in
, "c");
1854 sched
= isl_map_align_params(sched
, isl_set_get_dim(param
));
1855 sched
= isl_map_intersect_domain(sched
, param
);
1856 index
= isl_map_range(sched
);
1857 domain
= isl_set_copy(index
);
1858 aff
= isl_set_affine_hull(index
);
1859 domain
= isl_set_project_out(domain
, isl_dim_set
, 0, n_out
);
1861 ctx
= isl_basic_set_get_ctx(aff
);
1862 qp
= isl_alloc_array(ctx
, isl_qpolynomial
*, n_out
);
1865 for (i
= 0; i
< n_out
; ++i
) {
1869 ok
= isl_basic_set_has_defining_equality(aff
,
1870 isl_dim_set
, i
, &c
);
1872 qp
[i
] = isl_qpolynomial_from_constraint(c
, isl_dim_set
, i
);
1873 qp
[i
] = isl_qpolynomial_drop_dims(qp
[i
], isl_dim_set
, 0, n_out
);
1876 print_indent(code
->dst
, code
->indent
);
1878 print_private_local_index(ctx
, code
->dst
, group
, qp
, domain
);
1879 fprintf(code
->dst
, " = ");
1880 print_private_global_index(ctx
, code
->dst
, group
->array
, qp
);
1882 print_private_global_index(ctx
, code
->dst
, group
->array
, qp
);
1883 fprintf(code
->dst
, " = ");
1884 print_private_local_index(ctx
, code
->dst
, group
, qp
, domain
);
1886 fprintf(code
->dst
, ";\n");
1888 for (i
= 0; i
< n_out
; ++i
)
1889 isl_qpolynomial_free(qp
[i
]);
1892 isl_basic_set_free(aff
);
1893 isl_set_free(domain
);
1896 static void print_private_access(struct cuda_gen
*gen
,
1897 __isl_keep isl_set
*shared_domain
, __isl_take isl_set
*access
,
1898 const char *type
, struct cuda_array_ref_group
*group
)
1900 const char *array_name
;
1903 unsigned nvar
= isl_set_dim(access
, isl_dim_set
);
1904 isl_union_map
*usched
;
1906 if (isl_set_fast_is_empty(access
)) {
1907 isl_set_free(access
);
1911 ctx
= isl_set_get_ctx(access
);
1912 array_name
= isl_set_get_tuple_name(access
);
1913 name
= isl_alloc_array(ctx
, char,
1914 strlen(type
) + sizeof("_private_") + strlen(array_name
) + 20);
1915 if (group
->array
->n_group
> 1)
1916 sprintf(name
, "%s_private_%s_%d", type
, array_name
, group
->nr
);
1918 sprintf(name
, "%s_private_%s", type
, array_name
);
1919 access
= isl_set_set_tuple_name(access
, name
);
1922 gen
->private_sched
= shift_access(access
, group
);
1923 gen
->private_group
= group
;
1925 usched
= isl_union_map_from_map(isl_map_copy(gen
->private_sched
));
1926 print_shared_body(gen
, shared_domain
, usched
, nvar
,
1927 &print_private_copy_statement
, 1);
1928 isl_union_map_free(usched
);
1930 isl_map_free(gen
->private_sched
);
1933 /* Print code for reading into or writing from private memory
1934 * the given array reference group.
1936 * sched maps the original iteration domains to the shared memory tile loops.
1938 static void print_group_private_accesses(struct cuda_gen
*gen
,
1939 struct cuda_array_ref_group
*group
,
1940 const char *type
, __isl_keep isl_set
*shared_domain
,
1941 unsigned first_shared
, int shared_len
, __isl_keep isl_union_map
*sched
)
1944 isl_union_map
*access
;
1945 isl_union_set
*uset
;
1946 isl_set
*access_set
;
1948 if (!group
->private_bound
)
1951 read
= !strcmp(type
, "read");
1953 access
= group_access_relation(group
, read
, !read
);
1954 access
= isl_union_map_apply_domain(access
, isl_union_map_copy(sched
));
1955 access
= isl_union_map_intersect(access
,
1956 isl_union_map_copy(gen
->private_access
));
1957 uset
= isl_union_map_range(access
);
1959 if (isl_union_set_is_empty(uset
)) {
1960 isl_union_set_free(uset
);
1964 access_set
= isl_set_from_union_set(uset
);
1965 access_set
= isl_set_coalesce(access_set
);
1966 access_set
= isl_set_eliminate(access_set
, isl_dim_param
,
1967 first_shared
+ shared_len
,
1968 gen
->shared_len
- shared_len
);
1970 print_private_access(gen
, shared_domain
, access_set
, type
, group
);
1973 /* Print code for reading into or writing from private memory at
1974 * the given level (-1 for innermost).
1976 * If we are not printing at the innermost level, then the dimensionality
1977 * of shared_domain may be smaller than gen->shared_len.
1978 * As the rest of the code assumes that the domain of access has
1979 * gen->shared_len dimensions, we therefore may need to embed this domain
1980 * in a higher dimensional space after intersection with shared_domain.
1982 * This code is very similar to print_shared_accesses.
1983 * The main difference is that we to take into account gen->private_access.
1985 static void print_private_accesses(struct cuda_gen
*gen
,
1986 __isl_keep isl_set
*shared_domain
, __isl_keep isl_union_map
*access
,
1987 const char *type
, int level
)
1992 int shared_len
= isl_set_dim(shared_domain
, isl_dim_set
);
1993 unsigned first_shared
;
1994 isl_union_map
*sched
;
1996 shared_domain
= isl_set_copy(shared_domain
);
1997 sched
= isl_union_map_copy(gen
->tiled_sched
);
1998 dim
= isl_union_map_get_dim(sched
);
1999 first_shared
= isl_dim_size(dim
, isl_dim_param
);
2000 proj
= projection(dim
, gen
->tiled_len
, shared_len
);
2001 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
2002 sched
= isl_union_map_intersect_range(sched
,
2003 isl_union_set_from_set(isl_set_copy(shared_domain
)));
2004 if (shared_len
!= gen
->shared_len
) {
2005 dim
= isl_union_map_get_dim(sched
);
2006 proj
= projection(dim
, gen
->shared_len
, shared_len
);
2007 proj
= isl_map_reverse(proj
);
2008 shared_domain
= isl_set_apply(shared_domain
,
2009 isl_map_copy(proj
));
2010 sched
= isl_union_map_apply_range(sched
,
2011 isl_union_map_from_map(proj
));
2014 for (i
= 0; i
< gen
->n_array
; ++i
) {
2015 struct cuda_array_info
*array
= &gen
->array
[i
];
2017 if (gen
->array
[i
].print_shared_level
!= level
)
2020 for (j
= 0; j
< array
->n_group
; ++j
)
2021 print_group_private_accesses(gen
, array
->groups
[j
],
2022 type
, shared_domain
,
2023 first_shared
, shared_len
, sched
);
2026 isl_union_map_free(sched
);
2027 isl_set_free(shared_domain
);
2030 /* Set unroll[j] if the input dimension j is involved in
2031 * the index expression represented by bmap.
2033 static int check_unroll(__isl_take isl_basic_map
*bmap
, void *user
)
2036 int n_in
= isl_basic_map_dim(bmap
, isl_dim_in
);
2037 int n_out
= isl_basic_map_dim(bmap
, isl_dim_out
);
2040 for (i
= 0; i
< n_out
; ++i
) {
2044 ok
= isl_basic_map_has_defining_equality(bmap
,
2045 isl_dim_out
, i
, &c
);
2047 for (j
= 0; j
< n_in
; ++j
)
2048 if (isl_constraint_involves_dims(c
, isl_dim_in
, j
, 1))
2050 isl_constraint_free(c
);
2053 isl_basic_map_free(bmap
);
2057 /* Given an array pos mapping input dimensions to the corresponding
2058 * output dimension, construct the corresponding map.
2060 static __isl_give isl_map
*permutation(__isl_take isl_dim
*dim
,
2065 isl_basic_map
*bmap
;
2067 dim
= isl_dim_add(dim
, isl_dim_in
, len
);
2068 dim
= isl_dim_add(dim
, isl_dim_out
, len
);
2069 bmap
= isl_basic_map_universe(isl_dim_copy(dim
));
2071 for (i
= 0; i
< len
; ++i
) {
2072 c
= isl_equality_alloc(isl_dim_copy(dim
));
2073 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, -1);
2074 isl_constraint_set_coefficient_si(c
, isl_dim_out
, pos
[i
], 1);
2075 bmap
= isl_basic_map_add_constraint(bmap
, c
);
2079 return isl_map_from_basic_map(bmap
);
2082 /* Find all loops involved in any of the index expressions for any of
2083 * the private accesses, move them innermost and then mark them as
2084 * requiring unrolling by setting gen->first_unroll.
2085 * The loops involved should all be parallel because of the checks
2086 * we performed in check_private_group_access. Moving them innermost
2087 * is therefore a valid transformation.
2089 static __isl_give isl_union_map
*interchange_for_unroll(struct cuda_gen
*gen
,
2090 __isl_take isl_union_map
*sched
)
2093 int unroll
[gen
->thread_tiled_len
];
2094 int perm
[gen
->thread_tiled_len
];
2097 int len
= gen
->shared_len
+ gen
->n_parallel
+ gen
->n_block
;
2099 gen
->first_unroll
= -1;
2101 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
)
2103 for (i
= 0; i
< gen
->n_array
; ++i
) {
2104 struct cuda_array_info
*array
= &gen
->array
[i
];
2106 for (j
= 0; j
< array
->n_group
; ++j
) {
2107 isl_union_map
*access
;
2110 if (!array
->groups
[j
]->private_bound
)
2113 access
= group_access_relation(array
->groups
[j
], 1, 1);
2114 access
= isl_union_map_apply_domain(access
,
2115 isl_union_map_copy(sched
));
2117 acc
= isl_map_from_union_map(access
);
2118 isl_map_foreach_basic_map(acc
, &check_unroll
, unroll
);
2124 for (i
= 0; i
< gen
->shared_len
; ++i
)
2128 for (i
= gen
->shared_len
; i
< len
; ++i
)
2135 for (i
= len
; i
< gen
->thread_tiled_len
; ++i
)
2140 for (i
= 0; i
< gen
->thread_tiled_len
; ++i
)
2143 gen
->first_unroll
= 1 + j
;
2144 for (i
= 0; i
< len
; ++i
)
2148 dim
= isl_union_map_get_dim(sched
);
2149 permute
= permutation(dim
, perm
, gen
->thread_tiled_len
);
2150 sched
= isl_union_map_apply_range(sched
,
2151 isl_union_map_from_map(permute
));
2156 /* This function is called for each leaf in the clast of the kernel code.
2157 * We first specialize the schedule to the site of the leaf and
2158 * print code for reading into shared memory, performing the actual
2159 * computations and writing from shared memory, with the required
2162 static void print_kernel_user(struct gpucode_info
*code
,
2163 struct clast_user_stmt
*u
)
2165 struct cuda_gen
*gen
= code
->user
;
2166 isl_set
*shared_domain
;
2168 shared_domain
= extract_entire_host_domain(u
);
2170 print_shared_accesses(gen
, shared_domain
, gen
->read
, "read", -1);
2172 print_private_accesses(gen
, shared_domain
, gen
->read
, "read", -1);
2174 print_shared_body(gen
, shared_domain
, gen
->local_sched
,
2175 gen
->thread_tiled_len
, &print_statement
,
2178 print_private_accesses(gen
, shared_domain
, gen
->write
, "write", -1);
2180 print_indent(gen
->cuda
.kernel_c
, gen
->kernel_code
.indent
);
2181 fprintf(gen
->cuda
.kernel_c
, "__syncthreads();\n");
2183 print_shared_accesses(gen
, shared_domain
, gen
->write
, "write", -1);
2185 isl_set_free(shared_domain
);
2188 /* Check if we need to perform any copying to shared memory at this level
2189 * and if so, print the copying instructions.
2190 * Any array for which we are allowed to print copying instructions at
2191 * this level, but haven't done so already, is printed.
2193 static void print_kernel_for_head(struct gpucode_info
*code
,
2194 struct clast_for
*f
)
2197 struct cuda_gen
*gen
= code
->user
;
2202 domain
= isl_set_from_cloog_domain(cloog_domain_copy(f
->domain
));
2203 level
= isl_set_dim(domain
, isl_dim_set
) - 1;
2205 for (i
= 0; i
< gen
->n_array
; ++i
) {
2206 if (gen
->array
[i
].print_shared_level
>= 0)
2208 if (gen
->array
[i
].last_shared
> level
)
2210 gen
->array
[i
].print_shared_level
= level
;
2215 print_shared_accesses(gen
, domain
, gen
->read
, "read", level
);
2216 print_private_accesses(gen
, domain
, gen
->read
, "read", level
);
2219 isl_set_free(domain
);
2222 /* Print instructions for copying from shared memory for each array
2223 * for which print_kernel_for_head has added copying instructions
2226 static void print_kernel_for_foot(struct gpucode_info
*code
,
2227 struct clast_for
*f
)
2230 struct cuda_gen
*gen
= code
->user
;
2235 domain
= isl_set_from_cloog_domain(cloog_domain_copy(f
->domain
));
2236 level
= isl_set_dim(domain
, isl_dim_set
) - 1;
2238 for (i
= 0; i
< gen
->n_array
; ++i
) {
2239 if (gen
->array
[i
].print_shared_level
!= level
)
2246 print_private_accesses(gen
, domain
, gen
->write
, "write", level
);
2247 print_shared_accesses(gen
, domain
, gen
->write
, "write", level
);
2250 isl_set_free(domain
);
2253 /* Use CLooG to generate code for the outer gen->shared_first loops
2254 * of the local schedule "sched".
2255 * The pretty printing of this code is handled by gpu_print_host_stmt,
2256 * which calls print_kernel_user for each iteration of the shared tile loops.
2258 static void print_cloog_kernel_body(struct cuda_gen
*gen
,
2259 __isl_keep isl_set
*context
, __isl_keep isl_union_map
*sched
)
2262 CloogOptions
*options
;
2263 CloogDomain
*cloog_context
;
2264 CloogUnionDomain
*ud
;
2266 struct clast_stmt
*stmt
;
2269 sched
= isl_union_map_copy(sched
);
2270 sched
= isl_union_map_align_params(sched
, isl_set_get_dim(context
));
2272 options
= cloog_options_malloc(gen
->state
);
2273 options
->language
= LANGUAGE_C
;
2274 options
->strides
= 1;
2276 options
->stop
= gen
->shared_len
;
2277 options
->f
= gen
->tiled_len
;
2278 options
->l
= gen
->tiled_len
;
2279 options
->save_domains
= 1;
2280 options
->noscalars
= 1;
2282 ud
= cloog_union_domain_from_isl_union_map(sched
);
2283 for (i
= 0; i
< gen
->shared_len
; ++i
) {
2284 snprintf(name
, sizeof(name
), "g%d", i
);
2285 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
2287 cloog_context
= cloog_domain_from_isl_set(isl_set_copy(context
));
2288 input
= cloog_input_alloc(cloog_context
, ud
);
2290 stmt
= cloog_clast_create_from_input(input
, options
);
2292 gen
->kernel_code
.indent
= 4;
2293 gen
->kernel_code
.dst
= gen
->cuda
.kernel_c
;
2294 gen
->kernel_code
.print_user_stmt
= NULL
;
2295 gen
->kernel_code
.print_user_stmt_list
= &print_kernel_user
;
2296 gen
->kernel_code
.print_for_head
= &print_kernel_for_head
;
2297 gen
->kernel_code
.print_for_foot
= &print_kernel_for_foot
;
2298 gen
->kernel_code
.user
= gen
;
2299 gpu_print_host_stmt(&gen
->kernel_code
, stmt
);
2301 cloog_clast_free(stmt
);
2302 cloog_options_free(options
);
2305 static void print_kernel_iterators(struct cuda_gen
*gen
)
2308 const char *block_dims
[] = { "blockIdx.x", "blockIdx.y" };
2309 const char *thread_dims
[] = { "threadIdx.x", "threadIdx.y",
2312 if (gen
->n_grid
> 0) {
2313 print_indent(gen
->cuda
.kernel_c
, 4);
2314 fprintf(gen
->cuda
.kernel_c
, "int ");
2315 for (i
= 0; i
< gen
->n_grid
; ++i
) {
2317 fprintf(gen
->cuda
.kernel_c
, ", ");
2318 fprintf(gen
->cuda
.kernel_c
, "b%d = %s",
2319 i
, block_dims
[gen
->n_grid
- 1 - i
]);
2321 fprintf(gen
->cuda
.kernel_c
, ";\n");
2324 if (gen
->n_block
> 0) {
2325 print_indent(gen
->cuda
.kernel_c
, 4);
2326 fprintf(gen
->cuda
.kernel_c
, "int ");
2327 for (i
= 0; i
< gen
->n_block
; ++i
) {
2329 fprintf(gen
->cuda
.kernel_c
, ", ");
2330 fprintf(gen
->cuda
.kernel_c
, "t%d = %s",
2331 i
, thread_dims
[gen
->n_block
- 1 - i
]);
2333 fprintf(gen
->cuda
.kernel_c
, ";\n");
2337 static void print_group_shared_array(struct cuda_gen
*gen
,
2338 struct cuda_array_ref_group
*group
)
2341 struct cuda_array_bound
*bounds
;
2343 bounds
= group
->private_bound
;
2345 bounds
= group
->shared_bound
;
2349 print_indent(gen
->cuda
.kernel_c
, 4);
2350 fprintf(gen
->cuda
.kernel_c
, "%s%s ",
2351 group
->private_bound
? "" : "__shared__ ", gen
->options
->type
);
2352 print_array_name(gen
->cuda
.kernel_c
, group
);
2353 for (j
= 0; j
< group
->array
->n_index
; ++j
) {
2354 fprintf(gen
->cuda
.kernel_c
, "[");
2355 isl_int_print(gen
->cuda
.kernel_c
, bounds
[j
].size
, 0);
2356 fprintf(gen
->cuda
.kernel_c
, "]");
2358 fprintf(gen
->cuda
.kernel_c
, ";\n");
2361 static void print_shared_arrays(struct cuda_gen
*gen
)
2365 for (i
= 0; i
< gen
->n_array
; ++i
) {
2366 struct cuda_array_info
*array
= &gen
->array
[i
];
2368 for (j
= 0; j
< array
->n_group
; ++j
)
2369 print_group_shared_array(gen
, array
->groups
[j
]);
2373 static void print_kernel_body(struct cuda_gen
*gen
,
2374 __isl_keep isl_set
*host_domain
, __isl_keep isl_union_map
*sched
)
2378 context
= isl_set_copy(host_domain
);
2379 context
= parametrize(context
, 0, gen
->tile_first
, "h");
2380 context
= isl_set_project_out(context
, isl_dim_set
, 0, gen
->tile_first
);
2381 context
= add_bounded_parameters(context
,
2382 gen
->n_grid
, gen
->grid_dim
, "b");
2384 print_kernel_iterators(gen
);
2385 print_shared_arrays(gen
);
2387 fprintf(gen
->cuda
.kernel_c
, "\n");
2389 print_cloog_kernel_body(gen
, context
, sched
);
2391 isl_set_free(context
);
2394 /* Given a constraint
2396 * a(p,i) + j = g f(e)
2398 * or -a(p,i) - j = g f(e) if sign < 0,
2399 * store a(p,i) in bound->shift and g (stride) in bound->stride.
2400 * a(p,i) is assumed to be an expression in only the parameters.
2402 static void extract_stride(__isl_keep isl_constraint
*c
,
2403 struct cuda_array_bound
*bound
, isl_int stride
, int sign
)
2410 isl_qpolynomial
*qp
;
2412 isl_int_set(bound
->stride
, stride
);
2414 dim
= isl_constraint_get_dim(c
);
2415 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, 1);
2416 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
2417 dim
= isl_dim_domain(dim
);
2419 nparam
= isl_dim_size(dim
, isl_dim_param
);
2423 isl_int_set_si(one
, 1);
2425 isl_constraint_get_constant(c
, &v
);
2428 qp
= isl_qpolynomial_rat_cst(isl_dim_copy(dim
), v
, one
);
2430 for (i
= 0; i
< nparam
; ++i
) {
2431 isl_qpolynomial
*t
, *p
;
2433 isl_constraint_get_coefficient(c
, isl_dim_param
, i
, &v
);
2434 if (isl_int_is_zero(v
))
2438 t
= isl_qpolynomial_rat_cst(isl_dim_copy(dim
), v
, one
);
2439 p
= isl_qpolynomial_var(isl_dim_copy(dim
), isl_dim_param
, i
);
2440 t
= isl_qpolynomial_mul(t
, p
);
2441 qp
= isl_qpolynomial_add(qp
, t
);
2451 /* Given an equality constraint of a map with a single output dimension j,
2452 * check if the constraint is of the form
2454 * a(p,i) + j = g f(e)
2456 * with a(p,i) an expression in the parameters and input dimensions
2457 * and f(e) an expression in the existentially quantified variables.
2458 * If so, and if g is larger than any such g from a previously considered
2459 * constraint, then call extract_stride. to record the stride information
2462 static int check_stride_constraint(__isl_take isl_constraint
*c
, void *user
)
2467 struct cuda_array_bound
*bound
= user
;
2470 isl_int_init(stride
);
2472 n_div
= isl_constraint_dim(c
, isl_dim_div
);
2473 isl_constraint_get_coefficient(c
, isl_dim_out
, 0, &v
);
2475 if (n_div
&& (isl_int_is_one(v
) || isl_int_is_negone(v
))) {
2476 int s
= isl_int_sgn(v
);
2477 isl_int_set_si(stride
, 0);
2478 for (i
= 0; i
< n_div
; ++i
) {
2479 isl_constraint_get_coefficient(c
, isl_dim_div
, i
, &v
);
2480 isl_int_gcd(stride
, stride
, v
);
2482 if (!isl_int_is_zero(stride
) &&
2483 isl_int_gt(stride
, bound
->stride
))
2484 extract_stride(c
, bound
, stride
, s
);
2487 isl_int_clear(stride
);
2490 isl_constraint_free(c
);
2494 /* Given contraints on an array index i, check if we can find
2495 * a shift a(p) and a stride g such that
2497 * a(p) + i = 0 mod g
2499 * If so, record the information in bound and apply the mapping
2500 * i -> (i + a(p))/g to the array index in bounds and return
2501 * the new constraints.
2502 * If not, simply return the original constraints.
2504 static __isl_give isl_basic_map
*check_stride(struct cuda_gen
*gen
,
2505 struct cuda_array_bound
*bound
, __isl_take isl_basic_map
*bounds
)
2509 isl_basic_map
*shift
;
2510 isl_qpolynomial
*qp
, *t
;
2513 isl_int_set_si(bound
->stride
, -1);
2515 aff
= isl_basic_map_affine_hull(isl_basic_map_copy(bounds
));
2517 isl_basic_map_foreach_constraint(aff
, &check_stride_constraint
, bound
);
2519 isl_basic_map_free(aff
);
2521 if (isl_int_is_neg(bound
->stride
))
2524 qp
= isl_qpolynomial_copy(bound
->shift
);
2525 qp
= isl_qpolynomial_add_dims(qp
, isl_dim_set
, 1);
2526 dim
= isl_qpolynomial_get_dim(qp
);
2527 t
= isl_qpolynomial_var(isl_dim_copy(dim
), isl_dim_set
, 0);
2528 qp
= isl_qpolynomial_add(qp
, t
);
2530 isl_int_set_si(one
, 1);
2531 t
= isl_qpolynomial_rat_cst(dim
, one
, bound
->stride
);
2533 qp
= isl_qpolynomial_mul(qp
, t
);
2534 shift
= isl_basic_map_from_qpolynomial(qp
);
2536 bound
->shift_map
= isl_basic_map_copy(shift
);
2537 bounds
= isl_basic_map_apply_range(bounds
, shift
);
2542 struct cuda_size_info
{
2543 isl_basic_set
*bset
;
2544 struct cuda_array_bound
*bound
;
2548 /* Given a constraint from the basic set describing the bounds on
2549 * an array index, check if it is a lower bound, say m i >= b(x), and,
2550 * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2551 * upper bound. If so, and if this bound is smaller than any bound
2552 * derived from earlier constraints, set the size to this bound on
2553 * the expression and the lower bound to ceil(b(x)/m).
2555 static int compute_size_in_direction(__isl_take isl_constraint
*c
, void *user
)
2557 struct cuda_size_info
*size
= user
;
2562 nparam
= isl_basic_set_dim(size
->bset
, isl_dim_param
);
2563 n_div
= isl_constraint_dim(c
, isl_dim_div
);
2565 if (isl_constraint_involves_dims(c
, isl_dim_div
, 0, n_div
)) {
2566 isl_constraint_free(c
);
2572 isl_constraint_get_coefficient(c
, isl_dim_set
, size
->pos
, &v
);
2574 if (isl_int_is_pos(v
)) {
2577 enum isl_lp_result res
;
2579 aff
= isl_constraint_get_bound(c
, isl_dim_set
, size
->pos
);
2580 aff
= isl_aff_ceil(aff
);
2582 lb
= isl_aff_copy(aff
);
2584 aff
= isl_aff_neg(aff
);
2585 aff
= isl_aff_add_coefficient_si(aff
, isl_dim_set
, size
->pos
, 1);
2587 res
= isl_basic_set_max(size
->bset
, aff
, &v
);
2590 if (res
== isl_lp_ok
) {
2591 isl_int_add_ui(v
, v
, 1);
2592 if (isl_int_is_neg(size
->bound
->size
) ||
2593 isl_int_lt(v
, size
->bound
->size
)) {
2594 isl_int_set(size
->bound
->size
, v
);
2595 lb
= isl_aff_drop_dims(lb
, isl_dim_set
,
2597 isl_aff_free(size
->bound
->lb
);
2598 size
->bound
->lb
= isl_aff_copy(lb
);
2605 isl_constraint_free(c
);
2610 /* Given a basic map "bounds" that maps parameters and input dimensions
2611 * to a single output dimension, look for an expression in the parameters
2612 * and input dimensions such that the range of the output dimension shifted
2613 * by this expression is a constant.
2615 * In particular, we currently only consider lower bounds on the output
2616 * dimension as candidate expressions.
2618 static int compute_array_dim_size(struct cuda_gen
*gen
,
2619 struct cuda_array_bound
*bound
, __isl_take isl_basic_map
*bounds
)
2621 struct cuda_size_info size
;
2623 bounds
= check_stride(gen
, bound
, bounds
);
2625 isl_int_set_si(bound
->size
, -1);
2629 size
.pos
= isl_basic_map_dim(bounds
, isl_dim_in
);
2630 size
.bset
= isl_basic_map_wrap(bounds
);
2631 size
.bset
= isl_basic_set_flatten(size
.bset
);
2632 isl_basic_set_foreach_constraint(size
.bset
, &compute_size_in_direction
,
2634 isl_basic_set_free(size
.bset
);
2636 return isl_int_is_nonneg(bound
->size
) ? 0 : -1;
2639 /* Check if we can find a shared memory tile for the given array
2640 * based on the given accesses, and if so, put the results
2641 * in array->shared_bound.
2643 * We project the accesses on each index in turn and look for a parametric
2644 * offset such that the size is constant.
2646 static int can_tile_for_shared_memory(struct cuda_gen
*gen
,
2647 struct cuda_array_info
*array
, __isl_keep isl_map
*access
,
2648 struct cuda_array_bound
*bounds
)
2652 for (i
= 0; i
< array
->n_index
; ++i
) {
2654 isl_basic_map
*hull
;
2656 access_i
= isl_map_copy(access
);
2657 access_i
= isl_map_project_out(access_i
, isl_dim_out
, 0, i
);
2658 access_i
= isl_map_project_out(access_i
, isl_dim_out
,
2659 1, array
->n_index
- (i
+ 1));
2660 access_i
= isl_map_compute_divs(access_i
);
2661 hull
= isl_map_simple_hull(access_i
);
2662 if (compute_array_dim_size(gen
, &bounds
[i
], hull
) < 0)
2669 /* Construct a map with input the shared tile loops and the loops that
2670 * will be wrapped around the threads that relates these later loops
2671 * to the thread indices and the projects them out.
2673 static __isl_give isl_map
*compute_privatization(struct cuda_gen
*gen
)
2681 dim
= isl_union_map_get_dim(gen
->shared_sched
);
2683 if (gen
->options
->wrap
)
2684 tiling
= wrap(isl_dim_copy(dim
), gen
->shared_len
+ gen
->n_block
,
2685 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
2687 tiling
= tile(isl_dim_copy(dim
), gen
->shared_len
+ gen
->n_block
,
2688 gen
->shared_len
, gen
->n_block
, gen
->block_dim
);
2692 par
= parametrization(dim
, gen
->shared_len
+ 2 * gen
->n_block
,
2693 gen
->tile_first
+ gen
->tile_len
+ gen
->n_grid
+ gen
->n_block
,
2696 priv
= isl_map_align_params(priv
, isl_set_get_dim(par
));
2697 priv
= isl_map_intersect_range(priv
, par
);
2699 dim
= isl_map_get_dim(priv
);
2700 dim
= isl_dim_drop(dim
, isl_dim_in
, 0, isl_dim_size(dim
, isl_dim_in
));
2701 dim
= isl_dim_drop(dim
, isl_dim_out
, 0, isl_dim_size(dim
, isl_dim_out
));
2702 proj
= projection(dim
, gen
->shared_len
+ 2 * gen
->n_block
,
2705 priv
= isl_map_apply_range(priv
, proj
);
2710 /* Construct a map from domain_dim to domain_dim that increments
2711 * the dimension at position "pos" and leaves all other dimensions
2714 static __isl_give isl_map
*next(__isl_take isl_dim
*domain_dim
, int pos
)
2717 int len
= isl_dim_size(domain_dim
, isl_dim_set
);
2719 isl_basic_map
*next
;
2721 dim
= isl_dim_map_from_set(domain_dim
);
2722 next
= isl_basic_map_universe(isl_dim_copy(dim
));
2724 for (i
= 0; i
< len
; ++i
) {
2727 c
= isl_equality_alloc(isl_dim_copy(dim
));
2728 isl_constraint_set_coefficient_si(c
, isl_dim_in
, i
, 1);
2729 isl_constraint_set_coefficient_si(c
, isl_dim_out
, i
, -1);
2731 isl_constraint_set_constant_si(c
, 1);
2732 next
= isl_basic_map_add_constraint(next
, c
);
2737 return isl_map_from_basic_map(next
);
2740 /* Check if the given access is coalesced.
2741 * That is, check whether incrementing the dimension that will get
2742 * wrapped over the last thread index results in incrementing
2743 * the last array index.
2745 * This function is only called for access relations without reuse.
2747 static int access_is_coalesced(struct cuda_gen
*gen
,
2748 __isl_keep isl_union_map
*access
)
2751 isl_map
*access_map
;
2752 isl_map
*next_thread_x
;
2753 isl_map
*next_element
;
2757 access
= isl_union_map_copy(access
);
2758 access
= isl_union_map_apply_domain(access
,
2759 isl_union_map_copy(gen
->tiled_sched
));
2760 access_map
= isl_map_from_union_map(access
);
2762 dim
= isl_map_get_dim(access_map
);
2763 dim
= isl_dim_domain(dim
);
2764 next_thread_x
= next(dim
, gen
->shared_len
+ gen
->n_block
- 1);
2766 dim
= isl_map_get_dim(access_map
);
2767 dim
= isl_dim_range(dim
);
2768 next_element
= next(dim
, isl_dim_size(dim
, isl_dim_set
) - 1);
2770 map
= isl_map_apply_domain(next_thread_x
, isl_map_copy(access_map
));
2771 map
= isl_map_apply_range(map
, access_map
);
2773 coalesced
= isl_map_is_subset(map
, next_element
);
2775 isl_map_free(next_element
);
2781 /* For the given array reference group, check whether the access is private
2782 * to the thread. That is, check that any given array element
2783 * is only accessed by a single thread.
2784 * We compute an access relation that maps the shared tile loop iterators
2785 * and the shared point loop iterators that will be wrapped over the
2786 * threads to the array elements.
2787 * We actually check that those iterators that will be wrapped
2788 * partition the array space. This check is stricter than necessary
2789 * since several iterations may be mapped onto the same thread
2790 * and then they could be allowed to access the same memory elements,
2791 * but our check does not allow this situation.
2793 * We also check that the index expression only depends on parallel
2794 * loops. That way, we can move those loops innermost and unroll them.
2795 * Again, we use a test that is stricter than necessary.
2796 * We actually check whether the index expression only depends
2797 * on the iterators that are wrapped over the threads.
2798 * These are necessarily parallel, but there may be more parallel loops.
2800 * Combining the injectivity of the first test with the single-valuedness
2801 * of the second test, we simply test for bijectivity.
2803 * If it turns out we can use registers, we compute the private memory
2804 * tile size using can_tile_for_shared_memory, after introducing a dependence
2805 * on the thread indices.
2807 * Before performing any of the above computations, we first check
2808 * if there is any reuse on the reference group. If not, we simply
2809 * return. If, moreover, the access is coalesced then we also remove
2810 * the shared memory tiling since we should just use global memory instead.
2812 static void check_private_group_access(struct cuda_gen
*gen
,
2813 struct cuda_array_ref_group
*group
)
2816 isl_union_map
*access
;
2817 int n_index
= group
->array
->n_index
;
2819 access
= group_access_relation(group
, 1, 1);
2820 if (isl_union_map_is_injective(access
)) {
2821 if (group
->shared_bound
&& access_is_coalesced(gen
, access
)) {
2822 free_bound_list(group
->shared_bound
, n_index
);
2823 group
->shared_bound
= NULL
;
2825 isl_union_map_free(access
);
2828 access
= isl_union_map_apply_domain(access
,
2829 isl_union_map_copy(gen
->shared_sched
));
2831 acc
= isl_map_from_union_map(access
);
2833 if (!isl_map_is_bijective(acc
)) {
2838 group
->private_bound
= create_bound_list(gen
->ctx
, n_index
);
2839 acc
= isl_map_align_params(acc
, isl_map_get_dim(gen
->privatization
));
2840 acc
= isl_map_apply_domain(acc
, isl_map_copy(gen
->privatization
));
2841 if (!can_tile_for_shared_memory(gen
, group
->array
, acc
,
2842 group
->private_bound
)) {
2843 free_bound_list(group
->private_bound
, n_index
);
2844 group
->private_bound
= NULL
;
2850 /* Look for the last shared tile loop that affects the offset of the
2851 * shared or private tile and store the result in array->last_shared.
2853 static void set_last_shared(struct cuda_gen
*gen
,
2854 struct cuda_array_ref_group
*group
)
2857 struct cuda_array_bound
*bounds
;
2858 unsigned first_shared
= gen
->first_shared
;
2859 int n_index
= group
->array
->n_index
;
2861 bounds
= group
->private_bound
;
2863 bounds
= group
->shared_bound
;
2867 for (j
= gen
->shared_len
- 1; j
>= 0; --j
) {
2868 for (i
= 0; i
< n_index
; ++i
) {
2870 isl_qpolynomial
*shift
;
2873 if (isl_aff_involves_dims(lb
, isl_dim_param
,
2874 first_shared
+ j
, 1))
2877 shift
= bounds
[i
].shift
;
2880 if (isl_qpolynomial_involves_dims(shift
, isl_dim_param
,
2881 first_shared
+ j
, 1))
2887 group
->array
->last_shared
= j
;
2890 /* Compute the sizes of all private arrays for the current kernel,
2891 * as well as the offsets of the private pieces in the original arrays.
2892 * If we cannot or don't want to privatize a given array group,
2893 * we use the shared memory tile sizes computed in
2894 * compute_group_shared_bound instead.
2896 * If a given Array only has a single reference group and if we have
2897 * been able to find a privated or shared tile,
2898 * we also look for the last shared tile loop that affects the offset
2899 * (and therefore the array tile) and store the result in array->last_shared.
2901 * A privatized copy of all access relations from reference groups that
2902 * are mapped to private memory is stored in gen->privatization.
2904 static void compute_private_size(struct cuda_gen
*gen
)
2907 isl_union_map
*private;
2909 private = isl_union_map_empty(isl_union_map_get_dim(gen
->shared_sched
));
2911 for (i
= 0; i
< gen
->n_array
; ++i
) {
2912 struct cuda_array_info
*array
= &gen
->array
[i
];
2914 for (j
= 0; j
< array
->n_group
; ++j
) {
2915 check_private_group_access(gen
, array
->groups
[j
]);
2917 if (!array
->groups
[j
]->private_bound
)
2920 private = isl_union_map_union(private,
2921 group_access_relation(array
->groups
[j
], 1, 1));
2924 array
->last_shared
= gen
->shared_len
- 1;
2925 array
->print_shared_level
= -1;
2927 if (array
->n_group
!= 1)
2929 set_last_shared(gen
, array
->groups
[0]);
2932 if (isl_union_map_is_empty(private))
2933 isl_union_map_free(private);
2935 isl_union_map
*priv
;
2937 private = isl_union_map_apply_domain(private,
2938 isl_union_map_copy(gen
->shared_sched
));
2939 priv
= isl_union_map_from_map(isl_map_copy(gen
->privatization
));
2940 private = isl_union_map_apply_domain(private, priv
);
2941 gen
->private_access
= private;
2945 /* Fill up the groups array with singleton groups, i.e., one group
2946 * per reference, initializing the array, access, write and refs fields.
2947 * In particular the access field is initialized to the scheduled
2948 * access relation of the array reference.
2950 * Return the number of elements initialized, i.e., the number of
2951 * active references in the current kernel.
2953 static int populate_array_references(struct cuda_gen
*gen
,
2954 struct cuda_array_info
*array
, __isl_keep isl_union_map
*sched
,
2955 struct cuda_array_ref_group
**groups
)
2959 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
2962 for (i
= 0; i
< array
->n_ref
; ++i
) {
2963 isl_union_map
*umap
;
2965 struct cuda_array_ref_group
*group
;
2966 struct cuda_stmt_access
*access
= array
->refs
[i
];
2968 map
= isl_map_copy(access
->access
);
2969 umap
= isl_union_map_from_map(map
);
2970 umap
= isl_union_map_apply_domain(umap
,
2971 isl_union_map_copy(sched
));
2973 if (isl_union_map_is_empty(umap
)) {
2974 isl_union_map_free(umap
);
2978 map
= isl_map_from_union_map(umap
);
2980 group
= isl_calloc_type(ctx
, struct cuda_array_ref_group
);
2982 group
->array
= array
;
2983 group
->access
= map
;
2984 group
->write
= access
->write
;
2985 group
->refs
= &array
->refs
[i
];
2987 groups
[n
++] = group
;
2993 static void free_array_ref_group(struct cuda_array_ref_group
*group
,
2998 free_bound_list(group
->shared_bound
, n_index
);
2999 free_bound_list(group
->private_bound
, n_index
);
3000 isl_map_free(group
->access
);
3005 /* If two groups have overlapping access relations and if one of them
3006 * involves a write, then merge the two groups into one.
3008 * We keep track of the grouping in "leader". leader[j] points to
3009 * an earlier group array element that belongs to the same group,
3010 * or the array element j itself if this element is the first in the group.
3012 * Return the number of group leaders.
3014 static int group_overlapping_writes(int n
,
3015 struct cuda_array_ref_group
**groups
, int *leader
)
3020 for (i
= 0; i
< n
; ++i
) {
3022 groups
[l
]->n_ref
= 1;
3023 for (j
= i
- 1; j
>= 0; --j
) {
3029 if (!groups
[l
]->write
&& !groups
[j
]->write
)
3032 map
= isl_map_intersect(isl_map_copy(groups
[l
]->access
),
3033 isl_map_copy(groups
[j
]->access
));
3034 empty
= isl_map_is_empty(map
);
3040 groups
[j
]->access
= isl_map_union(groups
[j
]->access
,
3042 groups
[j
]->write
= 1;
3043 groups
[l
]->access
= NULL
;
3044 groups
[j
]->n_ref
+= groups
[l
]->n_ref
;
3054 /* Compute the size of the shared array corresponding to the given array
3055 * array refrence group, based on the accesses from the current kernel,
3056 * as well as the offset of the shared piece in the original array.
3058 static void compute_group_shared_bound(struct cuda_gen
*gen
,
3059 struct cuda_array_info
*array
, struct cuda_array_ref_group
*group
)
3061 isl_ctx
*ctx
= isl_dim_get_ctx(array
->dim
);
3063 group
->shared_bound
= create_bound_list(ctx
, array
->n_index
);
3064 if (!can_tile_for_shared_memory(gen
, array
, group
->access
,
3065 group
->shared_bound
)) {
3066 free_bound_list(group
->shared_bound
, array
->n_index
);
3067 group
->shared_bound
= NULL
;
3071 /* Given an initial grouping of array references and shared memory tiles
3072 * for each group that allows for a shared memory tile, merge two groups
3073 * if both have a shared memory tile and if the merged group also has
3074 * a shared memory tile.
3076 * Return the number of group leaders after merging.
3078 static int group_common_shared_memory_tile(struct cuda_gen
*gen
,
3079 struct cuda_array_info
*array
, int n
,
3080 struct cuda_array_ref_group
**groups
, int *leader
, int n_group
)
3083 isl_ctx
*ctx
= isl_dim_get_ctx(array
->dim
);
3085 for (i
= 0; n_group
> 1 && i
< n
; ++i
) {
3089 if (!groups
[i
]->shared_bound
)
3091 for (j
= i
- 1; j
>= 0; --j
) {
3094 struct cuda_array_bound
*shared_bound
;
3098 if (!groups
[j
]->shared_bound
)
3101 map
= isl_map_intersect(isl_map_copy(groups
[l
]->access
),
3102 isl_map_copy(groups
[j
]->access
));
3103 empty
= isl_map_is_empty(map
);
3109 map
= isl_map_union(isl_map_copy(groups
[l
]->access
),
3110 isl_map_copy(groups
[j
]->access
));
3111 shared_bound
= create_bound_list(ctx
, array
->n_index
);
3112 if (!can_tile_for_shared_memory(gen
, array
, map
,
3115 free_bound_list(shared_bound
, array
->n_index
);
3119 free_bound_list(groups
[j
]->shared_bound
,
3121 groups
[j
]->shared_bound
= shared_bound
;
3122 isl_map_free(groups
[j
]->access
);
3123 groups
[j
]->access
= map
;
3124 groups
[j
]->n_ref
+= groups
[l
]->n_ref
;
3133 /* Extract an array of array reference groups from the array of references
3134 * and the grouping information in "leader".
3136 * Store the results in array->n_group and array->groups.
3138 static void extract_array_groups(isl_ctx
*ctx
, struct cuda_array_info
*array
,
3139 int n
, struct cuda_array_ref_group
**groups
, int *leader
, int n_group
)
3143 for (i
= 2; i
< n
; ++i
)
3144 leader
[i
] = leader
[leader
[i
]];
3146 array
->n_group
= n_group
;
3147 array
->groups
= isl_alloc_array(ctx
, struct cuda_array_ref_group
*,
3149 assert(array
->groups
);
3152 for (i
= 0; i
< n
; ++i
) {
3154 struct cuda_stmt_access
**refs
;
3156 if (leader
[i
] != i
) {
3157 groups
[i
]->refs
= NULL
;
3158 free_array_ref_group(groups
[i
], array
->n_index
);
3162 refs
= isl_alloc_array(ctx
, struct cuda_stmt_access
*,
3166 for (k
= i
; k
< n
; ++k
)
3167 if (leader
[k
] == i
) {
3168 refs
[l
++] = *groups
[k
]->refs
;
3169 (*groups
[k
]->refs
)->group
= j
;
3172 groups
[i
]->refs
= refs
;
3174 array
->groups
[j
++] = groups
[i
];
3178 /* Group array references that should be considered together when
3179 * deciding whether to access them from private, shared or global memory.
3181 * In particular, if two array references overlap and if one of them
3182 * is a write, then the two references are grouped together.
3183 * Furthermore, if two groups admit a shared memory tile and if the
3184 * combination of the two also admits a shared memory tile, we merge
3187 * During the construction the group->refs field points to a single
3188 * array reference inside the array of array references, while
3189 * group->n_ref contains the number of element in leader that
3190 * (directly or indirectly) point to this group, provided the group
3193 static void group_array_references(struct cuda_gen
*gen
,
3194 struct cuda_array_info
*array
, __isl_keep isl_union_map
*sched
)
3198 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
3199 struct cuda_array_ref_group
**groups
;
3202 groups
= isl_calloc_array(ctx
, struct cuda_array_ref_group
*,
3206 n
= populate_array_references(gen
, array
, sched
, groups
);
3208 leader
= isl_alloc_array(ctx
, int, n
);
3211 n_group
= group_overlapping_writes(n
, groups
, leader
);
3213 for (i
= 0; i
< n
; ++i
)
3215 compute_group_shared_bound(gen
, array
, groups
[i
]);
3217 n_group
= group_common_shared_memory_tile(gen
, array
, n
, groups
,
3220 extract_array_groups(ctx
, array
, n
, groups
, leader
, n_group
);
3226 /* Take tiled_sched, project it onto the shared tile loops and
3227 * the loops that will be wrapped over the threads,
3228 * parametrize the shared tile loops and store the result in gen->shared_sched.
3229 * The position of the first of these parameters is stored in gen->first_shared.
3230 * Also compute a projection that projects out the loops that will be
3231 * wrapped over the threads and store this projection in gen->shared_proj.
3233 static void compute_shared_sched(struct cuda_gen
*gen
)
3238 isl_union_map
*sched
;
3240 sched
= isl_union_map_copy(gen
->tiled_sched
);
3242 dim
= isl_union_map_get_dim(sched
);
3243 gen
->first_shared
= isl_dim_size(dim
, isl_dim_param
);
3244 proj
= projection(dim
, gen
->tiled_len
, gen
->shared_len
+ gen
->n_block
);
3245 sched
= isl_union_map_apply_range(sched
, isl_union_map_from_map(proj
));
3247 dim
= isl_union_map_get_dim(sched
);
3248 par
= parametrization(dim
, gen
->shared_len
+ gen
->n_block
,
3249 0, gen
->shared_len
, "g");
3250 sched
= isl_union_map_intersect_range(sched
,
3251 isl_union_set_from_set(par
));
3253 dim
= isl_union_map_get_dim(sched
);
3254 proj
= projection(dim
, gen
->shared_len
+ gen
->n_block
, gen
->shared_len
);
3256 gen
->shared_sched
= sched
;
3257 gen
->shared_proj
= isl_union_map_from_map(proj
);
3260 /* Group references of all arrays in the program.
3262 static void group_references(struct cuda_gen
*gen
)
3265 isl_union_map
*sched
;
3267 sched
= isl_union_map_apply_range(isl_union_map_copy(gen
->shared_sched
),
3268 isl_union_map_copy(gen
->shared_proj
));
3270 for (i
= 0; i
< gen
->n_array
; ++i
)
3271 group_array_references(gen
, &gen
->array
[i
], sched
);
3273 isl_union_map_free(sched
);
3276 /* Free all array information that is local to the current kernel.
3278 static void free_local_array_info(struct cuda_gen
*gen
)
3282 for (i
= 0; i
< gen
->n_array
; ++i
) {
3283 struct cuda_array_info
*array
= &gen
->array
[i
];
3285 for (j
= 0; j
< array
->n_group
; ++j
)
3286 free_array_ref_group(array
->groups
[j
], array
->n_index
);
3287 free(array
->groups
);
3289 if (array
->n_group
== 0)
3291 for (j
= 0; j
< gen
->array
[i
].n_index
; ++j
) {
3292 isl_pw_aff_free(gen
->array
[i
].local_bound
[j
]);
3293 gen
->array
[i
].local_bound
[j
] = NULL
;
/* Print a parenthesized, comma separated list of iterators
 * prefix0, prefix1, ..., wrapping each iterator in its own
 * parentheses if "parens" is set.
 * NOTE(review): the fourth parameter name and the separator/outer
 * parentheses were dropped by the extraction and have been
 * reconstructed — confirm against upstream.
 */
static void print_iterator_list(FILE *out, int len, const char *prefix,
	int parens)
{
	int i;

	fprintf(out, "(");
	for (i = 0; i < len; ++i) {
		if (i)
			fprintf(out, ", ");
		if (parens)
			fprintf(out, "(%s%d)", prefix, i);
		else
			fprintf(out, "%s%d", prefix, i);
	}
	fprintf(out, ")");
}
3315 /* Print an access to the element in the global memory copy of the
3316 * given array that corresponds to element [a0][a1]... of the original array.
3317 * The copy in global memory has been linearized, so we need to take
3318 * the array size into account.
3320 static void print_global_index(isl_ctx
*ctx
, FILE *out
,
3321 struct cuda_array_info
*array
)
3326 fprintf(out
, "%s[", array
->name
);
3327 for (i
= 0; i
+ 1 < array
->n_index
; ++i
)
3329 for (i
= 0; i
< array
->n_index
; ++i
) {
3331 prn
= isl_printer_to_file(ctx
, out
);
3332 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3333 prn
= isl_printer_print_str(prn
, ") * (");
3334 prn
= isl_printer_print_pw_aff(prn
,
3335 array
->local_bound
[i
]);
3336 prn
= isl_printer_print_str(prn
, ") + ");
3337 isl_printer_free(prn
);
3339 fprintf(out
, "a%d", i
);
3344 /* Print an access to the element in the shared memory copy of the
3345 * given array that corresponds to element [a0][a1]... of the original array.
3346 * Since the array in shared memory is just a shifted copy of part
3347 * of the original array, we simply need to subtract the lower bound,
3348 * which was computed in can_tile_for_shared_memory.
3349 * If any of the indices is strided, then we first add
3350 * shared_bound[i].shift and divide by shared_bound[i].stride.
3352 static void print_local_index(FILE *out
, struct cuda_array_ref_group
*group
)
3357 struct cuda_array_bound
*bounds
= group
->shared_bound
;
3359 ctx
= isl_dim_get_ctx(group
->array
->dim
);
3360 print_array_name(out
, group
);
3361 for (i
= 0; i
< group
->array
->n_index
; ++i
) {
3362 fprintf(out
, "[(a%d", i
);
3363 if (bounds
[i
].shift
) {
3364 fprintf(out
, " + (");
3365 prn
= isl_printer_to_file(ctx
, out
);
3366 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3367 prn
= isl_printer_print_qpolynomial(prn
,
3369 prn
= isl_printer_print_str(prn
, "))/");
3370 prn
= isl_printer_print_isl_int(prn
,
3372 isl_printer_free(prn
);
3375 fprintf(out
, " - (");
3376 prn
= isl_printer_to_file(ctx
, out
);
3377 prn
= isl_printer_set_output_format(prn
, ISL_FORMAT_C
);
3378 prn
= isl_printer_print_aff(prn
, bounds
[i
].lb
);
3379 isl_printer_free(prn
);
3384 /* Print '#define's for copying data from global memory to shared
3385 * memory and back for the given array.
3387 static void print_array_copy_defines(struct cuda_gen
*gen
,
3388 struct cuda_array_ref_group
*group
)
3391 const char *type
[] = { "read", "write" };
3392 struct cuda_array_info
*array
= group
->array
;
3393 int n_index
= array
->n_index
;
3395 for (i
= 0; i
< 2; ++i
) {
3396 fprintf(gen
->cuda
.kernel_c
, "#define %s_", type
[i
]);
3397 print_array_name(gen
->cuda
.kernel_c
, group
);
3398 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 0);
3399 fprintf(gen
->cuda
.kernel_c
, " %s_", type
[i
]);
3400 print_array_name(gen
->cuda
.kernel_c
, group
);
3401 fprintf(gen
->cuda
.kernel_c
, "_");
3402 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 1);
3403 fprintf(gen
->cuda
.kernel_c
, "\n");
3405 fprintf(gen
->cuda
.kernel_c
, "#define %s_", type
[i
]);
3406 print_array_name(gen
->cuda
.kernel_c
, group
);
3407 fprintf(gen
->cuda
.kernel_c
, "_");
3408 print_iterator_list(gen
->cuda
.kernel_c
, n_index
, "a", 0);
3410 fprintf(gen
->cuda
.kernel_c
, " ");
3411 print_global_index(gen
->ctx
, gen
->cuda
.kernel_c
, array
);
3412 fprintf(gen
->cuda
.kernel_c
, " = ");
3413 print_local_index(gen
->cuda
.kernel_c
, group
);
3415 fprintf(gen
->cuda
.kernel_c
, " ");
3416 print_local_index(gen
->cuda
.kernel_c
, group
);
3417 fprintf(gen
->cuda
.kernel_c
, " = ");
3418 print_global_index(gen
->ctx
, gen
->cuda
.kernel_c
, array
);
3420 fprintf(gen
->cuda
.kernel_c
, "\n");
3424 static void print_copy_defines(struct cuda_gen
*gen
)
3428 for (i
= 0; i
< gen
->n_array
; ++i
) {
3429 struct cuda_array_info
*array
= &gen
->array
[i
];
3431 for (j
= 0; j
< array
->n_group
; ++j
) {
3432 if (array
->groups
[j
]->private_bound
)
3434 if (!array
->groups
[j
]->shared_bound
)
3436 print_array_copy_defines(gen
, array
->groups
[j
]);
3441 /* The sizes of the arrays on the host that have been computed by
3442 * extract_array_info may depend on the parameters. Use the extra
3443 * constraints on the parameters that are valid at "host_domain"
3444 * to simplify these expressions.
3446 static void localize_bounds(struct cuda_gen
*gen
,
3447 __isl_keep isl_set
*host_domain
)
3453 context
= isl_set_copy(host_domain
);
3454 nvar
= isl_set_dim(host_domain
, isl_dim_set
);
3455 context
= isl_set_project_out(host_domain
, isl_dim_set
, 0, nvar
);
3457 for (i
= 0; i
< gen
->n_array
; ++i
) {
3458 struct cuda_array_info
*array
= &gen
->array
[i
];
3460 if (array
->n_group
== 0)
3463 for (j
= 0; j
< array
->n_index
; ++j
) {
3466 pwaff
= isl_pw_aff_copy(array
->bound
[j
]);
3467 pwaff
= isl_pw_aff_gist(pwaff
, isl_set_copy(context
));
3468 array
->local_bound
[j
] = pwaff
;
3471 isl_set_free(context
);
3474 /* Set gen->tile_len and gen->n_parallel to those of the first statement
3475 * in the statement list u.
3476 * Because of the way the schedule is constructed, the other statements
3477 * in the list, if any, should have the same values for these properties.
3479 static void set_tile_len(struct cuda_gen
*gen
, struct clast_user_stmt
*u
)
3482 struct cuda_stmt
*stmt
;
3484 nr
= atoi(u
->statement
->name
+ 2);
3485 stmt
= &gen
->stmts
[nr
];
3487 gen
->tile_len
= stmt
->tile_len
;
3488 gen
->n_parallel
= stmt
->n_parallel
;
3491 /* This function is called for each leaf in the clast of the host code.
3492 * We first specialize the schedule to the site of the leaf, compute
3493 * the size of shared memory and then print the body of host code
3494 * and the associated kernel (through a call to print_kernel_body).
3496 static void print_host_user(struct gpucode_info
*code
,
3497 struct clast_user_stmt
*u
)
3499 struct cuda_gen
*gen
= code
->user
;
3502 isl_set
*host_domain
;
3503 isl_union_map
*access
;
3504 isl_union_map
*local_sched
;
3505 isl_union_set
*arrays
;
3507 set_tile_len(gen
, u
);
3510 host_domain
= extract_entire_host_domain(u
);
3512 local_sched
= isl_union_map_intersect_range(
3513 isl_union_map_copy(gen
->sched
),
3514 isl_union_set_from_set(extend(isl_set_copy(host_domain
),
3515 gen
->untiled_len
)));
3516 access
= isl_union_map_union(isl_union_map_copy(gen
->read
),
3517 isl_union_map_copy(gen
->write
));
3518 access
= isl_union_map_apply_domain(access
,
3519 isl_union_map_copy(local_sched
));
3520 arrays
= isl_union_map_range(access
);
3522 print_indent(code
->dst
, code
->indent
);
3523 fprintf(code
->dst
, "dim3 k%d_dimBlock(", gen
->kernel_id
);
3524 print_reverse_list(code
->dst
, gen
->n_block
, gen
->block_dim
);
3525 fprintf(code
->dst
, ");\n");
3527 print_indent(code
->dst
, code
->indent
);
3528 fprintf(code
->dst
, "dim3 k%d_dimGrid(", gen
->kernel_id
);
3529 print_reverse_list(code
->dst
, gen
->n_grid
, gen
->grid_dim
);
3530 fprintf(code
->dst
, ");\n");
3532 gen
->tiled_sched
= tile_schedule(gen
, local_sched
);
3533 gen
->tiled_sched
= parametrize_tiled_schedule(gen
, gen
->tiled_sched
);
3534 gen
->tiled_sched
= scale_tile_loops(gen
, gen
->tiled_sched
);
3536 gen
->local_sched
= isl_union_map_copy(gen
->tiled_sched
);
3538 dim
= isl_union_map_get_dim(gen
->local_sched
);
3539 par
= parametrization(dim
, gen
->tiled_len
, 0, gen
->shared_len
, "g");
3540 gen
->local_sched
= isl_union_map_intersect_range(gen
->local_sched
,
3541 isl_union_set_from_set(par
));
3543 gen
->local_sched
= thread_tile_schedule(gen
, gen
->local_sched
);
3544 gen
->local_sched
= scale_thread_tile_loops(gen
, gen
->local_sched
);
3546 gen
->private_access
= NULL
;
3547 compute_shared_sched(gen
);
3548 gen
->privatization
= compute_privatization(gen
);
3549 group_references(gen
);
3550 compute_private_size(gen
);
3551 localize_bounds(gen
, host_domain
);
3553 gen
->local_sched
= interchange_for_unroll(gen
, gen
->local_sched
);
3555 print_copy_defines(gen
);
3556 print_kernel_launch(gen
, arrays
);
3558 fprintf(gen
->cuda
.kernel_c
, "{\n");
3560 print_kernel_body(gen
, host_domain
, gen
->tiled_sched
);
3562 fprintf(gen
->cuda
.kernel_c
, "}\n");
3564 free_local_array_info(gen
);
3565 isl_map_free(gen
->privatization
);
3566 isl_union_map_free(gen
->private_access
);
3567 isl_union_map_free(gen
->local_sched
);
3568 isl_union_map_free(gen
->tiled_sched
);
3569 isl_union_map_free(gen
->shared_sched
);
3570 isl_union_map_free(gen
->shared_proj
);
3571 isl_union_set_free(arrays
);
3572 isl_set_free(host_domain
);
3574 free(gen
->tile_size
);
3578 /* Use CLooG to generate code for the outer gen->tile_first loops
3579 * of the global schedule in gen->sched.
3580 * The pretty printing of this code is handled by gpu_print_host_stmt,
3581 * which calls print_host_user for each kernel invocation location.
3583 static void print_cloog_host_code(struct cuda_gen
*gen
)
3587 isl_union_map
*sched
;
3588 CloogOptions
*options
;
3589 CloogDomain
*cloog_context
;
3590 CloogUnionDomain
*ud
;
3592 struct clast_stmt
*stmt
;
3595 options
= cloog_options_malloc(gen
->state
);
3596 options
->language
= LANGUAGE_C
;
3598 options
->strides
= 1;
3599 options
->stop
= gen
->tile_first
;
3600 options
->f
= gen
->untiled_len
;
3601 options
->l
= gen
->untiled_len
;
3602 options
->save_domains
= 1;
3603 options
->noscalars
= 1;
3605 sched
= isl_union_map_copy(gen
->sched
);
3606 ud
= cloog_union_domain_from_isl_union_map(sched
);
3607 for (i
= 0; i
< options
->stop
; ++i
) {
3608 snprintf(name
, sizeof(name
), "h%d", i
);
3609 ud
= cloog_union_domain_set_name(ud
, CLOOG_SCAT
, i
, name
);
3611 context
= isl_set_copy(gen
->context
);
3612 cloog_context
= cloog_domain_from_isl_set(context
);
3613 input
= cloog_input_alloc(cloog_context
, ud
);
3615 stmt
= cloog_clast_create_from_input(input
, options
);
3617 gen
->code
.indent
= 0;
3618 gen
->code
.dst
= gen
->cuda
.host_c
;
3619 gen
->code
.print_user_stmt
= NULL
;
3620 gen
->code
.print_user_stmt_list
= &print_host_user
;
3621 gen
->code
.print_for_head
= NULL
;
3622 gen
->code
.print_for_foot
= NULL
;
3623 gen
->code
.user
= gen
;
3624 gpu_print_host_stmt(&gen
->code
, stmt
);
3626 cloog_clast_free(stmt
);
3627 cloog_options_free(options
);
3630 void print_host_code(struct cuda_gen
*gen
)
3632 fprintf(gen
->cuda
.host_c
, "{\n");
3633 print_cloog_macros(gen
->cuda
.host_c
);
3634 print_cloog_macros(gen
->cuda
.kernel_c
);
3636 declare_device_arrays(gen
);
3638 allocate_device_arrays(gen
);
3639 copy_arrays_to_device(gen
);
3642 print_cloog_host_code(gen
);
3644 copy_arrays_from_device(gen
);
3645 free_device_arrays(gen
);
3647 fprintf(gen
->cuda
.host_c
, "}\n");
3650 __isl_give isl_set
*add_context_from_str(__isl_take isl_set
*set
,
3659 ctx
= isl_set_get_ctx(set
);
3660 context
= isl_set_read_from_str(ctx
, str
, -1);
3661 context
= isl_set_align_params(context
, isl_set_get_dim(set
));
3662 set
= isl_set_intersect(set
, context
);
3667 /* Convert scop->context to an isl_set.
3669 static __isl_give isl_set
*extract_context(isl_ctx
*ctx
, scoplib_scop_p scop
)
3673 dim
= isl_dim_set_alloc(ctx
, scop
->nb_parameters
, 0);
3674 dim
= set_dim_names(dim
, isl_dim_param
, scop
->parameters
);
3675 return scoplib_matrix_to_isl_set(scop
->context
, dim
);
3678 /* Return an array of cuda_stmt representing the statements in "scop".
3680 static struct cuda_stmt
*extract_stmts(isl_ctx
*ctx
, scoplib_scop_p scop
,
3681 __isl_keep isl_set
*context
)
3684 struct cuda_stmt
*stmts
;
3685 scoplib_statement_p stmt
= scop
->statement
;
3687 n
= scoplib_statement_number(scop
->statement
);
3688 stmts
= isl_calloc_array(ctx
, struct cuda_stmt
, n
);
3691 for (stmt
= scop
->statement
, n
= 0; stmt
; stmt
= stmt
->next
, n
++) {
3694 struct cuda_stmt
*s
= &stmts
[n
];
3696 snprintf(name
, sizeof(name
), "S_%d", n
);
3698 dim
= isl_dim_set_alloc(ctx
, scop
->nb_parameters
,
3699 stmt
->nb_iterators
);
3700 dim
= set_dim_names(dim
, isl_dim_param
, scop
->parameters
);
3701 dim
= set_dim_names(dim
, isl_dim_set
, stmt
->iterators
);
3702 dim
= isl_dim_set_tuple_name(dim
, isl_dim_set
, name
);
3703 dim
= set_dim_names(dim
, isl_dim_set
, stmt
->iterators
);
3704 s
->domain
= scoplib_matrix_list_to_isl_set(stmt
->domain
,
3706 s
->domain
= isl_set_intersect(s
->domain
, isl_set_copy(context
));
3707 s
->text
= strdup(stmt
->body
);
3708 stmt_extract_accesses(s
);
3714 /* Extract all the read and write accesses from "scop" and store
3715 * them in gen->read and gen->write.
3717 static void extract_accesses(struct cuda_gen
*gen
, scoplib_scop_p scop
)
3720 int n
= scoplib_statement_number(scop
->statement
);
3722 scoplib_statement_p stmt
;
3724 dim
= isl_set_get_dim(gen
->context
);
3725 gen
->write
= isl_union_map_empty(isl_dim_copy(dim
));
3726 gen
->read
= isl_union_map_empty(dim
);
3728 for (i
= 0, stmt
= scop
->statement
; i
< n
; ++i
, stmt
= stmt
->next
) {
3729 isl_union_map
*read_i
;
3730 isl_union_map
*write_i
;
3732 read_i
= scoplib_access_to_isl_union_map(stmt
->read
,
3733 isl_set_copy(gen
->stmts
[i
].domain
),
3735 write_i
= scoplib_access_to_isl_union_map(stmt
->write
,
3736 isl_set_copy(gen
->stmts
[i
].domain
),
3739 gen
->read
= isl_union_map_union(gen
->read
, read_i
);
3740 gen
->write
= isl_union_map_union(gen
->write
, write_i
);
3744 /* Extract and return the original schedule of the program from "scop".
3746 static isl_union_map
*extract_original_schedule(struct cuda_gen
*gen
,
3747 scoplib_scop_p scop
)
3750 int n
= scoplib_statement_number(scop
->statement
);
3752 isl_union_map
*sched
;
3753 scoplib_statement_p stmt
;
3755 dim
= isl_set_get_dim(gen
->context
);
3756 sched
= isl_union_map_empty(dim
);
3758 for (i
= 0, stmt
= scop
->statement
; i
< n
; ++i
, stmt
= stmt
->next
) {
3761 dim
= isl_set_get_dim(gen
->stmts
[i
].domain
);
3762 dim
= isl_dim_from_domain(dim
);
3763 dim
= isl_dim_add(dim
, isl_dim_out
, 2 * stmt
->nb_iterators
+ 1);
3764 sched_i
= scoplib_schedule_to_isl_map(stmt
->schedule
, dim
);
3766 sched
= isl_union_map_union(sched
,
3767 isl_union_map_from_map(sched_i
));
3773 /* Return the union of all iteration domains of the gen->stmts[i].
3775 static __isl_give isl_union_set
*extract_domain(struct cuda_gen
*gen
)
3778 isl_union_set
*domain
;
3780 domain
= isl_union_set_empty(isl_set_get_dim(gen
->context
));
3781 for (i
= 0; i
< gen
->n_stmts
; ++i
) {
3784 domain_i
= isl_set_copy(gen
->stmts
[i
].domain
);
3785 domain
= isl_union_set_union(domain
,
3786 isl_union_set_from_set(domain_i
));
3792 /* Information about the outermost tilable bands in the forest of bands.
3794 * tile_len and n_parallel are only sets on band_info structures
3795 * that correspond to outermost bands. For other bands (in particular,
3796 * ancestors of the outermost bands), n_parallal is set to 0.
3798 * prefix is the (padded) schedule leading up to the outermost tilable bands.
3800 * tile_first is the number of schedule dimensions in prefix.
3802 * suffix is the schedule of the outermost tilable bands and their descendants.
3805 struct cuda_gen
*gen
;
3809 isl_union_map
*prefix
;
3810 isl_union_map
*suffix
;
3813 /* Set tile_len and n_parallel of the statement to that of
3814 * their outermost band, recorded in the band_info.
3816 static int set_stmt_tile_len(__isl_take isl_map
*map
, void *user
)
3818 struct band_info
*info
= user
;
3820 struct cuda_stmt
*stmt
;
3822 nr
= atoi(isl_map_get_tuple_name(map
, isl_dim_in
) + 2);
3823 stmt
= &info
->gen
->stmts
[nr
];
3825 stmt
->tile_len
= info
->tile_len
;
3826 stmt
->n_parallel
= info
->n_parallel
;
3833 static void list_select_outer_band(struct cuda_gen
*gen
,
3834 __isl_take isl_band_list
*list
, int pos
, struct band_info
*list_info
);
3836 /* Check if this band has any parallel loops. If so, take it as
3837 * the outermost tilable band. If not, continue looking for the
3838 * outermost tilable band in the children of the current band.
3840 static void band_select_outer_band(struct cuda_gen
*gen
,
3841 __isl_take isl_band
*band
, int pos
, struct band_info
*info
)
3843 int n
= isl_band_n_member(band
);
3846 for (n_parallel
= 0; n_parallel
< n
; ++n_parallel
)
3847 if (!isl_band_member_is_zero_distance(band
, n_parallel
))
3850 info
->n_parallel
= n_parallel
;
3853 info
->tile_first
= pos
;
3855 info
->prefix
= isl_band_get_prefix_schedule(band
);
3856 info
->suffix
= isl_union_map_flat_range_product(
3857 isl_band_get_partial_schedule(band
),
3858 isl_band_get_suffix_schedule(band
));
3859 isl_union_map_foreach_map(info
->prefix
,
3860 &set_stmt_tile_len
, info
);
3862 isl_band_list
*children
;
3863 assert(isl_band_has_children(band
));
3864 children
= isl_band_get_children(band
);
3865 list_select_outer_band(gen
, children
, pos
+ n
, info
);
3868 isl_band_free(band
);
3871 /* Comparison function that returns a non-zero value for band_infos
3872 * with different tile_len fields or different n_parallel fields.
3874 static int cmp_band(const void *p1
, const void *p2
)
3876 const struct band_info
*info1
= p1
;
3877 const struct band_info
*info2
= p2
;
3879 if (info1
->tile_len
!= info2
->tile_len
)
3880 return info1
->tile_len
- info2
->tile_len
;
3882 return info1
->n_parallel
- info2
->n_parallel
;
3885 /* Extend "umap" with coordinates with fixed value "val"
3886 * to a total length of "dst_len", assuming the original dimension is "src_len".
3888 static __isl_give isl_union_map
*extend_range(__isl_take isl_union_map
*umap
,
3889 int src_len
, int dst_len
, int val
)
3895 dim
= isl_union_map_get_dim(umap
);
3896 map
= isl_map_reverse(projection(dim
, dst_len
, src_len
));
3897 for (i
= src_len
; i
< dst_len
; ++i
)
3898 map
= isl_map_fix_si(map
, isl_dim_out
, i
, val
);
3900 umap
= isl_union_map_apply_range(umap
, isl_union_map_from_map(map
));
3905 /* Group bands with the same values for tile_len and n_parallel.
3906 * The prefix schedule is then extended with a fixed coordinate that
3907 * is different for each such group.
3908 * Note that the actual values for this coordinate are not important.
3909 * The bands have already been effectively separated at a higher level
3910 * or they are independent and may be executed in parallel.
3911 * The list of band_info has been sorted before this functions is called.
3913 static void separate_bands(struct band_info
*info
, int n
)
3918 for (i
= 0; i
< n
; ++i
) {
3919 int l
= info
[i
].tile_first
;
3922 (info
[i
].tile_len
!= info
[i
- 1].tile_len
||
3923 info
[i
].n_parallel
!= info
[i
- 1].n_parallel
))
3926 info
[i
].prefix
= extend_range(info
[i
].prefix
,
3928 info
[i
].tile_first
= l
+ 1;
3932 /* Select the outermost bands in the elements of the list, align
3933 * their prefix schedules, separate bands with different values
3934 * for tile_len and/or n_parallel and then combine the resulting
3935 * prefix and suffix schedules into a single pair of prefix and
3936 * suffix schedules for the entire list.
3938 static void list_select_outer_band(struct cuda_gen
*gen
,
3939 __isl_take isl_band_list
*list
, int pos
, struct band_info
*list_info
)
3943 int n
= isl_band_list_n_band(list
);
3944 isl_ctx
*ctx
= isl_band_list_get_ctx(list
);
3945 struct band_info
*info
;
3947 isl_union_map
*prefix
;
3948 isl_union_map
*suffix
;
3951 info
= isl_calloc_array(ctx
, struct band_info
, n
);
3955 for (i
= 0; i
< n
; ++i
) {
3956 band
= isl_band_list_get_band(list
, i
);
3957 band_select_outer_band(gen
, band
, pos
, &info
[i
]);
3958 if (info
[i
].tile_first
> max_tile_first
)
3959 max_tile_first
= info
[i
].tile_first
;
3962 for (i
= 0; i
< n
; ++i
) {
3963 if (info
[i
].tile_first
== max_tile_first
)
3965 info
[i
].prefix
= extend_range(info
[i
].prefix
,
3966 info
[i
].tile_first
, max_tile_first
, 0);
3969 qsort(info
, n
, sizeof(struct band_info
), &cmp_band
);
3971 for (i
= 0; i
< n
- 1; ++i
)
3972 if (info
[i
].tile_len
!= info
[i
+ 1].tile_len
||
3973 info
[i
].n_parallel
!= info
[i
+ 1].n_parallel
)
3977 separate_bands(info
, n
);
3979 prefix
= info
[0].prefix
;
3980 suffix
= info
[0].suffix
;
3982 for (i
= 1; i
< n
; ++i
) {
3983 prefix
= isl_union_map_union(prefix
, info
[i
].prefix
);
3984 suffix
= isl_union_map_union(suffix
, info
[i
].suffix
);
3987 list_info
->tile_first
= info
[0].tile_first
;
3988 list_info
->tile_len
= -1;
3989 list_info
->prefix
= prefix
;
3990 list_info
->suffix
= suffix
;
3992 isl_band_list_free(list
);
3996 /* Set max_out to the maximal number of output dimensions over
3999 static int update_max_out(__isl_take isl_map
*map
, void *user
)
4001 int *max_out
= user
;
4002 int n_out
= isl_map_dim(map
, isl_dim_out
);
4004 if (n_out
> *max_out
)
4011 struct align_range_data
{
4016 /* Extend the dimension of the range of the given map to data->max_out and
4017 * then add the result to data->res.
4019 static int map_align_range(__isl_take isl_map
*map
, void *user
)
4021 struct align_range_data
*data
= user
;
4025 int n_out
= isl_map_dim(map
, isl_dim_out
);
4027 dim
= isl_union_map_get_dim(data
->res
);
4028 proj
= isl_map_reverse(projection(dim
, data
->max_out
, n_out
));
4029 for (i
= n_out
; i
< data
->max_out
; ++i
)
4030 proj
= isl_map_fix_si(proj
, isl_dim_out
, i
, 0);
4032 map
= isl_map_apply_range(map
, proj
);
4034 data
->res
= isl_union_map_add_map(data
->res
, map
);
4039 /* Extend the ranges of the maps in the union map such they all have
4040 * the same dimension.
4042 static __isl_give isl_union_map
*align_range(__isl_take isl_union_map
*umap
)
4044 struct align_range_data data
;
4047 isl_union_map_foreach_map(umap
, &update_max_out
, &data
.max_out
);
4049 data
.res
= isl_union_map_empty(isl_union_map_get_dim(umap
));
4050 isl_union_map_foreach_map(umap
, &map_align_range
, &data
);
4052 isl_union_map_free(umap
);
4056 /* Select the outermost tilable band that (by construction)
4057 * has at least one parallel loop.
4058 * The starting position of the aligned band is stored in the pair
4060 * The sizes and number of parallel loops may be different in different
4061 * parts of the band forest and are therefore stored in the cuda_stmts.
4063 * Return the complete schedule, with the tilable bands aligned
4064 * at gen->tile_first and padded with zero, if needed.
4066 static __isl_give isl_union_map
*select_outer_tilable_band(struct cuda_gen
*gen
,
4067 __isl_keep isl_schedule
*schedule
)
4069 isl_band_list
*list
;
4070 struct band_info info
;
4072 gen
->n_parallel
= 0;
4075 list
= isl_schedule_get_band_forest(schedule
);
4077 list_select_outer_band(gen
, list
, 0, &info
);
4079 gen
->tile_first
= info
.tile_first
;
4080 info
.suffix
= align_range(info
.suffix
);
4082 return isl_union_map_flat_range_product(info
.prefix
, info
.suffix
);
4085 /* Set gen->untiled_len to the number of scheduling dimensions
4086 * for the schedule of the first domain.
4087 * We assume here that this number is the same for all domains.
4089 static int set_untiled_len(__isl_take isl_map
*map
, void *user
)
4091 unsigned *untiled_len
= user
;
4093 *untiled_len
= isl_map_dim(map
, isl_dim_out
);
4099 /* Compute an appropriate schedule based on the accesses in
4100 * gen->read and gen->write.
4102 * We first compute dependences and then use those to compute
4103 * a schedule that has a parallel loop in each tilable band.
4104 * Finally, we select the outermost tilable band.
4106 static void compute_schedule(struct cuda_gen
*gen
,
4107 __isl_take isl_union_map
*sched
)
4109 isl_ctx
*ctx
= isl_union_map_get_ctx(sched
);
4110 isl_union_set
*domain
;
4111 isl_union_map
*empty
;
4112 isl_union_map
*dep_raw
, *dep2
, *dep3
, *dep
;
4113 isl_union_map
*uninitialized
;
4114 isl_schedule
*schedule
;
4115 struct isl_options
*options
;
4117 empty
= isl_union_map_empty(isl_union_map_get_dim(sched
));
4119 isl_union_map_compute_flow(isl_union_map_copy(gen
->read
),
4120 isl_union_map_copy(gen
->write
), empty
,
4121 isl_union_map_copy(sched
),
4122 &dep_raw
, NULL
, &uninitialized
, NULL
);
4123 isl_union_map_compute_flow(isl_union_map_copy(gen
->write
),
4124 isl_union_map_copy(gen
->write
),
4125 isl_union_map_copy(gen
->read
),
4126 isl_union_map_copy(sched
),
4127 &dep2
, &dep3
, NULL
, NULL
);
4128 isl_union_map_free(sched
);
4130 gen
->copy_in
= isl_union_map_range(uninitialized
);
4132 dep
= isl_union_map_union(dep2
, dep3
);
4133 dep
= isl_union_map_union(dep
, dep_raw
);
4134 dep
= isl_union_map_coalesce(dep
);
4136 domain
= extract_domain(gen
);
4137 options
= isl_ctx_peek_options(ctx
, isl_options_arg
);
4138 options
->schedule_outer_zero_distance
= 1;
4139 schedule
= isl_union_set_compute_schedule(isl_union_set_copy(domain
),
4140 isl_union_map_copy(dep
), dep
);
4142 sched
= select_outer_tilable_band(gen
, schedule
);
4144 isl_union_map_foreach_map(sched
, &set_untiled_len
, &gen
->untiled_len
);
4145 sched
= isl_union_map_intersect_domain(sched
, domain
);
4148 isl_schedule_free(schedule
);
4151 /* Replace the scop in the "input" file by equivalent code
4152 * that uses the GPU. "scop" is assumed to correspond to this scop.
4154 * We first compute a schedule that respects the dependences
4155 * of the original program and select the outermost band
4156 * of tilable dimensions that has at least one parallel loop.
4157 * We then have three blocks of dimensions
4161 * The tilable band "B" is first tiled according to "tile.sizes", resulting
4166 * For each iteration of the T loop and for each array, we compute
4167 * the array elements accessed by that iteration, construct a rectangular
4168 * box around it and shift it to the origin. The result is used
4169 * as shared memory for the array.
4171 * We then split off at most 2 parallel loops from the T loops and
4172 * at most 3 parallel loops from the P loops
4176 * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4177 * according to "grid.sizes"/"block.sizes".
4179 * H T1T T1P T2 P1T P1P P2 G
4181 * Finally, the T1P and P1P iterators are equated to the block and
4182 * thread dimensions respectively and so are effectively removed.
4183 * The H loops are run on the host. The T1T, T2, P1T, P2 and G loops
4184 * are run on the GPU.
4186 * Code is generated in three stages. We first generate code for the
4187 * host (the H loops), with iterators h%d. Then, for each leaf node
4188 * of the resulting AST, we generate code for the shared loops (up to
4189 * and including T2), with iterators g%d and after equating the H loops
4190 * to h%d parameters and the T1P loops to the block dimensions.
4191 * Finally, we generate code for the remaining loops in a similar fashion.
4193 * The function frees "scop" and "ctx".
4195 int cuda_scop(isl_ctx
*ctx
, scoplib_scop_p scop
, struct ppcg_options
*options
,
4198 isl_union_map
*sched
;
4199 struct cuda_gen gen
;
4202 gen
.context
= extract_context(gen
.ctx
, scop
);
4203 gen
.context
= add_context_from_str(gen
.context
, options
->ctx
);
4204 gen
.n_stmts
= scoplib_statement_number(scop
->statement
);
4205 gen
.stmts
= extract_stmts(gen
.ctx
, scop
, gen
.context
);
4206 extract_accesses(&gen
, scop
);
4207 gen
.options
= options
;
4208 gen
.state
= cloog_isl_state_malloc(gen
.ctx
);
4210 cuda_open_files(&gen
.cuda
, input
);
4212 collect_array_info(&gen
);
4214 sched
= extract_original_schedule(&gen
, scop
);
4215 compute_schedule(&gen
, sched
);
4217 print_host_code(&gen
);
4219 cloog_state_free(gen
.state
);
4220 clear_cuda_gen(&gen
);
4221 isl_ctx_free(gen
.ctx
);
4222 scoplib_scop_free(scop
);
4224 cuda_close_files(&gen
.cuda
);