gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
  33 /* The fields stride, shift and shift_map only contain valid information
  34  * if shift != NULL.
  35  * If so, they express that current index is such that if you add shift,
  36  * then the result is always a multiple of stride.
  37  * shift_map contains the mapping
  38  *
  39  *      i -> (i + shift)/stride
  40  *
  41  * Let D represent the initial shared_len dimensions of the computed schedule.
  42  * The spaces of "lb" and "shift" are of the form
  43  *
  44  *      D -> [b]
  45  *
  46  * "shift_map" is of the form
  47  *
  48  *      [D -> i] -> [D -> (i + shift(D))/stride]
  49  */
  50 struct gpu_array_bound {
  51         isl_val *size;
  52         isl_aff *lb;
  53
  54         isl_val *stride;
  55         isl_aff *shift;
  56         isl_basic_map *shift_map;
  57 };
  58
  59 /* A tile of an array.
  60  *
  61  * n is the dimension of the array.
  62  * bound is an array of size "n" representing the lower bound
  63  *      and size for each index.
  64  *
  65  * tiling maps a tile in the global array to the corresponding
  66  * shared/private memory tile and is of the form
  67  *
  68  *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
  69  *
  70  * where D represents the initial shared_len dimensions
  71  * of the computed schedule.
  72  */
  73 struct gpu_array_tile {
  74         int n;
  75         struct gpu_array_bound *bound;
  76         isl_multi_aff *tiling;
  77 };
  78
  79 struct gpu_array_info;
  80
  81 /* A group of array references in a kernel that should be handled together.
  82  * If private_tile is not NULL, then it is mapped to registers.
  83  * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
  84  * Otherwise, it is accessed from global memory.
  85  */
  86 struct gpu_array_ref_group {
  87         /* The references in this group access this array. */
  88         struct gpu_array_info *array;
  89         /* Position of this group in the list of reference groups of array. */
  90         int nr;
  91
  92         /* The following fields are use during the construction of the groups.
  93          * access is the combined access relation relative to the shared
  94          * memory tiling.  In particular, the domain of the map corresponds
  95          * to the first shared_len dimensions of the computed schedule.
  96          * write is set if any access in the group is a write.
  97          */
  98         isl_map *access;
  99         int write;
 100
 101         /* The shared memory tile, NULL if none. */
 102         struct gpu_array_tile *shared_tile;
 103
 104         /* The private memory tile, NULL if none. */
 105         struct gpu_array_tile *private_tile;
 106
 107         /* References in this group; point to elements of a linked list. */
 108         int n_ref;
 109         struct gpu_stmt_access **refs;
 110
 111         /* Last shared memory tile dimension that affects tile of this group. */
 112         int last_shared;
 113 };
 114
 115 struct gpu_gen {
 116         isl_ctx *ctx;
 117         struct ppcg_options *options;
 118
 119         /* Callback for printing of AST in appropriate format. */
 120         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
 121                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 122                 void *user);
 123         void *print_user;
 124
 125         struct gpu_prog *prog;
 126         /* The generated AST. */
 127         isl_ast_node *tree;
 128
 129         /* tile, grid and block sizes for each kernel */
 130         isl_union_map *sizes;
 131
 132         /* Identifier of current kernel. */
 133         int kernel_id;
 134         /* Pointer to the current kernel. */
 135         struct ppcg_kernel *kernel;
 136         /* Does the computed schedule exhibit any parallelism? */
 137         int any_parallelism;
 138
 139         /* First tile dimension. */
 140         int tile_first;
 141         /* Number of tile dimensions. */
 142         int tile_len;
 143         /* Number of initial parallel loops among tile dimensions. */
 144         int n_parallel;
 145
 146         /* Number of dimensions determining shared memory. */
 147         int shared_len;
 148
 149         /* Number of rows in the untiled schedule. */
 150         int untiled_len;
 151         /* Number of rows in the tiled schedule. */
 152         int tiled_len;
 153         /* Number of rows in schedule after tiling/wrapping over threads. */
 154         int thread_tiled_len;
 155
 156         /* Global untiled schedule. */
 157         isl_union_map *sched;
 158         /* Local (per kernel launch) tiled schedule. */
 159         isl_union_map *tiled_sched;
 160         /* Local schedule per shared memory tile loop iteration. */
 161         isl_union_map *local_sched;
 162
 163         /* Local tiled schedule projected onto the shared tile loops and
 164          * the loops that will be wrapped over the threads,
 165          * with all shared tile loops parametrized.
 166          */
 167         isl_union_map *shared_sched;
 168         /* Projects out the loops that will be wrapped over the threads
 169          * from shared_sched.
 170          */
 171         isl_union_map *shared_proj;
 172
 173         /* A map that takes the range of shared_sched as input,
 174          * wraps the appropriate loops over the threads and then projects
 175          * out these loops.
 176          */
 177         isl_map *privatization;
 178
 179         /* A map from the shared memory tile loops and the thread indices
 180          * (as parameters) to the set of accessed memory elements that
 181          * will be accessed through private copies.
 182          */
 183         isl_union_map *private_access;
 184
 185         /* The schedule for the current private/shared access
 186          * (within print_private_access or print_shared_access).
 187          */
 188         isl_map *copy_sched;
 189         /* The array reference group corresponding to copy_sched. */
 190         struct gpu_array_ref_group *copy_group;
 191
 192         /* First loop to unroll (or -1 if none) in the current part of the
 193          * schedule.
 194          */
 195         int first_unroll;
 196
 197         int n_grid;
 198         int n_block;
 199         /* Note: in the input file, the sizes of the grid and the blocks
 200          * are specified in the order x, y, z, but internally, the sizes
 201          * are stored in reverse order, so that the last element always
 202          * refers to the x dimension.
 203          */
 204         int grid_dim[2];
 205         int block_dim[3];
 206         int *tile_size;
 207 };
 208
 209 /* Print the name of the local copy of a given group of array references.
 210  */
 211 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 212         struct gpu_array_ref_group *group)
 213 {
 214         int global = 0;
 215
 216         if (group->private_tile)
 217                 p = isl_printer_print_str(p, "private_");
 218         else if (group->shared_tile)
 219                 p = isl_printer_print_str(p, "shared_");
 220         else
 221                 global = 1;
 222         p = isl_printer_print_str(p, group->array->name);
 223         if (!global && group->array->n_group > 1) {
 224                 p = isl_printer_print_str(p, "_");
 225                 p = isl_printer_print_int(p, group->nr);
 226         }
 227
 228         return p;
 229 }
 230
 231 /* Collect all references to the given array and store pointers to them
 232  * in array->refs.
 233  */
 234 static void collect_references(struct gpu_prog *prog,
 235         struct gpu_array_info *array)
 236 {
 237         int i;
 238         int n;
 239
 240         n = 0;
 241         for (i = 0; i < prog->n_stmts; ++i) {
 242                 struct gpu_stmt *stmt = &prog->stmts[i];
 243                 struct gpu_stmt_access *access;
 244
 245                 for (access = stmt->accesses; access; access = access->next) {
 246                         const char *name;
 247                         name = isl_map_get_tuple_name(access->access,
 248                                                       isl_dim_out);
 249                         if (name && !strcmp(array->name, name))
 250                                 n++;
 251                 }
 252         }
 253
 254         array->n_ref = n;
 255         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 256         assert(array->refs);
 257
 258         n = 0;
 259         for (i = 0; i < prog->n_stmts; ++i) {
 260                 struct gpu_stmt *stmt = &prog->stmts[i];
 261                 struct gpu_stmt_access *access;
 262
 263                 for (access = stmt->accesses; access; access = access->next) {
 264                         const char *name;
 265                         name = isl_map_get_tuple_name(access->access,
 266                                                       isl_dim_out);
 267                         if (!name || strcmp(array->name, name))
 268                                 continue;
 269
 270                         array->refs[n++] = access;
 271                 }
 272         }
 273 }
 274
 275 /* Create a gpu_array_tile for an array of dimension "n_index".
 276  */
 277 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 278 {
 279         int i;
 280         struct gpu_array_tile *tile;
 281
 282         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 283         assert(tile);
 284
 285         tile->n = n_index;
 286
 287         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 288         assert(tile->bound);
 289
 290         for (i = 0; i < n_index; ++i) {
 291                 tile->bound[i].size = NULL;
 292                 tile->bound[i].lb = NULL;
 293                 tile->bound[i].stride = NULL;
 294                 tile->bound[i].shift = NULL;
 295                 tile->bound[i].shift_map = NULL;
 296         }
 297
 298         return tile;
 299 }
 300
 301 static void *free_tile(struct gpu_array_tile *tile)
 302 {
 303         int j;
 304
 305         if (!tile)
 306                 return NULL;
 307
 308         for (j = 0; j < tile->n; ++j) {
 309                 isl_val_free(tile->bound[j].size);
 310                 isl_val_free(tile->bound[j].stride);
 311                 isl_aff_free(tile->bound[j].lb);
 312                 isl_aff_free(tile->bound[j].shift);
 313                 isl_basic_map_free(tile->bound[j].shift_map);
 314         }
 315         free(tile->bound);
 316         isl_multi_aff_free(tile->tiling);
 317         free(tile);
 318
 319         return NULL;
 320 }
 321
 322 static struct pet_array *find_array(struct ppcg_scop *scop,
 323         __isl_keep isl_set *accessed)
 324 {
 325         int i;
 326         isl_id *id;
 327
 328         id = isl_set_get_tuple_id(accessed);
 329
 330         for (i = 0; i < scop->n_array; ++i) {
 331                 isl_id *id_i;
 332
 333                 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
 334                 isl_id_free(id_i);
 335                 if (id == id_i)
 336                         break;
 337         }
 338         isl_id_free(id);
 339
 340         return i < scop->n_array ? scop->arrays[i] : NULL;
 341 }
 342
 343 /* Compute and return the extent of "array", taking into account the set of
 344  * accessed elements.
 345  *
 346  * In particular, the extent in the outer dimension is taken
 347  * from "accessed", while then extent in the remaing dimensions
 348  * are taken from array->extent.
 349  *
 350  * The extent in the outer dimension cannot be taken from array->extent
 351  * because that may be unbounded.  Furthermore, even if it is bounded,
 352  * it may be larger than the piece of the array that is being accessed.
 353  */
 354 static __isl_give isl_set *compute_extent(struct pet_array *array,
 355         __isl_keep isl_set *accessed)
 356 {
 357         int n_index;
 358         isl_id *id;
 359         isl_set *outer;
 360         isl_set *extent;
 361
 362         extent = isl_set_copy(array->extent);
 363
 364         n_index = isl_set_dim(accessed, isl_dim_set);
 365         if (n_index == 0)
 366                 return extent;
 367
 368         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 369         outer = isl_set_copy(accessed);
 370         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 371         extent = isl_set_flat_product(outer, extent);
 372         id = isl_set_get_tuple_id(accessed);
 373         extent = isl_set_set_tuple_id(extent, id);
 374
 375         return extent;
 376 }
 377
 378 /* Is the array "array" being extracted a read-only scalar?
 379  *
 380  * That is, is "array" a scalar that is never written to.
 381  */
 382 static int is_read_only_scalar(struct gpu_array_info *array,
 383         struct gpu_prog *prog)
 384 {
 385         isl_set *space;
 386         isl_union_map *write;
 387         int empty;
 388
 389         if (array->n_index != 0)
 390                 return 0;
 391
 392         write = isl_union_map_copy(prog->write);
 393         space = isl_set_universe(isl_space_copy(array->space));
 394         write = isl_union_map_intersect_range(write,
 395                                                 isl_union_set_from_set(space));
 396         empty = isl_union_map_is_empty(write);
 397         isl_union_map_free(write);
 398
 399         return empty;
 400 }
 401
 402 /* Compute bounds on the host arrays based on the accessed elements
 403  * and collect all references to the array.
 404  *
 405  * If the array is zero-dimensional, i.e., a scalar, we check
 406  * whether it is read-only.
 407  */
 408 static int extract_array_info(__isl_take isl_set *array, void *user)
 409 {
 410         int i;
 411         struct gpu_prog *prog = (struct gpu_prog *)user;
 412         const char *name;
 413         int n_index;
 414         isl_pw_aff **bounds;
 415         struct pet_array *pa;
 416         struct gpu_array_info *info;
 417         isl_set *extent;
 418
 419         info = &prog->array[prog->n_array];
 420         prog->n_array++;
 421
 422         n_index = isl_set_dim(array, isl_dim_set);
 423         name = isl_set_get_tuple_name(array);
 424         bounds = isl_alloc_array(isl_set_get_ctx(array),
 425                                  isl_pw_aff *, n_index);
 426         if (!bounds)
 427                 goto error;
 428
 429         info->space = isl_set_get_space(array);
 430         info->name = strdup(name);
 431         info->n_index = n_index;
 432         info->bound = bounds;
 433
 434         pa = find_array(prog->scop, array);
 435         if (!pa)
 436                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 437                         "unable to find array in scop", goto error);
 438
 439         info->type = strdup(pa->element_type);
 440         info->size = pa->element_size;
 441         info->local = pa->declared && !pa->exposed;
 442         info->read_only_scalar = is_read_only_scalar(info, prog);
 443
 444         extent = compute_extent(pa, array);
 445         for (i = 0; i < n_index; ++i) {
 446                 isl_set *dom;
 447                 isl_local_space *ls;
 448                 isl_aff *one;
 449                 isl_pw_aff *bound;
 450
 451                 bound = isl_set_dim_max(isl_set_copy(extent), i);
 452                 assert(bound);
 453                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 454                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 455                 one = isl_aff_zero_on_domain(ls);
 456                 one = isl_aff_add_constant_si(one, 1);
 457                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 458                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 459
 460                 bounds[i] = bound;
 461         }
 462         info->extent = extent;
 463
 464         collect_references(prog, info);
 465
 466         isl_set_free(array);
 467         return 0;
 468 error:
 469         isl_set_free(array);
 470         return -1;
 471 }
 472
 473 /* Construct a gpu_array_info for each array accessed by "prog" and
 474  * collect them in prog->array.
 475  */
 476 static int collect_array_info(struct gpu_prog *prog)
 477 {
 478         int r;
 479         isl_union_set *arrays;
 480
 481         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 482         arrays = isl_union_set_union(arrays,
 483                         isl_union_map_range(isl_union_map_copy(prog->write)));
 484         arrays = isl_union_set_coalesce(arrays);
 485
 486         prog->n_array = isl_union_set_n_set(arrays);
 487         prog->array = isl_alloc_array(prog->ctx,
 488                                      struct gpu_array_info, prog->n_array);
 489         assert(prog->array);
 490         prog->n_array = 0;
 491         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 492         isl_union_set_free(arrays);
 493
 494         return r;
 495 }
 496
 497 static void free_array_info(struct gpu_prog *prog)
 498 {
 499         int i, j;
 500
 501         for (i = 0; i < prog->n_array; ++i) {
 502                 int n_index = prog->array[i].n_index;
 503                 free(prog->array[i].type);
 504                 free(prog->array[i].name);
 505                 for (j = 0; j < n_index; ++j)
 506                         isl_pw_aff_free(prog->array[i].bound[j]);
 507                 isl_space_free(prog->array[i].space);
 508                 isl_set_free(prog->array[i].extent);
 509                 free(prog->array[i].bound);
 510                 free(prog->array[i].refs);
 511         }
 512         free(prog->array);
 513 }
 514
 515 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 516  * as an array or through a pointer reference, but as single data element.  At
 517  * the moment, scalars are represented as zero dimensional arrays.
 518  */
 519 int gpu_array_is_scalar(struct gpu_array_info *array)
 520 {
 521         return (array->n_index == 0);
 522 }
 523
 524 /* Is "array" a read-only scalar?
 525  */
 526 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 527 {
 528         return array->read_only_scalar;
 529 }
 530
 531 /* Internal data structure for extract_size_of_type.
 532  * "type" specifies the name of the space that we want to extract.
 533  * "res" is used to store the subset of that space.
 534  */
 535 struct ppcg_extract_size_data {
 536         const char *type;
 537         isl_set *res;
 538 };
 539
 540 /* This function is called for each set in a union_set.
 541  * If the name of the set matches data->type, we store the
 542  * set in data->res.
 543  */
 544 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 545 {
 546         struct ppcg_extract_size_data *data = user;
 547         const char *name;
 548
 549         name = isl_set_get_tuple_name(size);
 550         if (name && !strcmp(name, data->type)) {
 551                 data->res = size;
 552                 return -1;
 553         }
 554
 555         isl_set_free(size);
 556         return 0;
 557 }
 558
 559 /* Given a union map { kernel[i] -> *[...] },
 560  * return the range in the space called "type" for the kernel with
 561  * sequence number "id".
 562  */
 563 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 564         const char *type, int id)
 565 {
 566         isl_space *space;
 567         isl_set *dom;
 568         isl_union_set *local_sizes;
 569         struct ppcg_extract_size_data data = { type, NULL };
 570
 571         if (!sizes)
 572                 return NULL;
 573
 574         space = isl_union_map_get_space(sizes);
 575         space = isl_space_set_from_params(space);
 576         space = isl_space_add_dims(space, isl_dim_set, 1);
 577         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 578         dom = isl_set_universe(space);
 579         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 580
 581         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 582                                         isl_union_map_copy(sizes));
 583         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 584         isl_union_set_free(local_sizes);
 585         return data.res;
 586 }
 587
 588 /* Given a singleton set, extract the first (at most *len) elements
 589  * of the single integer tuple into *sizes and update *len if needed.
 590  */
 591 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 592 {
 593         int i;
 594         int dim;
 595
 596         if (!set)
 597                 return;
 598
 599         dim = isl_set_dim(set, isl_dim_set);
 600         if (dim < *len)
 601                 *len = dim;
 602
 603         for (i = 0; i < *len; ++i) {
 604                 isl_val *v;
 605
 606                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 607                 assert(v);
 608
 609                 sizes[i] = isl_val_get_num_si(v);
 610                 isl_val_free(v);
 611         }
 612
 613         isl_set_free(set);
 614 }
 615
 616 /* Extract user specified "tile" sizes from the "sizes" command line option,
 617  * defaulting to option->tile_size in each dimension.
 618  */
 619 static void read_tile_sizes(struct gpu_gen *gen)
 620 {
 621         int n;
 622         isl_set *size;
 623
 624         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 625         assert(gen->tile_size);
 626         for (n = 0; n < gen->tile_len; ++n)
 627                 gen->tile_size[n] = gen->options->tile_size;
 628
 629         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 630         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 631
 632         if (gen->n_parallel > gen->tile_len)
 633                 gen->n_parallel = gen->tile_len;
 634 }
 635
 636 /* Extract user specified "block" sizes from the "sizes" command line option,
 637  * after filling in some potentially useful defaults.
 638  */
 639 static void read_block_sizes(struct gpu_gen *gen)
 640 {
 641         int n;
 642         isl_set *size;
 643
 644         n = gen->n_parallel;
 645         gen->n_block = (n <= 3) ? n : 3;
 646         switch (gen->n_block) {
 647         case 1:
 648                 gen->block_dim[0] = 512;
 649                 break;
 650         case 2:
 651                 gen->block_dim[0] = 32;
 652                 gen->block_dim[1] = 16;
 653                 break;
 654         default:
 655                 gen->block_dim[0] = 32;
 656                 gen->block_dim[1] = 4;
 657                 gen->block_dim[2] = 4;
 658                 break;
 659         }
 660
 661         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 662         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 663 }
 664
 665 /* Extract user specified "grid" sizes from the "sizes" command line option,
 666  * after filling in some potentially useful defaults.
 667  */
 668 static void read_grid_sizes(struct gpu_gen *gen)
 669 {
 670         int n = gen->n_parallel;
 671         isl_set *size;
 672
 673         gen->n_grid = (n <= 2) ? n : 2;
 674         switch (gen->n_grid) {
 675         case 1:
 676                 gen->grid_dim[0] = 32768;
 677                 break;
 678         default:
 679                 gen->grid_dim[0] = 256;
 680                 gen->grid_dim[1] = 256;
 681                 break;
 682         }
 683
 684         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 685         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 686 }
 687
 688 /* Extract user specified sizes from the "sizes" command line option
 689  * after filling in some potentially useful defaults.
 690  */
 691 static void read_sizes(struct gpu_gen *gen)
 692 {
 693         read_tile_sizes(gen);
 694         read_block_sizes(gen);
 695         read_grid_sizes(gen);
 696 }
 697
 698 static void *free_stmts(struct gpu_stmt *stmts, int n)
 699 {
 700         int i;
 701
 702         if (!stmts)
 703                 return NULL;
 704
 705         for (i = 0; i < n; ++i) {
 706                 struct gpu_stmt_access *access, *next;
 707
 708                 for (access = stmts[i].accesses; access; access = next) {
 709                         next = access->next;
 710                         isl_id_free(access->ref_id);
 711                         isl_map_free(access->access);
 712                         free(access);
 713                 }
 714
 715                 isl_id_free(stmts[i].id);
 716         }
 717         free(stmts);
 718
 719         return NULL;
 720 }
 721
 722 /* Construct a map from a domain of dimensionality "len"
 723  * to a domain of dimensionality "len" + "tile_len" that tiles
 724  * the "tile_len" coordinates starting at "first".
 725  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 726  * "dim" prescribes the parameters.
 727  */
 728 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 729         int first, int tile_len, int *tile_size)
 730 {
 731         int i;
 732         isl_basic_map *bmap;
 733         isl_constraint *c;
 734         isl_local_space *ls;
 735
 736         dim = isl_space_add_dims(dim, isl_dim_in, len);
 737         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 738         bmap = isl_basic_map_universe(isl_space_copy(dim));
 739         ls = isl_local_space_from_space(dim);
 740
 741         for (i = 0; i < len - tile_len; ++i) {
 742                 int j = i < first ? i : i + tile_len;
 743                 int k = i < first ? i : i + 2 * tile_len;
 744
 745                 c = isl_equality_alloc(isl_local_space_copy(ls));
 746                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 747                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 748                 bmap = isl_basic_map_add_constraint(bmap, c);
 749         }
 750
 751         for (i = 0; i < tile_len; ++i) {
 752                 c = isl_equality_alloc(isl_local_space_copy(ls));
 753                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 754                                                 first + i, -1);
 755                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 756                                                 first + i, tile_size[i]);
 757                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 758                                                 first + i + tile_len, 1);
 759                 bmap = isl_basic_map_add_constraint(bmap, c);
 760
 761                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 762                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 763                                                    first + i + tile_len, 1);
 764                 bmap = isl_basic_map_add_constraint(bmap, c);
 765
 766                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 767                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 768                                                    first + i + tile_len, -1);
 769                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 770                 bmap = isl_basic_map_add_constraint(bmap, c);
 771         }
 772
 773         isl_local_space_free(ls);
 774
 775         return isl_map_from_basic_map(bmap);
 776 }
 777
 778 /* Construct a map from a domain of dimensionality "len"
 779  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 780  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 781  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 782  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 783  * that are projected out at the end.
 784  * "dim" prescribes the parameters.
 785  */
 786 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 787         int first, int wrap_len, int *wrap_size)
 788 {
 789         int i;
 790         isl_basic_map *bmap;
 791         isl_constraint *c;
 792         isl_local_space *ls;
 793
 794         dim = isl_space_add_dims(dim, isl_dim_in, len);
 795         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 796         bmap = isl_basic_map_universe(isl_space_copy(dim));
 797         ls = isl_local_space_from_space(dim);
 798
 799         for (i = 0; i < len; ++i) {
 800                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 801
 802                 c = isl_equality_alloc(isl_local_space_copy(ls));
 803                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 804                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 805                 bmap = isl_basic_map_add_constraint(bmap, c);
 806         }
 807
 808         for (i = 0; i < wrap_len; ++i) {
 809                 c = isl_equality_alloc(isl_local_space_copy(ls));
 810                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 811                                                     first + i, -1);
 812                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 813                                                     first + wrap_len + i, 1);
 814                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 815                                     first + 2 * wrap_len + i, wrap_size[i]);
 816                 bmap = isl_basic_map_add_constraint(bmap, c);
 817
 818                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 819                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 820                                                     first + wrap_len + i, 1);
 821                 bmap = isl_basic_map_add_constraint(bmap, c);
 822
 823                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 824                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 825                                                     first + wrap_len + i, -1);
 826                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 827                 bmap = isl_basic_map_add_constraint(bmap, c);
 828         }
 829
 830         isl_local_space_free(ls);
 831
 832         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 833                                 first + 2 * wrap_len, wrap_len);
 834
 835         return isl_map_from_basic_map(bmap);
 836 }
 837
 838 /* Add "n" parameters named prefix%d.
 839  */
 840 static __isl_give isl_set *add_params( __isl_take isl_set *set,
 841         int n, const char *prefix)
 842 {
 843         int i;
 844         unsigned nparam;
 845         char name[20];
 846
 847         nparam = isl_set_dim(set, isl_dim_param);
 848         set = isl_set_add_dims(set, isl_dim_param, n);
 849
 850         for (i = 0; i < n; ++i) {
 851                 snprintf(name, sizeof(name), "%s%d", prefix, i);
 852                 set = isl_set_set_dim_name(set, isl_dim_param,
 853                                             nparam + i, name);
 854         }
 855
 856         return set;
 857 }
 858
 859 /* Equate the "n" dimensions of "set" starting at "first" to
 860  * freshly created parameters named prefix%d.
 861  */
 862 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
 863         int first, int n, const char *prefix)
 864 {
 865         int i;
 866         unsigned nparam;
 867
 868         nparam = isl_set_dim(set, isl_dim_param);
 869
 870         set = add_params(set, n, prefix);
 871
 872         for (i = 0; i < n; ++i)
 873                 set = isl_set_equate(set, isl_dim_param, nparam + i,
 874                                         isl_dim_set, first + i);
 875
 876         return set;
 877 }
 878
 879 /* Given a parameter space "space", create a set of dimension "len"
 880  * of which the "n" dimensions starting at "first" are equated to
 881  * freshly created parameters named prefix%d.
 882  */
 883 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
 884         int len, int first, int n, const char *prefix)
 885 {
 886         isl_set *set;
 887
 888         space = isl_space_set_from_params(space);
 889         space = isl_space_add_dims(space, isl_dim_set, len);
 890         set = isl_set_universe(space);
 891
 892         return parametrize(set, first, n, prefix);
 893 }
 894
 895 /* Tile the B loops over the tile sizes and then tile/wrap
 896  * the T1 loops over the blocks.
 897  */
 898 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 899         __isl_take isl_union_map *sched)
 900 {
 901         isl_space *dim;
 902         isl_map *tiling, *block_tiling;
 903
 904         dim = isl_union_map_get_space(sched);
 905         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 906                       gen->tile_first, gen->tile_len, gen->tile_size);
 907
 908         if (gen->options->wrap)
 909                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
 910                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 911         else
 912                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
 913                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 914
 915         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
 916
 917         tiling = isl_map_apply_range(tiling, block_tiling);
 918
 919         sched = isl_union_map_apply_range(sched,
 920                                              isl_union_map_from_map(tiling));
 921
 922         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 923
 924         return sched;
 925 }
 926
 927 /* Equate the "T1P" iterators in the tiled schedule "sched"
 928  * to the block dimensions.
 929  */
 930 static __isl_give isl_union_map *parametrize_tiled_schedule(
 931         struct gpu_gen *gen, __isl_take isl_union_map *sched)
 932 {
 933         isl_space *dim;
 934         isl_set *par;
 935
 936         dim = isl_union_map_get_space(sched);
 937         par = parametrization(dim, gen->tiled_len,
 938                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
 939         sched = isl_union_map_intersect_range(sched,
 940                                                 isl_union_set_from_set(par));
 941
 942         return sched;
 943 }
 944
 945 /* Tile/wrap the P1 loops over the threads.
 946  */
 947 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
 948         __isl_take isl_union_map *sched)
 949 {
 950         isl_space *dim;
 951         isl_map *tiling;
 952         isl_set *par;
 953
 954         dim = isl_union_map_get_space(sched);
 955
 956         if (gen->options->wrap)
 957                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
 958                                 gen->shared_len, gen->n_block, gen->block_dim);
 959         else
 960                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
 961                                 gen->shared_len, gen->n_block, gen->block_dim);
 962         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
 963
 964         sched = isl_union_map_apply_range(sched,
 965                                              isl_union_map_from_map(tiling));
 966
 967         par = parametrization(dim, gen->thread_tiled_len,
 968                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
 969                 gen->n_block, "t");
 970         sched = isl_union_map_intersect_range(sched,
 971                                                 isl_union_set_from_set(par));
 972
 973         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 974
 975         return sched;
 976 }
 977
 978 /* If the user asked for it, scale the shared memory tile loops
 979  * (T1T and T2) of "sched" by gen->tile_size[i].
 980  * If we are not performing "wrapping", then additionally scale the T1P
 981  * loops by gen->grid_dim[i].
 982  */
 983 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
 984         __isl_take isl_union_map *sched)
 985 {
 986         int i;
 987         isl_space *dim;
 988         isl_basic_map *scale;
 989         isl_constraint *c;
 990         isl_local_space *ls;
 991
 992         if (!gen->options->scale_tile_loops)
 993                 return sched;
 994
 995         dim = isl_union_map_get_space(sched);
 996         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
 997         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
 998         scale = isl_basic_map_universe(isl_space_copy(dim));
 999         ls = isl_local_space_from_space(dim);
1000
1001         for (i = 0; i < gen->tiled_len; ++i) {
1002                 int f = 1;
1003
1004                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1005                         f = gen->tile_size[i - gen->tile_first];
1006                         if (!gen->options->wrap)
1007                                 f *= gen->grid_dim[i - gen->tile_first];
1008                 } else if (i >= gen->tile_first + gen->n_grid &&
1009                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1010                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1011                 }
1012
1013                 c = isl_equality_alloc(isl_local_space_copy(ls));
1014                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1015                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1016                 scale = isl_basic_map_add_constraint(scale, c);
1017         }
1018
1019         isl_local_space_free(ls);
1020
1021         sched = isl_union_map_apply_range(sched,
1022                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1023
1024         return sched;
1025 }
1026
1027 /* If we are not performing "wrapping" and if the user asked for it,
1028  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1029  */
1030 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1031         __isl_take isl_union_map *sched)
1032 {
1033         int i;
1034         isl_space *dim;
1035         isl_basic_map *scale;
1036         isl_constraint *c;
1037         isl_local_space *ls;
1038
1039         if (gen->options->wrap)
1040                 return sched;
1041         if (!gen->options->scale_tile_loops)
1042                 return sched;
1043
1044         dim = isl_union_map_get_space(sched);
1045         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1046         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1047         scale = isl_basic_map_universe(isl_space_copy(dim));
1048         ls = isl_local_space_from_space(dim);
1049
1050         for (i = 0; i < gen->thread_tiled_len; ++i) {
1051                 int f = 1;
1052
1053                 if (i >= gen->shared_len &&
1054                     i < gen->shared_len + gen->n_block)
1055                         f = gen->block_dim[i - gen->shared_len];
1056
1057                 c = isl_equality_alloc(isl_local_space_copy(ls));
1058                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1059                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1060                 scale = isl_basic_map_add_constraint(scale, c);
1061         }
1062
1063         isl_local_space_free(ls);
1064
1065         sched = isl_union_map_apply_range(sched,
1066                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1067
1068         return sched;
1069 }
1070
1071 /* If we are not performing "wrapping" and if the user asked for it,
1072  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1073  */
1074 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1075         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1076 {
1077         int i;
1078         isl_space *dim;
1079         isl_basic_map *scale;
1080         isl_constraint *c;
1081         isl_local_space *ls;
1082
1083         if (gen->options->wrap)
1084                 return sched;
1085         if (!gen->options->scale_tile_loops)
1086                 return sched;
1087
1088         dim = isl_union_map_get_space(sched);
1089         dim = isl_space_add_dims(dim, isl_dim_in, len);
1090         dim = isl_space_add_dims(dim, isl_dim_out, len);
1091         scale = isl_basic_map_universe(isl_space_copy(dim));
1092         ls = isl_local_space_from_space(dim);
1093
1094         for (i = 0; i < len; ++i) {
1095                 int f = 1;
1096
1097                 if (i >= first && i < first + n_tile)
1098                         f = gen->kernel->block_dim[i - first];
1099
1100                 c = isl_equality_alloc(isl_local_space_copy(ls));
1101                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1102                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1103                 scale = isl_basic_map_add_constraint(scale, c);
1104         }
1105
1106         isl_local_space_free(ls);
1107
1108         sched = isl_union_map_apply_range(sched,
1109                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1110
1111         return sched;
1112 }
1113
1114 /* Add "len" parameters p[i] called prefix%d,
1115  * with bounds to 0 <= p[i] < size[i].
1116  */
1117 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1118         int len, int *size, const char *prefix)
1119 {
1120         int i;
1121         unsigned nparam;
1122         isl_space *dim;
1123         isl_basic_set *bset;
1124         isl_constraint *c;
1125         isl_local_space *ls;
1126         char name[20];
1127
1128         nparam = isl_set_dim(set, isl_dim_param);
1129         set = isl_set_add_dims(set, isl_dim_param, len);
1130
1131         for (i = 0; i < len; ++i) {
1132                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1133                 set = isl_set_set_dim_name(set, isl_dim_param,
1134                                             nparam + i, name);
1135         }
1136
1137         dim = isl_set_get_space(set);
1138         bset = isl_basic_set_universe(isl_space_copy(dim));
1139         ls = isl_local_space_from_space(dim);
1140
1141         for (i = 0; i < len; ++i) {
1142                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1143                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1144                                                         nparam + i, 1);
1145                 bset = isl_basic_set_add_constraint(bset, c);
1146
1147                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1148                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1149                                                         nparam + i, -1);
1150                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1151                 bset = isl_basic_set_add_constraint(bset, c);
1152         }
1153
1154         isl_local_space_free(ls);
1155
1156         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1157 }
1158
1159 /* Add "len" parameters p[i] called prefix%d,
1160  * with bounds to 0 <= p[i] < size[i].
1161  */
1162 static __isl_give isl_set *add_bounded_parameters_dynamic(
1163         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1164         const char *prefix)
1165 {
1166         int i, len;
1167         unsigned nparam;
1168         isl_space *space;
1169         isl_local_space *ls;
1170         char name[20];
1171
1172         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1173         nparam = isl_set_dim(set, isl_dim_param);
1174         set = isl_set_add_dims(set, isl_dim_param, len);
1175
1176         for (i = 0; i < len; ++i) {
1177                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1178                 set = isl_set_set_dim_name(set, isl_dim_param,
1179                                             nparam + i, name);
1180         }
1181
1182         space = isl_space_params(isl_set_get_space(set));
1183         ls = isl_local_space_from_space(space);
1184         for (i = 0; i < len; ++i) {
1185                 isl_pw_aff *param, *size_i, *zero;
1186                 isl_set *bound;
1187
1188                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1189                                                 isl_dim_param, nparam + i);
1190
1191                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1192                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1193                 set = isl_set_intersect_params(set, bound);
1194
1195                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1196                 bound = isl_pw_aff_ge_set(param, zero);
1197                 set = isl_set_intersect_params(set, bound);
1198         }
1199         isl_local_space_free(ls);
1200
1201         return set;
1202 }
1203
1204 /* Construct a map from an access to group->array to the corresponding
1205  * shared/private memory tile.
1206  * The map is of the form
1207  *
1208  *      { [D[i] -> A[a]] -> T[t] }
1209  *
1210  * where D represents the initial shared_len dimensions
1211  * of the computed schedule.
1212  */
1213 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1214 {
1215         struct gpu_array_tile *tile;
1216         isl_multi_aff *tiling;
1217
1218         tile = group->private_tile;
1219         if (!tile)
1220                 tile = group->shared_tile;
1221
1222         tiling = isl_multi_aff_copy(tile->tiling);
1223
1224         return isl_map_from_multi_aff(tiling);
1225 }
1226
1227 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1228  */
1229 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1230         unsigned pos)
1231 {
1232         isl_val *v;
1233         int fixed;
1234
1235         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1236         if (!v)
1237                 return -1;
1238         fixed = isl_val_is_int(v);
1239         isl_val_free(v);
1240
1241         return fixed;
1242 }
1243
1244 /* Given a schedule that iterates over all elements in a piece of an array,
1245  * perform tiling/wrapping over the threads.
1246  *
1247  * In particular, we tile the final iterators so that the final thread
1248  * dimension runs over the final array dimension.
1249  * However, if those final iterators have only a single iteration,
1250  * we try to tile earlier iterators instead.
1251  */
1252 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1253         __isl_take isl_map *sched)
1254 {
1255         isl_space *dim;
1256         isl_union_map *usched;
1257         isl_map *tiling;
1258         isl_set *par;
1259         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1260         int n_tile;
1261         int first;
1262
1263         n_tile = gen->kernel->n_block;
1264         if (n_tile > nvar) {
1265                 int i;
1266                 sched = isl_map_insert_dims(sched,
1267                                                 isl_dim_out, 0, n_tile - nvar);
1268                 for (i = 0; i < n_tile - nvar; ++i)
1269                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1270                 nvar = n_tile;
1271         }
1272
1273         first = nvar - n_tile;
1274
1275         for (; first > 0; first --)
1276                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1277                         break;
1278
1279         dim = isl_map_get_space(sched);
1280         dim = isl_space_params(dim);
1281         if (gen->options->wrap)
1282                 tiling = wrap(isl_space_copy(dim), nvar, first,
1283                                 n_tile, gen->kernel->block_dim);
1284         else
1285                 tiling = tile(isl_space_copy(dim), nvar, first,
1286                                 n_tile, gen->kernel->block_dim);
1287         sched = isl_map_apply_range(sched, tiling);
1288
1289         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1290         sched = isl_map_intersect_range(sched, par);
1291
1292         usched = isl_union_map_from_map(sched);
1293         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1294                                          first, n_tile);
1295         sched = isl_map_from_union_map(usched);
1296
1297         return sched;
1298 }
1299
1300 /* Return the union of all read (read = 1) and/or write (write = 1)
1301  * access relations in the group.
1302  */
1303 static __isl_give isl_union_map *group_access_relation(
1304         struct gpu_array_ref_group *group, int read, int write)
1305 {
1306         int i;
1307         isl_union_map *access;
1308
1309         access = isl_union_map_empty(isl_map_get_space(group->access));
1310         for (i = 0; i < group->n_ref; ++i) {
1311                 isl_map *map_i;
1312
1313                 if (!((read && group->refs[i]->read) ||
1314                      (write && group->refs[i]->write)))
1315                         continue;
1316                 map_i = isl_map_copy(group->refs[i]->access);
1317                 access = isl_union_map_union(access,
1318                                             isl_union_map_from_map(map_i));
1319         }
1320
1321         return access;
1322 }
1323
1324 /* Return the extent of "array", recomputed from the bounds.
1325  * The recomputed extent may be simpler than the original extent.
1326  */
1327 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1328 {
1329         int i;
1330         isl_id *id;
1331         isl_space *space;
1332         isl_local_space *ls;
1333         isl_set *extent;
1334
1335         id = isl_set_get_tuple_id(array->extent);
1336         space = isl_set_get_space(array->extent);
1337         extent = isl_set_universe(isl_space_copy(space));
1338         ls = isl_local_space_from_space(space);
1339         for (i = 0; i < array->n_index; ++i) {
1340                 isl_pw_aff *bound;
1341                 isl_aff *aff;
1342                 isl_pw_aff *index;
1343                 isl_set *lt;
1344
1345                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1346
1347                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1348                                                 isl_dim_set, i);
1349                 index = isl_pw_aff_from_aff(aff);
1350                 bound = isl_pw_aff_copy(array->bound[i]);
1351                 bound = isl_pw_aff_from_range(bound);
1352                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1353                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1354                                                 isl_id_copy(id));
1355                 lt = isl_pw_aff_lt_set(index, bound);
1356                 extent = isl_set_intersect(extent, lt);
1357         }
1358         isl_local_space_free(ls);
1359         isl_id_free(id);
1360
1361         return extent;
1362 }
1363
1364 /* Return a map from the first shared_len dimensions of the computed
1365  * schedule to the array tile in
1366  * global memory that corresponds to the shared memory copy.
1367  *
1368  * In particular, return a map
1369  *
1370  *      { D[i] -> A[a] }
1371  *
1372  * with constraints
1373  *
1374  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1375  *
1376  * and
1377  *
1378  *      0 <= a <= array_size - 1                                        (2)
1379  *
1380  * Note that if some stride has been detected (i.e., when
1381  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1382  * to the shifted and scaled down version.
1383  *
1384  * Constraints (1) are obtained by mapping the size constraints on the
1385  * shared/private memory tile back to the access relation.
1386  * Constraints (2) are obtained from the (recomputed) extent.
1387  */
1388 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1389 {
1390         int i;
1391         int n_index = group->array->n_index;
1392         isl_map *tile;
1393         isl_space *space;
1394         isl_set *local;
1395         isl_set *extent;
1396
1397         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1398         space = isl_space_range(space);
1399         local = isl_set_universe(space);
1400         for (i = 0; i < n_index; ++i) {
1401                 isl_val *bound;
1402
1403                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1404                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1405                 bound = isl_val_sub_ui(bound, 1);
1406                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1407         }
1408         local = isl_set_preimage_multi_aff(local,
1409                                 isl_multi_aff_copy(group->shared_tile->tiling));
1410         tile = isl_set_unwrap(local);
1411         extent = array_extent(group->array);
1412         tile = isl_map_intersect_range(tile, extent);
1413
1414         return tile;
1415 }
1416
1417 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1418  * return the corresponding mapping from the AST schedule to
1419  * to the first shared_len dimensions of the schedule computed by PPCG.
1420  */
1421 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1422         __isl_take isl_pw_multi_aff *iterator_map)
1423 {
1424         isl_union_map *umap;
1425         isl_space *space;
1426         isl_map *map, *sched;;
1427
1428         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1429         space = isl_space_from_domain(space);
1430         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1431
1432         umap = isl_union_map_copy(gen->shared_sched);
1433         umap = isl_union_map_apply_range(umap,
1434                         isl_union_map_copy(gen->shared_proj));
1435         map = isl_union_map_extract_map(umap, space);
1436         isl_union_map_free(umap);
1437
1438         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1439         sched = isl_map_detect_equalities(sched);
1440
1441         return isl_pw_multi_aff_from_map(sched);
1442 }
1443
1444 /* Set unroll[j] if the input dimension j is involved in
1445  * the index expression represented by ma.
1446  */
1447 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1448         void *user)
1449 {
1450         int i, j;
1451         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1452         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1453         int *unroll = user;
1454
1455         for (i = 0; i < n_out; ++i) {
1456                 isl_aff *aff;
1457
1458                 aff = isl_multi_aff_get_aff(ma, i);
1459                 for (j = 0; j < n_in; ++j)
1460                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1461                                 unroll[j] = 1;
1462                 isl_aff_free(aff);
1463         }
1464
1465         isl_set_free(set);
1466         isl_multi_aff_free(ma);
1467         return 0;
1468 }
1469
1470 /* Given an array pos mapping input dimensions to the corresponding
1471  * output dimension, construct the corresponding map.
1472  */
1473 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1474         int *pos, int len)
1475 {
1476         int i;
1477         isl_constraint *c;
1478         isl_basic_map *bmap;
1479         isl_local_space *ls;
1480
1481         dim = isl_space_add_dims(dim, isl_dim_in, len);
1482         dim = isl_space_add_dims(dim, isl_dim_out, len);
1483         bmap = isl_basic_map_universe(isl_space_copy(dim));
1484         ls = isl_local_space_from_space(dim);
1485
1486         for (i = 0; i < len; ++i) {
1487                 c = isl_equality_alloc(isl_local_space_copy(ls));
1488                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1489                                                       -1);
1490                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1491                                                       1);
1492                 bmap = isl_basic_map_add_constraint(bmap, c);
1493         }
1494         isl_local_space_free(ls);
1495
1496         return isl_map_from_basic_map(bmap);
1497 }
1498
1499 /* Find all loops involved in any of the index expressions for any of
1500  * the private accesses, move them innermost and then mark them as
1501  * requiring unrolling by setting gen->first_unroll.
1502  * The loops involved should all be parallel because of the checks
1503  * we performed in check_private_group_access.  Moving them innermost
1504  * is therefore a valid transformation.
1505  *
1506  * Loops up to gen->shared_len are generated before the mapping to
1507  * threads is applied.  They should therefore be ignored.
1508  *
1509  * We compute the hidden equalities of the schedule first
1510  * since we will need them in our calls to isl_pw_multi_aff_from_map
1511  * and because we want to make sure that the same equalities
1512  * are also available to the code generator.
1513  */
1514 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1515         __isl_take isl_union_map *sched)
1516 {
1517         int i, j;
1518         int unroll[gen->thread_tiled_len];
1519         int perm[gen->thread_tiled_len];
1520         isl_space *dim;
1521         isl_map *permute;
1522         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1523
1524         gen->first_unroll = -1;
1525
1526         sched = isl_union_map_detect_equalities(sched);
1527         for (i = 0; i < gen->thread_tiled_len; ++i)
1528                 unroll[i] = 0;
1529         for (i = 0; i < gen->prog->n_array; ++i) {
1530                 struct gpu_array_info *array = &gen->prog->array[i];
1531
1532                 for (j = 0; j < array->n_group; ++j) {
1533                         isl_union_map *access;
1534                         isl_map *acc;
1535                         isl_pw_multi_aff *pma;
1536
1537                         if (!array->groups[j]->private_tile)
1538                                 continue;
1539
1540                         access = group_access_relation(array->groups[j], 1, 1);
1541                         access = isl_union_map_apply_domain(access,
1542                                                 isl_union_map_copy(sched));
1543
1544                         acc = isl_map_from_union_map(access);
1545                         pma = isl_pw_multi_aff_from_map(acc);
1546                         isl_pw_multi_aff_foreach_piece(pma,
1547                                                         &check_unroll, unroll);
1548
1549                         isl_pw_multi_aff_free(pma);
1550                 }
1551         }
1552
1553         for (i = gen->shared_len; i < len; ++i)
1554                 if (unroll[i])
1555                         break;
1556
1557         if (i >= len)
1558                 return sched;
1559
1560         for (i = len; i < gen->thread_tiled_len; ++i)
1561                 if (unroll[i])
1562                         return sched;
1563
1564         j = 0;
1565         for (i = 0; i < gen->shared_len; ++i)
1566                 perm[i] = j++;
1567         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1568                 if (!unroll[i])
1569                         perm[i] = j++;
1570         gen->first_unroll = j - gen->shared_len;
1571         for (i = gen->shared_len; i < len; ++i)
1572                 if (unroll[i])
1573                         perm[i] = j++;
1574
1575         dim = isl_union_map_get_space(sched);
1576         permute = permutation(dim, perm, gen->thread_tiled_len);
1577         sched = isl_union_map_apply_range(sched,
1578                                           isl_union_map_from_map(permute));
1579
1580         return sched;
1581 }
1582
1583 /* Given a constraint
1584  *
1585  *              a(p,i) + j = g f(e)
1586  *
1587  * or -a(p,i) - j = g f(e) if sign < 0,
1588  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1589  * a(p,i) is assumed to be an expression in only the parameters
1590  * and the input dimensions.
1591  */
1592 static void extract_stride(__isl_keep isl_constraint *c,
1593         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1594 {
1595         int i;
1596         isl_val *v;
1597         isl_space *space;
1598         unsigned nparam;
1599         unsigned nvar;
1600         isl_aff *aff;
1601
1602         isl_val_free(bound->stride);
1603         bound->stride = isl_val_copy(stride);
1604
1605         space = isl_constraint_get_space(c);
1606         space = isl_space_domain(space);
1607
1608         nparam = isl_space_dim(space, isl_dim_param);
1609         nvar = isl_space_dim(space, isl_dim_set);
1610
1611         v = isl_constraint_get_constant_val(c);
1612         if (sign < 0)
1613                 v = isl_val_neg(v);
1614         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1615         aff = isl_aff_set_constant_val(aff, v);
1616
1617         for (i = 0; i < nparam; ++i) {
1618                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1619                         continue;
1620                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1621                 if (sign < 0)
1622                         v = isl_val_neg(v);
1623                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1624         }
1625
1626         for (i = 0; i < nvar; ++i) {
1627                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1628                         continue;
1629                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1630                 if (sign < 0)
1631                         v = isl_val_neg(v);
1632                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1633         }
1634
1635         bound->shift = aff;
1636 }
1637
1638 /* Given an equality constraint of a map with a single output dimension j,
1639  * check if the constraint is of the form
1640  *
1641  *              a(p,i) + j = g f(e)
1642  *
1643  * with a(p,i) an expression in the parameters and input dimensions
1644  * and f(e) an expression in the existentially quantified variables.
1645  * If so, and if g is larger than any such g from a previously considered
1646  * constraint, then call extract_stride to record the stride information
1647  * in bound.
1648  */
1649 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1650 {
1651         int i;
1652         isl_ctx *ctx;
1653         isl_val *v;
1654         unsigned n_div;
1655         struct gpu_array_bound *bound = user;
1656
1657         ctx = isl_constraint_get_ctx(c);
1658         n_div = isl_constraint_dim(c, isl_dim_div);
1659         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1660
1661         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1662                 int s = isl_val_sgn(v);
1663                 isl_val *stride = isl_val_zero(ctx);
1664
1665                 isl_val_free(v);
1666                 for (i = 0; i < n_div; ++i) {
1667                         v = isl_constraint_get_coefficient_val(c,
1668                                                                 isl_dim_div, i);
1669                         stride = isl_val_gcd(stride, v);
1670                 }
1671                 if (!isl_val_is_zero(stride) &&
1672                     isl_val_gt(stride, bound->stride))
1673                         extract_stride(c, bound, stride, s);
1674
1675                 isl_val_free(stride);
1676         } else
1677                 isl_val_free(v);
1678
1679         isl_constraint_free(c);
1680         return 0;
1681 }
1682
1683 /* Given contraints on an array index i, check if we can find
1684  * a shift a(p) and a stride g such that
1685  *
1686  *      a(p) + i = 0 mod g
1687  *
1688  * If so, record the information in bound and apply the mapping
1689  * i -> (i + a(p))/g to the array index in bounds and return
1690  * the new constraints.
1691  * If not, simply return the original constraints.
1692  *
1693  * If bounds is a subset of the space
1694  *
1695  *      D -> i
1696  *
1697  * then the bound recorded in bound->shift is of the form
1698  *
1699  *      D -> s(D)
1700  *
1701  * with s(D) equal to a(p) above.
1702  * The mapping recorded in bound->shift_map is of the form
1703  *
1704  *      [D -> i] -> [D -> (i + S(D))/g]
1705  *
1706  * This mapping is computed as follows.
1707  * We first introduce "i" in the domain through precomposition
1708  * with [D -> i] -> D obtaining
1709  *
1710  *      [D -> i] -> s(D)
1711  *
1712  * Adding [D -> i] -> i produces
1713  *
1714  *      [D -> i] -> i + s(D)
1715  *
1716  * and the domain product with [D -> i] -> D yields
1717  *
1718  *      [D -> i] -> [D -> i + s(D)]
1719  *
1720  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1721  */
1722 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1723         __isl_take isl_basic_map *bounds)
1724 {
1725         isl_space *space;
1726         isl_basic_map *hull;
1727         isl_basic_map *shift, *id, *bmap, *scale;
1728         isl_basic_set *bset;
1729         isl_aff *aff;
1730
1731         bound->stride = NULL;
1732
1733         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1734
1735         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1736
1737         isl_basic_map_free(hull);
1738
1739         if (!bound->stride)
1740                 return bounds;
1741
1742         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1743         space = isl_basic_map_get_space(bounds);
1744         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1745         shift = isl_basic_map_apply_range(bmap, shift);
1746         space = isl_basic_map_get_space(bounds);
1747         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1748         shift = isl_basic_map_sum(id, shift);
1749         space = isl_basic_map_get_space(bounds);
1750         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1751         shift = isl_basic_map_range_product(id, shift);
1752
1753         space = isl_space_domain(isl_basic_map_get_space(bounds));
1754         id = isl_basic_map_identity(isl_space_map_from_set(space));
1755         space = isl_space_range(isl_basic_map_get_space(bounds));
1756         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1757         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1758         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1759         scale = isl_basic_map_from_aff(aff);
1760         scale = isl_basic_map_product(id, scale);
1761
1762         bound->shift_map = isl_basic_map_apply_range(shift, scale);
1763         bmap = isl_basic_map_copy(bound->shift_map);
1764         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1765         bounds = isl_basic_set_unwrap(bset);
1766
1767         return bounds;
1768 }
1769
/* Data used in compute_array_dim_size and compute_size_in_direction.
 *
 * bset is the basic set whose constraints are inspected for candidate
 * lower bounds and over which the size expression is maximized.
 * bound is where the smallest size found so far and the corresponding
 * lower bound are stored.
 * pos is the position of the variable representing the array index,
 * i.e., the variable for which we want to compute the size.  This variable
 * is also the last variable in the set.
 */
struct gpu_size_info {
        isl_basic_set *bset;
        struct gpu_array_bound *bound;
        int pos;
};
1781
1782 /* Given a constraint from the basic set describing the bounds on
1783  * an array index, check if it is a lower bound, say m i >= b(x), and,
1784  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1785  * upper bound.  If so, and if this bound is smaller than any bound
1786  * derived from earlier constraints, set the size to this bound on
1787  * the expression and the lower bound to ceil(b(x)/m).
1788  */
1789 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1790 {
1791         struct gpu_size_info *size = user;
1792         unsigned nparam;
1793         unsigned n_div;
1794         isl_val *v;
1795         isl_aff *aff;
1796         isl_aff *lb;
1797
1798         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
1799         n_div = isl_constraint_dim(c, isl_dim_div);
1800
1801         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
1802             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
1803                 isl_constraint_free(c);
1804                 return 0;
1805         }
1806
1807         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
1808         aff = isl_aff_ceil(aff);
1809
1810         lb = isl_aff_copy(aff);
1811
1812         aff = isl_aff_neg(aff);
1813         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
1814
1815         v = isl_basic_set_max_val(size->bset, aff);
1816         isl_aff_free(aff);
1817
1818         if (isl_val_is_int(v)) {
1819                 v = isl_val_add_ui(v, 1);
1820                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
1821                         isl_val_free(size->bound->size);
1822                         size->bound->size = isl_val_copy(v);
1823                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
1824                         isl_aff_free(size->bound->lb);
1825                         size->bound->lb = isl_aff_copy(lb);
1826                 }
1827         }
1828         isl_val_free(v);
1829         isl_aff_free(lb);
1830
1831         isl_constraint_free(c);
1832
1833         return 0;
1834 }
1835
1836 /* Given a basic map "bounds" that maps parameters and input dimensions
1837  * to a single output dimension, look for an expression in the parameters
1838  * and input dimensions such that the range of the output dimension shifted
1839  * by this expression is a constant.
1840  *
1841  * In particular, we currently only consider lower bounds on the output
1842  * dimension as candidate expressions.
1843  */
1844 static int compute_array_dim_size(struct gpu_array_bound *bound,
1845         __isl_take isl_basic_map *bounds)
1846 {
1847         struct gpu_size_info size;
1848
1849         bounds = isl_basic_map_detect_equalities(bounds);
1850         bounds = check_stride(bound, bounds);
1851
1852         bound->size = NULL;
1853         bound->lb = NULL;
1854
1855         size.bound = bound;
1856         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
1857         size.bset = isl_basic_map_wrap(bounds);
1858         size.bset = isl_basic_set_flatten(size.bset);
1859         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
1860         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
1861                                         &size);
1862         isl_basic_set_free(size.bset);
1863
1864         return bound->size ? 0 : -1;
1865 }
1866
1867 /* Check if we can find a memory tile for the given array
1868  * based on the given accesses, and if so, put the results in "tile".
1869  *
1870  * We project the accesses on each index in turn and look for a parametric
1871  * offset such that the size is constant.
1872  */
1873 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
1874 {
1875         int i;
1876
1877         for (i = 0; i < tile->n; ++i) {
1878                 isl_map *access_i;
1879                 isl_basic_map *hull;
1880
1881                 access_i = isl_map_copy(access);
1882                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
1883                 access_i = isl_map_project_out(access_i, isl_dim_out,
1884                                             1, tile->n - (i + 1));
1885                 access_i = isl_map_compute_divs(access_i);
1886                 hull = isl_map_simple_hull(access_i);
1887                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
1888                         return 0;
1889         }
1890
1891         return 1;
1892 }
1893
1894 /* Construct a map with input the shared tile loops and the loops that
1895  * will be wrapped around the threads that relates these later loops
1896  * to the thread indices and then projects them out.
1897  */
1898 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1899 {
1900         isl_map *priv;
1901         isl_map *tiling;
1902         isl_map *proj;
1903         isl_set *par;
1904         isl_space *dim;
1905
1906         dim = isl_union_map_get_space(gen->shared_sched);
1907
1908         if (gen->options->wrap)
1909                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
1910                                 gen->shared_len, gen->n_block, gen->block_dim);
1911         else
1912                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
1913                                 gen->shared_len, gen->n_block, gen->block_dim);
1914
1915         priv = tiling;
1916
1917         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
1918                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1919                 gen->n_block, "t");
1920
1921         priv = isl_map_align_params(priv, isl_set_get_space(par));
1922         priv = isl_map_intersect_range(priv, par);
1923
1924         dim = isl_map_get_space(priv);
1925         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
1926         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
1927         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
1928                           gen->shared_len);
1929
1930         priv = isl_map_apply_range(priv, proj);
1931
1932         return priv;
1933 }
1934
1935 /* Construct a map from domain_dim to domain_dim that increments
1936  * the dimension at position "pos" and leaves all other dimensions
1937  * constant.
1938  */
1939 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
1940 {
1941         int i;
1942         int len = isl_space_dim(domain_dim, isl_dim_set);
1943         isl_space *dim;
1944         isl_basic_map *next;
1945         isl_local_space *ls;
1946
1947         dim = isl_space_map_from_set(domain_dim);
1948         next = isl_basic_map_universe(isl_space_copy(dim));
1949         ls = isl_local_space_from_space(dim);
1950
1951         for (i = 0; i < len; ++i) {
1952                 isl_constraint *c;
1953
1954                 c = isl_equality_alloc(isl_local_space_copy(ls));
1955                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
1956                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1957                 if (i == pos)
1958                         c = isl_constraint_set_constant_si(c, 1);
1959                 next = isl_basic_map_add_constraint(next, c);
1960         }
1961
1962         isl_local_space_free(ls);
1963
1964         return isl_map_from_basic_map(next);
1965 }
1966
1967 /* Check if the given access is coalesced.
1968  * That is, check whether incrementing the dimension that will get
1969  * wrapped over the last thread index results in incrementing
1970  * the last array index.
1971  *
1972  * This function is only called for access relations without reuse.
1973  */
1974 static int access_is_coalesced(struct gpu_gen *gen,
1975         __isl_keep isl_union_map *access)
1976 {
1977         isl_space *dim;
1978         isl_map *access_map;
1979         isl_map *next_thread_x;
1980         isl_map *next_element;
1981         isl_map *map;
1982         int coalesced;
1983
1984         access = isl_union_map_copy(access);
1985         access = isl_union_map_apply_domain(access,
1986                                 isl_union_map_copy(gen->tiled_sched));
1987         access_map = isl_map_from_union_map(access);
1988
1989         dim = isl_map_get_space(access_map);
1990         dim = isl_space_domain(dim);
1991         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
1992
1993         dim = isl_map_get_space(access_map);
1994         dim = isl_space_range(dim);
1995         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
1996
1997         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
1998         map = isl_map_apply_range(map, access_map);
1999
2000         coalesced = isl_map_is_subset(map, next_element);
2001
2002         isl_map_free(next_element);
2003         isl_map_free(map);
2004
2005         return coalesced;
2006 }
2007
2008 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2009  * dimensions of the computed schedule, check if it is bijective for
2010  * fixed values of the first gen->shared_len dimensions.
2011  * We perform this check by equating these dimensions to parameters.
2012  */
2013 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2014 {
2015         int res;
2016         isl_set *par;
2017         isl_space *space;
2018
2019         access = isl_map_copy(access);
2020         space = isl_space_params(isl_map_get_space(access));
2021         par = parametrization(space, gen->shared_len + gen->n_block,
2022                                 0, gen->shared_len, "s");
2023         access = isl_map_intersect_domain(access, par);
2024         res = isl_map_is_bijective(access);
2025         isl_map_free(access);
2026
2027         return res;
2028 }
2029
2030 /* Look for the last shared tile loop that affects the offset of "tile"
2031  * and return the result.
2032  * If there is no such loop, then return the index of the loop
2033  * before the first shared tile loop, in particular gen->tile_first - 1.
2034  */
2035 static int compute_tile_last_shared(struct gpu_gen *gen,
2036         struct gpu_array_tile *tile)
2037 {
2038         int i, j;
2039
2040         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2041                 for (i = 0; i < tile->n; ++i) {
2042                         isl_aff *lb;
2043                         isl_aff *shift;
2044
2045                         lb = tile->bound[i].lb;
2046                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2047                                 break;
2048
2049                         shift = tile->bound[i].shift;
2050                         if (!shift)
2051                                 continue;
2052                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2053                                 break;
2054                 }
2055                 if (i < tile->n)
2056                         break;
2057         }
2058
2059         return j;
2060 }
2061
2062 /* Look for the last shared tile loop that affects the offset of the
2063  * shared or private tile and store the result in group->last_shared.
2064  * If there is no such loop, then group->last_shared is set to a value
2065  * before the first shared tile loop, in particular gen->tile_first - 1.
2066  * If there is no tile defined on the array reference group,
2067  * then set group->last_shared to gen->shared_len - 1.
2068  */
2069 static void set_last_shared(struct gpu_gen *gen,
2070         struct gpu_array_ref_group *group)
2071 {
2072         struct gpu_array_tile *tile;
2073
2074         group->last_shared = gen->shared_len - 1;
2075
2076         tile = group->private_tile;
2077         if (!tile)
2078                 tile = group->shared_tile;
2079         if (!tile)
2080                 return;
2081
2082         group->last_shared = compute_tile_last_shared(gen, tile);
2083 }
2084
2085 /* Compute a privatized copy of all access relations from reference groups that
2086  * are mapped to private memory and store the result in gen->privatization.
2087  */
2088 static void compute_private_access(struct gpu_gen *gen)
2089 {
2090         int i, j;
2091         isl_union_map *private;
2092
2093         if (!gen->options->use_private_memory)
2094                 return;
2095
2096         private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
2097
2098         for (i = 0; i < gen->prog->n_array; ++i) {
2099                 struct gpu_array_info *array = &gen->prog->array[i];
2100
2101                 if (gpu_array_is_read_only_scalar(array))
2102                         continue;
2103
2104                 for (j = 0; j < array->n_group; ++j) {
2105                         if (!array->groups[j]->private_tile)
2106                                 continue;
2107
2108                         private = isl_union_map_union(private,
2109                                 group_access_relation(array->groups[j], 1, 1));
2110                 }
2111         }
2112
2113         if (isl_union_map_is_empty(private))
2114                 isl_union_map_free(private);
2115         else {
2116                 isl_union_map *priv;
2117
2118                 private = isl_union_map_apply_domain(private,
2119                                         isl_union_map_copy(gen->shared_sched));
2120                 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2121                 private = isl_union_map_apply_domain(private, priv);
2122                 gen->private_access = private;
2123         }
2124 }
2125
2126 /* Compute the size of the tile specified by "tile"
2127  * in number of elements and return the result.
2128  */
2129 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2130 {
2131         int i;
2132         isl_val *size;
2133
2134         size = isl_val_one(ctx);
2135
2136         for (i = 0; i < tile->n; ++i)
2137                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2138
2139         return size;
2140 }
2141
2142 /* If max_shared_memory is not set to infinity (-1), then make
2143  * sure that the total amount of shared memory required by the
2144  * array reference groups mapped to shared memory is no larger
2145  * than this maximum.
2146  *
2147  * We apply a greedy approach and discard (keep in global memory)
2148  * those groups that would result in a total memory size that
2149  * is larger than the maximum.
2150  */
2151 static void check_shared_memory_bound(struct gpu_gen *gen)
2152 {
2153         int i, j;
2154         isl_val *left, *size;
2155
2156         if (gen->options->max_shared_memory < 0)
2157                 return;
2158
2159         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2160
2161         for (i = 0; i < gen->prog->n_array; ++i) {
2162                 struct gpu_array_info *array = &gen->prog->array[i];
2163
2164                 for (j = 0; j < array->n_group; ++j) {
2165                         struct gpu_array_ref_group *group;
2166
2167                         group = array->groups[j];
2168                         if (group->private_tile)
2169                                 continue;
2170                         if (!group->shared_tile)
2171                                 continue;
2172
2173                         size = tile_size(gen->ctx, group->shared_tile);
2174                         size = isl_val_mul_ui(size, array->size);
2175
2176                         if (isl_val_le(size, left)) {
2177                                 left = isl_val_sub(left, size);
2178                                 continue;
2179                         }
2180                         isl_val_free(size);
2181
2182                         group->shared_tile = free_tile(group->shared_tile);
2183                 }
2184         }
2185
2186         isl_val_free(left);
2187 }
2188
2189 /* Given a description of an array tile "tile" and the "space"
2190  *
2191  *      { D -> A }
2192  *
2193  * where D represents the first shared_len schedule dimensions
2194  * and A represents the array, construct an isl_multi_aff
2195  *
2196  *      { [D[i] -> A[a]] -> A'[a'] }
2197  *
2198  * with A' a scaled down copy of A according to the shifts and strides
2199  * in "tile".  In particular,
2200  *
2201  *      a' = (a + shift(i))/stride
2202  *
2203  * "insert_array" represents
2204  *
2205  *      { [D -> A] -> D }
2206  *
2207  * and is used to insert A into the domain of functions that only
2208  * reference D.
2209  */
2210 static __isl_give isl_multi_aff *strided_tile(
2211         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2212         __isl_keep isl_multi_aff *insert_array)
2213 {
2214         int i;
2215         isl_ctx *ctx;
2216         isl_multi_aff *shift;
2217         isl_multi_val *stride;
2218         isl_space *space2;
2219         isl_local_space *ls;
2220         isl_multi_aff *tiling;
2221
2222         ctx = isl_space_get_ctx(space);
2223         space2 = isl_space_domain(isl_space_copy(space));
2224         ls = isl_local_space_from_space(space2);
2225         space2 = isl_space_range(isl_space_copy(space));
2226         stride = isl_multi_val_zero(space2);
2227         shift = isl_multi_aff_zero(isl_space_copy(space));
2228
2229         for (i = 0; i < tile->n; ++i) {
2230                 struct gpu_array_bound *bound = &tile->bound[i];
2231                 isl_val *stride_i;
2232                 isl_aff *shift_i;
2233
2234                 if (tile->bound[i].shift) {
2235                         stride_i = isl_val_copy(bound->stride);
2236                         shift_i = isl_aff_copy(bound->shift);
2237                 } else {
2238                         stride_i = isl_val_one(ctx);
2239                         shift_i = isl_aff_zero_on_domain(
2240                                         isl_local_space_copy(ls));
2241                 }
2242
2243                 stride = isl_multi_val_set_val(stride, i, stride_i);
2244                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2245         }
2246         isl_local_space_free(ls);
2247
2248         shift = isl_multi_aff_pullback_multi_aff(shift,
2249                                     isl_multi_aff_copy(insert_array));
2250
2251         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2252         tiling = isl_multi_aff_add(tiling, shift);
2253         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2254
2255         return tiling;
2256 }
2257
2258 /* Compute a tiling for the array reference group "group".
2259  *
2260  * The tiling is of the form
2261  *
2262  *      { [D[i] -> A[a]] -> T[t] }
2263  *
2264  * where D represents the first shared_len schedule dimensions,
2265  * A represents the global array and T represents the shared or
2266  * private memory tile.  The name of T is the name of the local
2267  * array.
2268  *
2269  * If there is any stride in the accesses, then the mapping is
2270  *
2271  *      t = (a + shift(i))/stride - lb(i)
2272  *
2273  * otherwise, it is simply
2274  *
2275  *      t = a - lb(i)
2276  */
2277 static void compute_group_tiling(struct gpu_array_ref_group *group)
2278 {
2279         int i;
2280         struct gpu_array_tile *tile;
2281         struct gpu_array_info *array = group->array;
2282         isl_space *space;
2283         isl_multi_aff *tiling, *lb, *insert_array;
2284         isl_printer *p;
2285         char *local_name;
2286
2287         tile = group->private_tile;
2288         if (!tile)
2289                 tile = group->shared_tile;
2290         if (!tile)
2291                 return;
2292
2293         space = isl_map_get_space(group->access);
2294         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2295
2296         for (i = 0; i < tile->n; ++i)
2297                 if (tile->bound[i].shift)
2298                         break;
2299
2300         if (i < tile->n)
2301                 tiling = strided_tile(tile, space, insert_array);
2302         else
2303                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2304
2305         lb = isl_multi_aff_zero(space);
2306         for (i = 0; i < tile->n; ++i) {
2307                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2308                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2309         }
2310         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2311
2312         tiling = isl_multi_aff_sub(tiling, lb);
2313
2314         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2315         p = print_array_name(p, group);
2316         local_name = isl_printer_get_str(p);
2317         isl_printer_free(p);
2318         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2319         free(local_name);
2320
2321         tile->tiling = tiling;
2322 }
2323
2324 /* Compute a tiling for all the array reference groups.
2325  */
2326 static void compute_group_tilings(struct gpu_gen *gen)
2327 {
2328         int i, j;
2329
2330         for (i = 0; i < gen->prog->n_array; ++i) {
2331                 struct gpu_array_info *array = &gen->prog->array[i];
2332
2333                 for (j = 0; j < array->n_group; ++j)
2334                         compute_group_tiling(array->groups[j]);
2335         }
2336 }
2337
2338 /* Fill up the groups array with singleton groups, i.e., one group
2339  * per reference, initializing the array, access, write, n_ref and refs fields.
2340  * In particular the access field is initialized to the scheduled
2341  * access relation of the array reference.
2342  *
2343  * Return the number of elements initialized, i.e., the number of
2344  * active references in the current kernel.
2345  */
2346 static int populate_array_references(struct gpu_array_info *array,
2347         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2348 {
2349         int i;
2350         int n;
2351         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2352
2353         n = 0;
2354         for (i = 0; i < array->n_ref; ++i) {
2355                 isl_union_map *umap;
2356                 isl_map *map;
2357                 struct gpu_array_ref_group *group;
2358                 struct gpu_stmt_access *access = array->refs[i];
2359
2360                 map = isl_map_copy(access->access);
2361                 umap = isl_union_map_from_map(map);
2362                 umap = isl_union_map_apply_domain(umap,
2363                                 isl_union_map_copy(sched));
2364
2365                 if (isl_union_map_is_empty(umap)) {
2366                         isl_union_map_free(umap);
2367                         continue;
2368                 }
2369
2370                 map = isl_map_from_union_map(umap);
2371                 map = isl_map_detect_equalities(map);
2372
2373                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2374                 assert(group);
2375                 group->array = array;
2376                 group->access = map;
2377                 group->write = access->write;
2378                 group->refs = &array->refs[i];
2379                 group->n_ref = 1;
2380
2381                 groups[n++] = group;
2382         }
2383
2384         return n;
2385 }
2386
2387 /* If group->n_ref == 1, then group->refs was set by
2388  * populate_array_references to point directly into
2389  * group->array->refs and should not be freed.
2390  * If group->n_ref > 1, then group->refs was set by join_groups
2391  * to point to a newly allocated array.
2392  */
2393 static void free_array_ref_group(struct gpu_array_ref_group *group)
2394 {
2395         if (!group)
2396                 return;
2397         free_tile(group->shared_tile);
2398         free_tile(group->private_tile);
2399         isl_map_free(group->access);
2400         if (group->n_ref > 1)
2401                 free(group->refs);
2402         free(group);
2403 }
2404
2405 /* Given a map where the input dimensions represent the tile loops,
2406  * eliminate the innermost of those that have a fixed value
2407  * until we reach one that does not (obviously) have a fixed value.
2408  */
2409 static __isl_give isl_map *eliminate_fixed_inner_loops(
2410         __isl_take isl_map *access)
2411 {
2412         int i, n;
2413
2414         n = isl_map_dim(access, isl_dim_in);
2415
2416         for (i = n - 1; i >= 0; --i) {
2417                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2418                         break;
2419                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2420         }
2421         return access;
2422 }
2423
2424 /* Check if the access relations of group1 and group2 overlap within
2425  * the innermost loop.  In particular, ignore any inner dimension
2426  * with a fixed value.
2427  * The copying to and from shared memory will be performed within
2428  * the innermost actual loop so we are only allowed to consider
2429  * the dimensions up to that innermost loop while checking whether
2430  * two access relations overlap.
2431  */
2432 static int accesses_overlap(struct gpu_array_ref_group *group1,
2433         struct gpu_array_ref_group *group2)
2434 {
2435         int empty;
2436         isl_map *access1, *access2;
2437
2438         access1 = isl_map_copy(group1->access);
2439         access1 = eliminate_fixed_inner_loops(access1);
2440         access2 = isl_map_copy(group2->access);
2441         access2 = eliminate_fixed_inner_loops(access2);
2442         access1 = isl_map_intersect(access1, access2);
2443         empty = isl_map_is_empty(access1);
2444         isl_map_free(access1);
2445
2446         return !empty;
2447 }
2448
2449 /* Combine the given two groups into a single group, containing
2450  * the references of both groups.
2451  */
2452 static struct gpu_array_ref_group *join_groups(
2453         struct gpu_array_ref_group *group1,
2454         struct gpu_array_ref_group *group2)
2455 {
2456         int i;
2457         isl_ctx *ctx;
2458         struct gpu_array_ref_group *group;
2459
2460         ctx = isl_map_get_ctx(group1->access);
2461         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2462         assert(group);
2463         group->array = group1->array;
2464         group->access = isl_map_union(isl_map_copy(group1->access),
2465                                         isl_map_copy(group2->access));
2466         group->write = group1->write || group2->write;
2467         group->n_ref = group1->n_ref + group2->n_ref;
2468         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2469                                         group->n_ref);
2470         assert(group->refs);
2471         for (i = 0; i < group1->n_ref; ++i)
2472                 group->refs[i] = group1->refs[i];
2473         for (i = 0; i < group2->n_ref; ++i)
2474                 group->refs[group1->n_ref + i] = group2->refs[i];
2475
2476         return group;
2477 }
2478
/* Combine the given two groups into a single group and free
 * the original two groups.  The combined group is returned.
 */
static struct gpu_array_ref_group *join_groups_and_free(
        struct gpu_array_ref_group *group1,
        struct gpu_array_ref_group *group2)
{
        struct gpu_array_ref_group *joined = join_groups(group1, group2);

        free_array_ref_group(group1);
        free_array_ref_group(group2);

        return joined;
}
2493
2494 /* Compute the private and/or shared memory tiles for the array
2495  * reference group "group" of array "array".
2496  *
2497  * If the array is a read-only scalar or if the user requested
2498  * not to use shared or private memory, then we do not need to do anything.
2499  *
2500  * We only try to compute a shared memory tile if there is any reuse
2501  * or if the access is not coalesced.
2502  *
2503  * For computing a private memory tile, we also require that there is
2504  * some reuse.  Moreover, we require that the access is private
2505  * to the thread.  That is, we check that any given array element
2506  * is only accessed by a single thread.
2507  * We compute an access relation that maps the shared tile loop iterators
2508  * and the shared point loop iterators that will be wrapped over the
2509  * threads to the array elements.
2510  * We actually check that those iterators that will be wrapped
2511  * partition the array space.  This check is stricter than necessary
2512  * since several iterations may be mapped onto the same thread
2513  * and then they could be allowed to access the same memory elements,
2514  * but our check does not allow this situation.
2515  *
2516  * We also check that the index expression only depends on parallel
2517  * loops.  That way, we can move those loops innermost and unroll them.
2518  * Again, we use a test that is stricter than necessary.
2519  * We actually check whether the index expression only depends
2520  * on the iterators that are wrapped over the threads.
2521  * These are necessarily parallel, but there may be more parallel loops.
2522  *
2523  * Combining the injectivity of the first test with the single-valuedness
2524  * of the second test, we simply test for bijectivity.
2525  *
2526  * If it turns out we can use registers, we compute the private memory
2527  * tile size using can_tile, after introducing a dependence
2528  * on the thread indices.
2529  */
2530 static void compute_group_bounds_core(struct gpu_gen *gen,
2531         struct gpu_array_ref_group *group)
2532 {
2533         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2534         isl_union_map *access;
2535         int n_index = group->array->n_index;
2536         int no_reuse;
2537         isl_map *acc;
2538         int use_shared = gen->options->use_shared_memory;
2539         int use_private = gen->options->use_private_memory;
2540
2541         if (!use_shared && !use_private)
2542                 return;
2543         if (gpu_array_is_read_only_scalar(group->array))
2544                 return;
2545
2546         access = group_access_relation(group, 1, 1);
2547         no_reuse = isl_union_map_is_injective(access);
2548
2549         if (use_shared && (!no_reuse || !access_is_coalesced(gen, access))) {
2550                 group->shared_tile = create_tile(ctx, group->array->n_index);
2551                 if (!can_tile(group->access, group->shared_tile))
2552                         group->shared_tile = free_tile(group->shared_tile);
2553         }
2554
2555         if (!use_private || no_reuse) {
2556                 isl_union_map_free(access);
2557                 return;
2558         }
2559
2560         access = isl_union_map_apply_domain(access,
2561                                         isl_union_map_copy(gen->shared_sched));
2562
2563         acc = isl_map_from_union_map(access);
2564
2565         if (!access_is_bijective(gen, acc)) {
2566                 isl_map_free(acc);
2567                 return;
2568         }
2569
2570         group->private_tile = create_tile(gen->ctx, n_index);
2571         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2572         if (!can_tile(acc, group->private_tile))
2573                 group->private_tile = free_tile(group->private_tile);
2574
2575         isl_map_free(acc);
2576 }
2577
/* Determine the shared and/or private memory tiles of "group"
 * (through compute_group_bounds_core) and then update
 * its last_shared information through set_last_shared.
 */
static void compute_group_bounds(struct gpu_gen *gen,
	struct gpu_array_ref_group *group)
{
	/* The tiles need to be in place before last_shared is set. */
	compute_group_bounds_core(gen, group);

	set_last_shared(gen, group);
}
2587
2588 /* If two groups have overlapping access relations (as determined by
2589  * the "overlap" function) and if one of them involves a write,
2590  * then merge the two groups into one.
2591  * If "compute_bounds" is set, then call compute_group_bounds
2592  * on the merged groups.
2593  *
2594  * Return the updated number of groups.
2595  */
2596 static int group_writes(struct gpu_gen *gen,
2597         int n, struct gpu_array_ref_group **groups,
2598         int (*overlap)(struct gpu_array_ref_group *group1,
2599                 struct gpu_array_ref_group *group2), int compute_bounds)
2600 {
2601         int i, j;
2602
2603         for (i = 0; i < n; ++i) {
2604                 for (j = n - 1; j > i; --j) {
2605                         if (!groups[i]->write && !groups[j]->write)
2606                                 continue;
2607
2608                         if (!overlap(groups[i], groups[j]))
2609                                 continue;
2610
2611                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2612                         if (compute_bounds)
2613                                 compute_group_bounds(gen, groups[i]);
2614                         if (j != n - 1)
2615                                 groups[j] = groups[n - 1];
2616                         n--;
2617                 }
2618         }
2619
2620         return n;
2621 }
2622
2623 /* If two groups have overlapping access relations (within the innermost
2624  * loop) and if one of them involves a write, then merge the two groups
2625  * into one.
2626  *
2627  * Return the updated number of groups.
2628  */
2629 static int group_overlapping_writes(struct gpu_gen *gen,
2630         int n, struct gpu_array_ref_group **groups)
2631 {
2632         return group_writes(gen, n, groups, &accesses_overlap, 0);
2633 }
2634
2635 /* Check if the access relations of group1 and group2 overlap within
2636  * the outermost min(group1->last_shared, group2->last_shared) loops.
2637  */
2638 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2639         struct gpu_array_ref_group *group2)
2640 {
2641         int last_shared;
2642         int dim;
2643         int empty;
2644         isl_map *map_i, *map_j, *map;
2645
2646         last_shared = group1->last_shared;
2647         if (group2->last_shared < last_shared)
2648                 last_shared = group2->last_shared;
2649         map_i = isl_map_copy(group1->access);
2650         dim = isl_map_dim(map_i, isl_dim_in);
2651         map_i = isl_map_eliminate(map_i, isl_dim_in,
2652                                 last_shared + 1, dim - (last_shared + 1));
2653         map_j = isl_map_copy(group2->access);
2654         map_j = isl_map_eliminate(map_j, isl_dim_in,
2655                                 last_shared + 1, dim - (last_shared + 1));
2656         map = isl_map_intersect(map_i, map_j);
2657         empty = isl_map_is_empty(map);
2658         isl_map_free(map);
2659
2660         return !empty;
2661 }
2662
2663 /* If two groups have overlapping access relations (within the outer
2664  * last_shared loops) and if one of them involves a write,
2665  * then merge the two groups into one.
2666  *
2667  * Return the updated number of groups.
2668  */
2669 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2670         struct gpu_array_ref_group **groups)
2671 {
2672         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2673 }
2674
2675 /* Is the size of the tile specified by "tile" smaller than the sum of
2676  * the sizes of the tiles specified by "tile1" and "tile2"?
2677  */
2678 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2679         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2680 {
2681         int smaller;
2682         isl_val *size, *size1, *size2;
2683
2684         size = tile_size(ctx, tile);
2685         size1 = tile_size(ctx, tile1);
2686         size2 = tile_size(ctx, tile2);
2687
2688         size = isl_val_sub(size, size1);
2689         size = isl_val_sub(size, size2);
2690         smaller = isl_val_is_neg(size);
2691
2692         isl_val_free(size);
2693
2694         return smaller;
2695 }
2696
2697 /* Given an initial grouping of array references and shared memory tiles
2698  * for each group that allows for a shared memory tile, merge two groups
2699  * if both have a shared memory tile, the merged group also has
2700  * a shared memory tile and the size of the tile for the merge group
2701  * is smaller than the sum of the tile sizes of the individual groups.
2702  *
2703  * If merging two groups decreases the "last_shared" dimension of
2704  * one or both of the two groups, then we need to check for overlapping
2705  * writes again.
2706  *
2707  * Return the number of groups after merging.
2708  */
2709 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2710         struct gpu_array_info *array, int n,
2711         struct gpu_array_ref_group **groups)
2712 {
2713         int i, j;
2714         int recompute_overlap = 0;
2715         isl_ctx *ctx = isl_space_get_ctx(array->space);
2716
2717         for (i = 0; i < n; ++i) {
2718                 if (!groups[i]->shared_tile)
2719                         continue;
2720                 for (j = n - 1; j > i; --j) {
2721                         isl_map *map;
2722                         int empty;
2723                         struct gpu_array_ref_group *group;
2724
2725                         if (!groups[j]->shared_tile)
2726                                 continue;
2727
2728                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
2729                                             isl_map_copy(groups[j]->access));
2730                         empty = isl_map_is_empty(map);
2731                         isl_map_free(map);
2732
2733                         if (empty)
2734                                 continue;
2735
2736                         group = join_groups(groups[i], groups[j]);
2737                         compute_group_bounds(gen, group);
2738                         if (!group->shared_tile ||
2739                             !smaller_tile(ctx, group->shared_tile,
2740                                         groups[i]->shared_tile,
2741                                         groups[j]->shared_tile)) {
2742                                 free_array_ref_group(group);
2743                                 continue;
2744                         }
2745
2746                         if (group->last_shared < groups[i]->last_shared ||
2747                             group->last_shared < groups[j]->last_shared)
2748                                 recompute_overlap = 1;
2749                         free_array_ref_group(groups[i]);
2750                         free_array_ref_group(groups[j]);
2751                         groups[i] = group;
2752                         if (j != n - 1)
2753                                 groups[j] = groups[n - 1];
2754                         n--;
2755                 }
2756         }
2757
2758         if (recompute_overlap)
2759                 n = group_last_shared_overlapping_writes(gen, n, groups);
2760         return n;
2761 }
2762
2763 /* Set array->n_group and array->groups to n and groups.
2764  *
2765  * Additionally, set the "nr" field of each group
2766  * and the "group" field of each reference in each group.
2767  */
2768 static void set_array_groups(struct gpu_array_info *array,
2769         int n, struct gpu_array_ref_group **groups)
2770 {
2771         int i, j;
2772
2773         array->n_group = n;
2774         array->groups = groups;
2775
2776         for (i = 0; i < n; ++i) {
2777                 groups[i]->nr = i;
2778
2779                 for (j = 0; j < groups[i]->n_ref; ++j)
2780                         groups[i]->refs[j]->group = i;
2781         }
2782 }
2783
2784 /* Group array references that should be considered together when
2785  * deciding whether to access them from private, shared or global memory.
2786  *
2787  * In particular, if two array references overlap and if one of them
2788  * is a write, then the two references are grouped together.
2789  * We first perform an initial grouping based only on the access relation.
2790  * After computing shared and private memory tiles, we check for
2791  * overlapping writes again, but this time taking into account
2792  * the "last_shared" property.
2793  *
2794  * Furthermore, if two groups admit a shared memory tile and if the
2795  * combination of the two also admits a shared memory tile, we merge
2796  * the two groups.
2797  */
2798 static void group_array_references(struct gpu_gen *gen,
2799         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
2800 {
2801         int i;
2802         int n;
2803         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2804         struct gpu_array_ref_group **groups;
2805
2806         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
2807                                         array->n_ref);
2808         assert(groups);
2809
2810         n = populate_array_references(array, sched, groups);
2811
2812         n = group_overlapping_writes(gen, n, groups);
2813
2814         for (i = 0; i < n; ++i)
2815                 compute_group_bounds(gen, groups[i]);
2816
2817         n = group_last_shared_overlapping_writes(gen, n, groups);
2818
2819         n = group_common_shared_memory_tile(gen, array, n, groups);
2820
2821         set_array_groups(array, n, groups);
2822 }
2823
2824 /* Take tiled_sched, project it onto the shared tile loops and
2825  * the loops that will be wrapped over the threads and
2826  * store the result in gen->shared_sched.
2827  * Also compute a projection that projects out the loops that will be
2828  * wrapped over the threads and store this projection in gen->shared_proj.
2829  */
2830 static void compute_shared_sched(struct gpu_gen *gen)
2831 {
2832         isl_space *dim;
2833         isl_map *proj;
2834         isl_set *par;
2835         isl_union_map *sched;
2836
2837         sched = isl_union_map_copy(gen->tiled_sched);
2838
2839         dim = isl_union_map_get_space(sched);
2840         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
2841         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2842
2843         dim = isl_union_map_get_space(sched);
2844         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
2845
2846         gen->shared_sched = sched;
2847         gen->shared_proj = isl_union_map_from_map(proj);
2848 }
2849
2850 /* Group references of all arrays in the program.
2851  */
2852 static void group_references(struct gpu_gen *gen)
2853 {
2854         int i;
2855         isl_union_map *sched;
2856
2857         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
2858                                           isl_union_map_copy(gen->shared_proj));
2859
2860         for (i = 0; i < gen->prog->n_array; ++i)
2861                 group_array_references(gen, &gen->prog->array[i], sched);
2862
2863         isl_union_map_free(sched);
2864 }
2865
2866 /* Free all array information that is local to the current kernel.
2867  */
2868 static void free_local_array_info(struct gpu_gen *gen)
2869 {
2870         int i, j;
2871
2872         for (i = 0; i < gen->prog->n_array; ++i) {
2873                 struct gpu_array_info *array = &gen->prog->array[i];
2874
2875                 for (j = 0; j < array->n_group; ++j)
2876                         free_array_ref_group(array->groups[j]);
2877                 free(array->groups);
2878         }
2879 }
2880
2881 /* Compute the size of a bounding box around the origin and "set",
2882  * where "set" is assumed to contain only non-negative elements.
2883  * In particular, compute the maximal value of "set" in each direction
2884  * and add one.
2885  */
2886 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
2887         __isl_keep isl_set *context)
2888 {
2889         int i, n;
2890         isl_multi_pw_aff *mpa;
2891
2892         n = isl_set_dim(set, isl_dim_set);
2893         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
2894         for (i = 0; i < n; ++i) {
2895                 isl_space *space;
2896                 isl_aff *one;
2897                 isl_pw_aff *bound;
2898
2899                 bound = isl_set_dim_max(isl_set_copy(set), i);
2900                 bound = isl_pw_aff_coalesce(bound);
2901                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
2902
2903                 space = isl_pw_aff_get_domain_space(bound);
2904                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
2905                 one = isl_aff_add_constant_si(one, 1);
2906                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
2907                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
2908         }
2909         isl_set_free(set);
2910
2911         return mpa;
2912 }
2913
2914 /* Compute the effective grid size as a list of the sizes in each dimension.
2915  *
2916  * The grid size specified by the user or set by default
2917  * in read_grid_sizes() and applied in tile_schedule(),
2918  * may be too large for the given code in the sense that
2919  * it may contain blocks that don't need to execute anything.
2920  * We therefore don't return this grid size, but instead the
2921  * smallest grid size that ensures that all blocks that actually
2922  * execute code are included in the grid.
2923  *
2924  * We first extract a description of the grid, i.e., the possible values
2925  * of the block ids, from gen->tiled_sched.
2926  * The block ids are parameters in gen->tiled_sched.
2927  * We simply need to change them into set dimensions.
2928  *
2929  * Then, for each block dimension, we compute the maximal value of the block id
2930  * and add one.
2931  */
2932 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
2933         struct ppcg_kernel *kernel)
2934 {
2935         int i;
2936         isl_set *grid;
2937
2938         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
2939         grid = isl_set_from_params(grid);
2940         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
2941         for (i = 0; i < gen->n_grid; ++i) {
2942                 int pos;
2943                 char name[20];
2944
2945                 snprintf(name, sizeof(name), "b%d", i);
2946                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
2947                 assert(pos >= 0);
2948                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
2949                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
2950         }
2951
2952         return extract_size(grid, kernel->context);
2953 }
2954
2955 /* Compute the size of a fixed bounding box around the origin and "set",
2956  * where "set" is assumed to contain only non-negative elements,
2957  * and store the results in "size".
2958  * In particular, compute the maximal value of "set" in each direction
2959  * and add one.
2960  */
2961 static void extract_fixed_size(__isl_take isl_set *set, int *size)
2962 {
2963         int i, n;
2964         isl_local_space *ls;
2965         isl_aff *obj;
2966
2967         n = isl_set_dim(set, isl_dim_set);
2968         ls = isl_local_space_from_space(isl_set_get_space(set));
2969         obj = isl_aff_zero_on_domain(ls);
2970         for (i = 0; i < n; ++i) {
2971                 isl_val *max;
2972
2973                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
2974                 max = isl_set_max_val(set, obj);
2975                 size[i] = isl_val_get_num_si(max) + 1;
2976                 isl_val_free(max);
2977                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
2978         }
2979         isl_aff_free(obj);
2980         isl_set_free(set);
2981 }
2982
2983 /* Compute the effective block size as a list of the sizes in each dimension
2984  * and store the sizes in kernel->block_dim.
2985  *
2986  * The block size specified by the user or set by default
2987  * in read_block_sizes() and applied in thread_tile_schedule(),
2988  * may be too large for the given code in the sense that
2989  * it may contain threads that don't need to execute anything.
2990  * We therefore don't store this block size in kernel->block_dim,
2991  * but instead the smallest block size that ensures that all threads
2992  * that actually execute code are included in the block.
2993  *
2994  * The current implementation eliminates all parameters, ensuring
2995  * that the size is a fixed constant in each dimension.
2996  * In principle we could also compute parametric sizes.
2997  * We would have to make sure to project out all b%d and t%d parameters,
2998  * however.
2999  */
3000 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3001 {
3002         int i;
3003         int nparam;
3004         isl_set *block;
3005         isl_multi_pw_aff *mpa;
3006
3007         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3008         block = isl_set_from_params(block);
3009         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3010         kernel->n_block = gen->n_block;
3011         for (i = 0; i < gen->n_block; ++i) {
3012                 int pos;
3013                 char name[20];
3014
3015                 snprintf(name, sizeof(name), "t%d", i);
3016                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3017                 assert(pos >= 0);
3018                 block = isl_set_equate(block, isl_dim_param, pos,
3019                                         isl_dim_set, i);
3020         }
3021         nparam = isl_set_dim(block, isl_dim_param);
3022         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3023
3024         extract_fixed_size(block, kernel->block_dim);
3025 }
3026
3027 void ppcg_kernel_free(void *user)
3028 {
3029         struct ppcg_kernel *kernel = user;
3030         int i;
3031
3032         if (!kernel)
3033                 return;
3034
3035         isl_multi_pw_aff_free(kernel->grid_size);
3036         isl_set_free(kernel->context);
3037         isl_union_set_free(kernel->arrays);
3038         isl_space_free(kernel->space);
3039         isl_ast_node_free(kernel->tree);
3040
3041         for (i = 0; i < kernel->n_array; ++i)
3042                 isl_pw_aff_list_free(kernel->array[i].bound);
3043         free(kernel->array);
3044
3045         for (i = 0; i < kernel->n_var; ++i) {
3046                 free(kernel->var[i].name);
3047                 isl_vec_free(kernel->var[i].size);
3048         }
3049         free(kernel->var);
3050
3051         free(kernel);
3052 }
3053
3054 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3055         struct ppcg_kernel_var *var)
3056 {
3057         int j;
3058         struct gpu_array_tile *tile;
3059         isl_printer *p;
3060         char *name;
3061
3062         var->array = group->array;
3063
3064         tile = group->private_tile;
3065         var->type = ppcg_access_private;
3066         if (!tile) {
3067                 tile = group->shared_tile;
3068                 var->type = ppcg_access_shared;
3069         }
3070
3071         p = isl_printer_to_str(ctx);
3072         p = print_array_name(p, group);
3073         var->name = isl_printer_get_str(p);
3074         isl_printer_free(p);
3075
3076         var->size = isl_vec_alloc(ctx, group->array->n_index);
3077
3078         for (j = 0; j < group->array->n_index; ++j)
3079                 var->size = isl_vec_set_element_val(var->size, j,
3080                                             isl_val_copy(tile->bound[j].size));
3081 }
3082
3083 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3084 {
3085         int i, j, n;
3086
3087         n = 0;
3088         for (i = 0; i < gen->prog->n_array; ++i) {
3089                 struct gpu_array_info *array = &gen->prog->array[i];
3090
3091                 for (j = 0; j < array->n_group; ++j) {
3092                         struct gpu_array_ref_group *group = array->groups[j];
3093                         if (group->private_tile || group->shared_tile)
3094                                 ++n;
3095                 }
3096         }
3097
3098         kernel->n_var = n;
3099         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3100         assert(kernel->var);
3101
3102         n = 0;
3103         for (i = 0; i < gen->prog->n_array; ++i) {
3104                 struct gpu_array_info *array = &gen->prog->array[i];
3105
3106                 for (j = 0; j < array->n_group; ++j) {
3107                         struct gpu_array_ref_group *group = array->groups[j];
3108                         if (!group->private_tile && !group->shared_tile)
3109                                 continue;
3110                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3111                         ++n;
3112                 }
3113         }
3114 }
3115
3116 /* The sizes of the arrays on the host that have been computed by
3117  * extract_array_info may depend on the parameters.  Use the extra
3118  * constraints on the parameters that are valid at "host_domain"
3119  * to simplify these expressions and store the results in kernel->array.
3120  */
3121 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3122         __isl_keep isl_set *host_domain)
3123 {
3124         int i, j;
3125         isl_set *context;
3126
3127         kernel->array = isl_calloc_array(gen->ctx,
3128                             struct gpu_local_array_info, gen->prog->n_array);
3129         assert(kernel->array);
3130         kernel->n_array = gen->prog->n_array;
3131
3132         context = isl_set_copy(host_domain);
3133         context = isl_set_params(context);
3134
3135         for (i = 0; i < gen->prog->n_array; ++i) {
3136                 struct gpu_array_info *array = &gen->prog->array[i];
3137                 isl_pw_aff_list *local;
3138
3139                 if (array->n_group == 0)
3140                         continue;
3141
3142                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3143
3144                 for (j = 0; j < array->n_index; ++j) {
3145                         isl_pw_aff *pwaff;
3146
3147                         pwaff = isl_pw_aff_copy(array->bound[j]);
3148                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3149                         local = isl_pw_aff_list_add(local, pwaff);
3150                 }
3151
3152                 kernel->array[i].bound = local;
3153         }
3154         isl_set_free(context);
3155 }
3156
3157 /* Find the element in gen->stmt that has the given "id".
3158  * Return NULL if no such gpu_stmt can be found.
3159  */
3160 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3161 {
3162         int i;
3163
3164         for (i = 0; i < prog->n_stmts; ++i) {
3165                 if (id == prog->stmts[i].id)
3166                         break;
3167         }
3168
3169         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3170 }
3171
3172 /* Set gen->tile_len and gen->n_parallel to those of the statement
3173  * affected by the first map (part of the schedule)
3174  * on which this function is called.
3175  * Because of the way the schedule is constructed, the other statements
3176  * in the list, if any, should have the same values for these properties.
3177  */
3178 static int extract_tile_len(__isl_take isl_map *map, void *user)
3179 {
3180         struct gpu_gen *gen = (struct gpu_gen *) user;
3181         isl_id *id;
3182         struct gpu_stmt *stmt;
3183
3184         id = isl_map_get_tuple_id(map, isl_dim_in);
3185         stmt = find_stmt(gen->prog, id);
3186         isl_id_free(id);
3187
3188         isl_map_free(map);
3189
3190         if (!stmt)
3191                 isl_die(gen->ctx, isl_error_unknown,
3192                         "statement not found", return -1);
3193
3194         gen->tile_len = stmt->tile_len;
3195         gen->n_parallel = stmt->n_parallel;
3196
3197         return -1;
3198 }
3199
3200 void ppcg_kernel_stmt_free(void *user)
3201 {
3202         int i;
3203         struct ppcg_kernel_stmt *stmt = user;
3204
3205         if (!stmt)
3206                 return;
3207
3208         switch (stmt->type) {
3209         case ppcg_kernel_copy:
3210                 isl_ast_expr_free(stmt->u.c.index);
3211                 isl_ast_expr_free(stmt->u.c.local_index);
3212                 break;
3213         case ppcg_kernel_domain:
3214                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3215                 break;
3216         case ppcg_kernel_sync:
3217                 break;
3218         }
3219
3220         free(stmt);
3221 }
3222
3223 /* Set the options of "context" to
3224  *
3225  *      { space -> [x] : x >= first }
3226  */
3227 static __isl_give isl_ast_build *set_unroll(
3228         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3229         int first)
3230 {
3231         isl_ctx *ctx;
3232         isl_map *unroll;
3233         isl_union_map *opt;
3234
3235         ctx = isl_ast_build_get_ctx(build);
3236
3237         space = isl_space_from_domain(space);
3238         space = isl_space_add_dims(space, isl_dim_out, 1);
3239         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3240         unroll = isl_map_universe(space);
3241         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3242         opt = isl_union_map_from_map(unroll);
3243
3244         build = isl_ast_build_set_options(build, opt);
3245
3246         return build;
3247 }
3248
3249 /* Return a list of isl_ids of the form "prefix%d".
3250  */
3251 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3252         int n, const char *prefix)
3253 {
3254         int i;
3255         char name[10];
3256         isl_id_list *names;
3257
3258         names = isl_id_list_alloc(ctx, n);
3259         for (i = 0; i < n; ++i) {
3260                 isl_id *id;
3261
3262                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3263                 id = isl_id_alloc(ctx, name, NULL);
3264                 names = isl_id_list_add(names, id);
3265         }
3266
3267         return names;
3268 }
3269
3270 /* Extend the schedule "schedule" with the part of "extension"
3271  * starting at "first" up to "len".
3272  */
3273 static __isl_give isl_union_map *extend_schedule(
3274         __isl_take isl_union_map *schedule,
3275         __isl_take isl_union_map *extension, int first, int len)
3276 {
3277         isl_space *space;
3278         isl_map *proj;
3279         isl_union_map *umap;
3280         isl_set *set;
3281
3282         space = isl_union_map_get_space(schedule);
3283         space = isl_space_set_from_params(space);
3284         space = isl_space_add_dims(space, isl_dim_set, len);
3285         proj = isl_set_identity(isl_set_universe(space));
3286         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3287         extension = isl_union_map_apply_range(extension,
3288                                                 isl_union_map_from_map(proj));
3289
3290         schedule = isl_union_map_range_product(schedule, extension);
3291
3292         return schedule;
3293 }
3294
3295 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3296  * to "ref_id".
3297  */
3298 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3299         __isl_keep isl_id *ref_id)
3300 {
3301         struct gpu_stmt_access *access;
3302
3303         for (access = accesses; access; access = access->next)
3304                 if (access->ref_id == ref_id)
3305                         return access;
3306
3307         return NULL;
3308 }
3309
3310 /* Return the index of the array called "name" in the list of arrays.
3311  */
3312 static int find_array_index(struct gpu_gen *gen, const char *name)
3313 {
3314         int i;
3315
3316         for (i = 0; i < gen->prog->n_array; ++i)
3317                 if (!strcmp(name, gen->prog->array[i].name))
3318                         return i;
3319
3320         return -1;
3321 }
3322
3323 /* Internal data structure for the index and AST expression transformation
3324  * callbacks for pet_stmt_build_ast_exprs.
3325  *
3326  * "accesses" is the list of gpu_stmt_access in the statement.
3327  * "iterator_map" expresses the statement iterators in terms of
3328  * the AST loop iterators.
3329  * "sched2shared" expresses the first shared_len dimensions of
3330  * the computed schedule in terms of the AST loop iterators.
3331  *
3332  * The following fields are set in transform_index and used in transform_expr.
3333  * "array" is the array that is being accessed.
3334  * "global" is set if the global array is accessed (rather than
3335  * shared/private memory).
3336  * "local_array" refers to information on the array specialized
3337  * to the current kernel.
3338  */
3339 struct ppcg_transform_data {
3340         struct gpu_gen *gen;
3341         struct gpu_stmt_access *accesses;
3342         isl_pw_multi_aff *iterator_map;
3343         isl_pw_multi_aff *sched2shared;
3344
3345         struct gpu_array_info *array;
3346         int global;
3347         struct gpu_local_array_info *local_array;
3348 };
3349
3350 /* Index transformation callback for pet_stmt_build_ast_exprs.
3351  *
3352  * "index" expresses the array indices in terms of statement iterators
3353  *
3354  * We first reformulate "index" in terms of the AST loop iterators.
3355  * Then we check if we are accessing the global array or
3356  * a shared/private copy.  In the former case, we simply return
3357  * the updated index.  If "index" is an affine expression rather
3358  * than an array access, then we also return the updated index here.
3359  *
3360  * Otherwise, we apply the tiling to the index.
3361  * This tiling is of the form
3362  *
3363  *      [D -> A] -> T
3364  *
3365  * The index is of the form
3366  *
3367  *      L -> A
3368  *
3369  * We update the tiling to refer to the AST loop iteratos
3370  *
3371  *      [L -> A] -> T
3372  *
3373  * and modify index to keep track of those iterators
3374  *
3375  *      L -> [L -> A]
3376  *
3377  * Combining these two yields a tiled index expression in terms
3378  * of the AST loop iterators
3379  *
3380  *      L -> T
3381  */
3382 static __isl_give isl_multi_pw_aff *transform_index(
3383         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3384         void *user)
3385 {
3386         struct ppcg_transform_data *data = user;
3387         struct gpu_stmt_access *access;
3388         struct gpu_array_ref_group *group;
3389         struct gpu_array_tile *tile;
3390         isl_pw_multi_aff *iterator_map;
3391         int i;
3392         const char *name;
3393         isl_space *space;
3394         isl_multi_pw_aff *tiling;
3395         isl_pw_multi_aff *pma;
3396         isl_multi_pw_aff *mpa;
3397
3398         data->array = NULL;
3399
3400         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3401         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3402
3403         access = find_access(data->accesses, ref_id);
3404         if (!access)
3405                 return index;
3406         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3407                 return index;
3408
3409         name = isl_map_get_tuple_name(access->access, isl_dim_out);
3410         i = find_array_index(data->gen, name);
3411         if (i < 0)
3412                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3413                         "cannot find array reference group",
3414                         return isl_multi_pw_aff_free(index));
3415
3416         data->array = &data->gen->prog->array[i];
3417         data->local_array = &data->gen->kernel->array[i];
3418         group = data->array->groups[access->group];
3419         tile = group->private_tile;
3420         if (!tile)
3421                 tile = group->shared_tile;
3422         data->global = !tile;
3423         if (!tile)
3424                 return index;
3425
3426         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3427         space = isl_space_map_from_set(space);
3428         pma = isl_pw_multi_aff_identity(space);
3429         pma = isl_pw_multi_aff_product(
3430                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3431         tiling = isl_multi_pw_aff_from_multi_aff(
3432                                     isl_multi_aff_copy(tile->tiling));
3433         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3434
3435         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3436         space = isl_space_map_from_set(space);
3437         mpa = isl_multi_pw_aff_identity(space);
3438         index = isl_multi_pw_aff_range_product(mpa, index);
3439         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3440
3441         return index;
3442 }
3443
3444 /* Dereference "expr" by adding an index [0].
3445  * The original "expr" is assumed not to have any indices.
3446  */
3447 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3448 {
3449         isl_ctx *ctx;
3450         isl_ast_expr *res;
3451         isl_ast_expr_list *list;
3452
3453         ctx = isl_ast_expr_get_ctx(expr);
3454         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3455         list = isl_ast_expr_list_from_ast_expr(res);
3456         res = isl_ast_expr_get_op_arg(expr, 0);
3457         res = isl_ast_expr_access(res, list);
3458         isl_ast_expr_free(expr);
3459
3460         return res;
3461 }
3462
3463 /* Linearize the index expression "expr" based on the array bounds
3464  * of "array".
3465  *
3466  * That is, transform expression
3467  *
3468  *      A[i_0][i_1]...[i_n]
3469  *
3470  * to
3471  *
3472  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3473  *
3474  * where b_0, b_1, ..., b_n are the bounds on the array.
3475  */
3476 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3477         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3478 {
3479         int i, n;
3480         isl_ctx *ctx;
3481         isl_set *context;
3482         isl_ast_expr *res;
3483         isl_ast_expr_list *list;
3484         isl_ast_build *build;
3485
3486         ctx = isl_ast_expr_get_ctx(expr);
3487         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3488         build = isl_ast_build_from_context(context);
3489
3490         n = isl_ast_expr_get_op_n_arg(expr);
3491         res = isl_ast_expr_get_op_arg(expr, 1);
3492         for (i = 2; i < n; ++i) {
3493                 isl_pw_aff *bound_i;
3494                 isl_ast_expr *expr_i;
3495
3496                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i - 1);
3497                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3498                 res = isl_ast_expr_mul(res, expr_i);
3499                 expr_i = isl_ast_expr_get_op_arg(expr, i);
3500                 res = isl_ast_expr_add(res, expr_i);
3501         }
3502
3503         isl_ast_build_free(build);
3504
3505         list = isl_ast_expr_list_from_ast_expr(res);
3506         res = isl_ast_expr_get_op_arg(expr, 0);
3507         res = isl_ast_expr_access(res, list);
3508
3509         isl_ast_expr_free(expr);
3510
3511         return res;
3512 }
3513
3514 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3515  *
3516  * If the AST expression refers to a global scalar that is not
3517  * a read-only scalar, then its address was passed to the kernel and
3518  * we need to dereference it.
3519  *
3520  * If the AST expression refers to an access to a global array,
3521  * then we linearize the access exploiting the bounds in data->local_array.
3522  */
3523 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3524         __isl_keep isl_id *id, void *user)
3525 {
3526         struct ppcg_transform_data *data = user;
3527
3528         if (!data->array)
3529                 return expr;
3530         if (gpu_array_is_read_only_scalar(data->array))
3531                 return expr;
3532         if (!data->global)
3533                 return expr;
3534         if (data->array->n_index == 0)
3535                 return dereference(expr);
3536
3537         return gpu_local_array_info_linearize_index(data->local_array, expr);
3538 }
3539
3540 /* This function is called for each instance of a user statement
3541  * in the kernel.
3542  *
3543  * We attach a struct ppcg_kernel_stmt to the "node", containing
3544  * a computed AST expression for each access.
3545  * These AST expressions are computed from iterator_map,
3546  * which expresses the domain
3547  * elements in terms of the generated loops, and sched2shared,
3548  * which expresses the first shared_len dimensions of the schedule
3549  * computed by PPCG in terms of the generated loops.
3550  */
3551 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3552         __isl_keep isl_ast_build *build, void *user)
3553 {
3554         struct ppcg_transform_data data;
3555         struct gpu_gen *gen = (struct gpu_gen *) user;
3556         struct ppcg_kernel_stmt *stmt;
3557         isl_id *id;
3558         isl_pw_multi_aff *sched2shared;
3559         isl_map *map;
3560         isl_pw_multi_aff *iterator_map;
3561         isl_ast_expr *expr, *arg;
3562         isl_union_map *schedule;
3563         int i, n;
3564         struct gpu_stmt_access *access;
3565
3566         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3567         if (!stmt)
3568                 return isl_ast_node_free(node);
3569
3570         expr = isl_ast_node_user_get_expr(node);
3571         arg = isl_ast_expr_get_op_arg(expr, 0);
3572         id = isl_ast_expr_get_id(arg);
3573
3574         schedule = isl_ast_build_get_schedule(build);
3575         map = isl_map_reverse(isl_map_from_union_map(schedule));
3576         iterator_map = isl_pw_multi_aff_from_map(map);
3577         sched2shared = compute_sched_to_shared(gen,
3578                                         isl_pw_multi_aff_copy(iterator_map));
3579
3580         stmt->type = ppcg_kernel_domain;
3581         stmt->u.d.stmt = find_stmt(gen->prog, id);
3582         if (!stmt->u.d.stmt)
3583                 goto error;
3584
3585         data.gen = gen;
3586         data.accesses = stmt->u.d.stmt->accesses;
3587         data.iterator_map = iterator_map;
3588         data.sched2shared = sched2shared;
3589         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
3590                                             build, &transform_index, &data,
3591                                             &transform_expr, &data);
3592
3593         isl_id_free(id);
3594         isl_pw_multi_aff_free(iterator_map);
3595         isl_pw_multi_aff_free(sched2shared);
3596         isl_ast_expr_free(arg);
3597         isl_ast_expr_free(expr);
3598
3599         id = isl_id_alloc(gen->ctx, NULL, stmt);
3600         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3601         return isl_ast_node_set_annotation(node, id);
3602 error:
3603         isl_id_free(id);
3604         isl_pw_multi_aff_free(iterator_map);
3605         ppcg_kernel_stmt_free(stmt);
3606         isl_pw_multi_aff_free(sched2shared);
3607         return isl_ast_node_free(node);
3608 }
3609
3610 /* This function is called when code has been generated for the shared
3611  * tile loops.  The "schedule" refers only to the original statements.
3612  *
3613  * We extend the schedule with that part of gen->local_sched that hasn't
3614  * been taken into account yet.  This introduces parameters referring
3615  * to thread ids in the schedule, so we add them (with the appropriate
3616  * bounds to the context as well).
3617  * Finally, we set the appropriate unrolling options
3618  * if gen->first_unroll is set.
3619  */
3620 static __isl_give isl_ast_node *create_domain_leaf(
3621         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
3622         void *user)
3623 {
3624         struct gpu_gen *gen = (struct gpu_gen *) user;
3625         isl_space *space;
3626         isl_union_map *sched;
3627         isl_ast_node *tree;
3628         isl_set *set;
3629         isl_id_list *iterators;
3630         int n;
3631
3632         schedule = extend_schedule(schedule,
3633                         isl_union_map_copy(gen->local_sched),
3634                         gen->shared_len, gen->thread_tiled_len);
3635
3636         space = isl_ast_build_get_schedule_space(build);
3637         set = isl_set_universe(space);
3638         set = add_bounded_parameters(set, gen->kernel->n_block,
3639                                         gen->kernel->block_dim, "t");
3640         build = isl_ast_build_restrict(build, set);
3641
3642         n = gen->thread_tiled_len - gen->shared_len;
3643
3644         if (gen->first_unroll >= 0) {
3645                 space = isl_space_set_alloc(gen->ctx, 0, n);
3646                 build = set_unroll(build, space, gen->first_unroll);
3647         }
3648         iterators = generate_names(gen->ctx, n, "c");
3649         build = isl_ast_build_set_iterators(build, iterators);
3650         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
3651         tree = isl_ast_build_ast_from_schedule(build, schedule);
3652         isl_ast_build_free(build);
3653
3654         return tree;
3655 }
3656
3657 /* This function is called for each statement node in the AST of the code
3658  * for copying to or from shared/private memory.
3659  * Attach a pointer to a ppcg_kernel_stmt representing the copy
3660  * statement to the node.
3661  * The statement name is "read" or "write", depending on whether we are
3662  * reading from global memory or writing to global memory.
3663  * The name of the T space is {shared,private}_<array>.
3664  *
3665  * The schedule is of the form
3666  *
3667  *      type[A -> T] -> L
3668  *
3669  * where A refers to a piece of an array and T to the corresponding
3670  * shifted tile.  We split this schedule into mappings L -> A and L -> T
3671  * and store the corresponding expressions in stmt->index and stmt->local_index,
3672  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
3673  */
3674 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
3675         __isl_keep isl_ast_build *build, void *user)
3676 {
3677         struct gpu_gen *gen = (struct gpu_gen *) user;
3678         struct ppcg_kernel_stmt *stmt;
3679         isl_id *id;
3680         isl_ast_expr *expr;
3681         isl_space *space;
3682         isl_map *access, *local_access, *map;
3683         isl_pw_multi_aff *pma;
3684         const char *type;
3685         int array_index;
3686
3687         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3688         if (!stmt)
3689                 return isl_ast_node_free(node);
3690
3691         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
3692         type = isl_map_get_tuple_name(access, isl_dim_in);
3693         stmt->u.c.read = !strcmp(type, "read");
3694         access = isl_map_reverse(access);
3695         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
3696         local_access = isl_map_copy(access);
3697
3698         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
3699         id = isl_map_get_tuple_id(access, isl_dim_out);
3700         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3701         access = isl_map_apply_range(access, map);
3702         pma = isl_pw_multi_aff_from_map(access);
3703         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3704         stmt->u.c.index = expr;
3705
3706         map = isl_map_range_map(isl_map_universe(space));
3707         id = isl_map_get_tuple_id(local_access, isl_dim_out);
3708         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3709         local_access = isl_map_apply_range(local_access, map);
3710         pma = isl_pw_multi_aff_from_map(local_access);
3711         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3712         stmt->u.c.local_index = expr;
3713
3714         stmt->u.c.array = gen->copy_group->array;
3715         array_index = stmt->u.c.array - gen->prog->array;
3716         stmt->u.c.local_array = &gen->kernel->array[array_index];
3717         stmt->type = ppcg_kernel_copy;
3718
3719         id = isl_id_alloc(gen->ctx, NULL, stmt);
3720         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3721         return isl_ast_node_set_annotation(node, id);
3722 }
3723
3724 /* Given a schedule of the form
3725  *
3726  *      [S -> A] -> L
3727  *
3728  * (with S the first shared_len dimensions of the computed schedule,
3729  * A the array and L the schedule correponding to the generated loops),
3730  * indicating where to copy the array elements that need to be copied,
3731  * construct code for performing the copying.
3732  *
3733  * "group" is the array reference group that is being copied
3734  * "type" is either "read" or "write"
3735  * private is set if copying needs to be performed to/from registers
3736  *
3737  * We first construct a mapping to a shifted tile of the array,
3738  *
3739  *      [S -> A] -> T(S,A)                                      (1)
3740  *
3741  * If private is set, then we also use this mapping as a schedule
3742  * (which is already thread-specific and will be completely unrolled).
3743  * Otherwise, we wrap/tile the range over the threads.
3744  * The result is
3745  *
3746  *      [S -> A] -> T'(S,A)
3747  *
3748  * Combined with the given schedule, we have
3749  *
3750  *      [S -> A] -> [L -> T'(S,A)]                              (2)
3751  *
3752  * From the shifted tile mapping, we construct a mapping
3753  *
3754  *      [S -> A] -> [A -> T(S,A)]
3755  *
3756  * and apply it to the schedule (2), obtaining
3757  *
3758  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
3759  *
3760  * Note that we can project out S because it is uniquely defined by L.
3761  */
3762 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
3763         __isl_take isl_map *sched,
3764         const char *type, struct gpu_array_ref_group *group,
3765         __isl_take isl_ast_build *build, int private)
3766 {
3767         isl_space *space;
3768         isl_ast_node *tree;
3769         isl_map *schedule, *shift, *map;
3770         isl_set *set;
3771         isl_id_list *iterators;
3772         int n;
3773
3774         shift = shift_access(group);
3775
3776         schedule = isl_map_copy(shift);
3777         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
3778         if (!private)
3779                 schedule = tile_access_schedule(gen, schedule);
3780
3781         n = isl_map_dim(schedule, isl_dim_out);
3782         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
3783         set = add_bounded_parameters(set, gen->kernel->n_block,
3784                                         gen->kernel->block_dim, "t");
3785
3786         schedule = isl_map_range_product(sched, schedule);
3787
3788         space = isl_space_domain(isl_map_get_space(shift));
3789         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
3790         map = isl_map_range_product(map, shift);
3791
3792         schedule = isl_map_apply_domain(schedule, map);
3793
3794         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
3795
3796         build = isl_ast_build_restrict(build, set);
3797
3798         gen->copy_group = group;
3799
3800         if (private) {
3801                 space = isl_space_range(isl_map_get_space(schedule));
3802                 space = isl_space_range(isl_space_unwrap(space));
3803                 build = set_unroll(build, space, 0);
3804         }
3805         iterators = generate_names(gen->ctx, n, "c");
3806         build = isl_ast_build_set_iterators(build, iterators);
3807         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
3808         tree = isl_ast_build_ast_from_schedule(build,
3809                                             isl_union_map_from_map(schedule));
3810         isl_ast_build_free(build);
3811
3812         return tree;
3813 }
3814
3815 /* Return code for reading into or writing from shared memory
3816  * the given array reference group.
3817  *
3818  * If we are performing a read from global memory to shared memory and
3819  * if the array involved is not a scalar, then we copy
3820  * the entire tile to shared memory.  This may result in some extra
3821  * elements getting copied, but it should lead to simpler code
3822  * (which means that fewer registers may be needed) and less divergence.
3823  *
3824  * Otherwise, we only copy the elements that will be read or have been written
3825  * in the kernel.
3826  *
3827  *
3828  * The input "sched" is of the form.
3829  *
3830  *      type[S -> A] -> L
3831  *
3832  * with S the first shared_len dimensions of the computed schedule,
3833  * A the array and L the schedule correponding to the generated loops.
3834  *
3835  * We first drop "type",
3836  *
3837  *      [S -> A] -> L
3838  *
3839  * If the above conditions are satisfied, we project out A,
3840  * resulting in
3841  *
3842  *      S -> L
3843  *
3844  * and then introduce the group tile [S -> T], resulting in
3845  *
3846  *      [S -> T] -> L
3847  */
3848 static __isl_give isl_ast_node *copy_group_shared_accesses(
3849         struct gpu_gen *gen, struct gpu_array_ref_group *group,
3850         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
3851 {
3852         const char *type;
3853         int read;
3854         isl_union_map *access;
3855
3856         type = isl_map_get_tuple_name(sched, isl_dim_in);
3857         read = !strcmp(type, "read");
3858
3859         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
3860
3861         if (read && !gpu_array_is_scalar(group->array)) {
3862                 isl_space *space;
3863                 isl_map *map;
3864
3865                 space = isl_space_domain(isl_map_get_space(sched));
3866                 space = isl_space_unwrap(space);
3867                 map = isl_map_domain_map(isl_map_universe(space));
3868                 sched = isl_map_apply_domain(sched, map);
3869
3870                 map = group_tile(group);
3871                 map = isl_map_reverse(isl_map_domain_map(map));
3872                 sched = isl_map_apply_domain(sched, map);
3873         }
3874
3875         return copy_access(gen, sched, type, group, build, 0);
3876 }
3877
3878 /* Return code for reading into or writing from private memory
3879  * the given array reference group.
3880  *
3881  * Let S be the first shared_len dimensions of the computed schedule,
3882  * D the iteration domains, A the array and L the schedule correponding
3883  * to the generated loops.
3884  * "sched" is of the form
3885  *
3886  *      type[S -> A] -> L
3887  *
3888  * where type is either "read" or "write".
3889  * We apply the privatization D -> S(t), with t the thread ids,
3890  * to the access relation D -> A to obtain the privatized access relation
3891  *
3892  *      S(t) -> A
3893  *
3894  * We drop the type from "sched" and intersect with the privatized access
3895  * relation to obtain
3896  *
3897  *      [S(t) -> A] -> L
3898  */
3899 static __isl_give isl_ast_node *copy_group_private_accesses(
3900         struct gpu_gen *gen, struct gpu_array_ref_group *group,
3901         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
3902 {
3903         const char *type;
3904         int read;
3905         isl_union_map *priv;
3906         isl_union_map *access;
3907         isl_map *access_map;
3908
3909         type = isl_map_get_tuple_name(sched, isl_dim_in);
3910         read = !strcmp(type, "read");
3911
3912         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3913         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3914                                         priv);
3915
3916         access = group_access_relation(group, read, !read);
3917         access = isl_union_map_apply_domain(access, priv);
3918         access_map = isl_map_from_union_map(access);
3919
3920         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
3921         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
3922
3923         return copy_access(gen, sched, type, group, build, 1);
3924 }
3925
3926 /* Return code for reading into or writing from shared or private memory.
3927  *
3928  * "schedule" is of the form
3929  *
3930  *      type[S -> A] -> L
3931  *
3932  * with S be the first shared_len dimensions of the computed schedule,
3933  * A the array and L the schedule correponding to the generated loops.
3934  * The array reference group is attached to "type".
3935  */
3936 static __isl_give isl_ast_node *create_access_leaf(
3937         struct gpu_gen *gen, __isl_take isl_map *schedule,
3938         __isl_take isl_ast_build *build)
3939 {
3940         struct gpu_array_ref_group *group;
3941         isl_id *id;
3942
3943         id = isl_map_get_tuple_id(schedule, isl_dim_in);
3944         group = isl_id_get_user(id);
3945         isl_id_free(id);
3946
3947         if (group->private_tile)
3948                 return copy_group_private_accesses(gen, group, schedule,
3949                                                         build);
3950         else
3951                 return copy_group_shared_accesses(gen, group, schedule,
3952                                                         build);
3953 }
3954
3955 /* Create a domain node representing a synchronization.
3956  */
3957 static __isl_give isl_ast_node *create_sync_leaf(
3958         struct gpu_gen *gen, __isl_take isl_map *schedule,
3959         __isl_take isl_ast_build *build)
3960 {
3961         struct ppcg_kernel_stmt *stmt;
3962         isl_id *id;
3963         isl_space *space;
3964         isl_ast_node *node;
3965         isl_ast_expr *expr;
3966
3967         isl_map_free(schedule);
3968
3969         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3970         if (!stmt)
3971                 return NULL;
3972
3973         stmt->type = ppcg_kernel_sync;
3974
3975         space = isl_ast_build_get_schedule_space(build);
3976         space = isl_space_from_domain(space);
3977         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
3978         expr = isl_ast_build_call_from_pw_multi_aff(build,
3979                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
3980         node = isl_ast_node_alloc_user(expr);
3981         isl_ast_build_free(build);
3982
3983         id = isl_id_alloc(gen->ctx, NULL, stmt);
3984         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3985         return isl_ast_node_set_annotation(node, id);
3986 }
3987
3988 /* This function is called during the code generation at the point
3989  * where the schedule domain element is completely determined by
3990  * the generated code.  The input schedule contains the original
3991  * statements as well as synchronization and copy "statements".
3992  * The latter are scheduled at different points than any of the original
3993  * statements, so they will only arrive here in isolation.
3994  *
3995  * If the current schedule only refers to a single statement,
3996  * we check if it is a copy or synchronization statement and
3997  * call the appropriate functions.
3998  * Otherwise, we assume we are dealing with the original statements
3999  * and we call create_domain_leaf.
4000  */
4001 static __isl_give isl_ast_node *create_kernel_leaf(
4002         __isl_take isl_ast_build *build, void *user)
4003 {
4004         struct gpu_gen *gen = (struct gpu_gen *) user;
4005         isl_map *map;
4006         isl_union_map *schedule;
4007         const char *name;
4008
4009         schedule = isl_ast_build_get_schedule(build);
4010
4011         if (isl_union_map_n_map(schedule) != 1)
4012                 return create_domain_leaf(schedule, build, user);
4013
4014         map = isl_map_from_union_map(schedule);
4015         name = isl_map_get_tuple_name(map, isl_dim_in);
4016         if (!strcmp(name, "read") || !strcmp(name, "write"))
4017                 return create_access_leaf(gen, map, build);
4018         if (!strcmp(name, "sync"))
4019                 return create_sync_leaf(gen, map, build);
4020
4021         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4022 }
4023
4024 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4025  * have value 0) and all even schedule dimensions as "unroll".
4026  *
4027  * That is, the options look as follows
4028  *
4029  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4030  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4031  *
4032  * The even positions are used to be able to schedule copying blocks
4033  * and synchronization before or after each level of the shared memory
4034  * tile loops and we want to make sure that code for these is generated
4035  * separately (within each level).
4036  */
4037 static __isl_give isl_ast_build *set_atomic_and_unroll(
4038         __isl_take isl_ast_build *build,
4039         __isl_take isl_space *space, int sched_len)
4040 {
4041         isl_ctx *ctx;
4042         isl_map *map;
4043         isl_constraint *c;
4044         isl_union_map *opt;
4045         isl_local_space *ls;
4046         int i, n;
4047
4048         ctx = isl_ast_build_get_ctx(build);
4049
4050         space = isl_space_params(space);
4051         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4052         space = isl_space_from_domain(space);
4053         space = isl_space_add_dims(space, isl_dim_out, 2);
4054         map = isl_map_universe(isl_space_copy(space));
4055         for (i = 0; i < sched_len; i += 2)
4056                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4057         ls = isl_local_space_from_space(isl_map_get_space(map));
4058         c = isl_equality_alloc(ls);
4059         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4060         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4061         c = isl_constraint_set_constant_si(c, 1);
4062         map = isl_map_add_constraint(map, c);
4063         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4064         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4065         opt = isl_union_map_from_map(map);
4066
4067         map = isl_map_universe(space);
4068         ls = isl_local_space_from_space(isl_map_get_space(map));
4069         c = isl_equality_alloc(ls);
4070         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4071         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4072         map = isl_map_add_constraint(map, c);
4073         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4074         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4075         opt = isl_union_map_add_map(opt, map);
4076
4077         build = isl_ast_build_set_options(build, opt);
4078
4079         return build;
4080 }
4081
4082 /* Return a map that maps a space of dimension gen->shared_len
4083  * to its last dimensions starting at gen->tile_first.
4084  * The range is of dimension
4085  *
4086  *      2 * (gen->shared_len - gen->tile_first) + 1
4087  *
4088  * The input dimensions are mapped to the odd dimensions in the output,
4089  * while the even dimensions (except 2*pos) are fixed to 0.
4090  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4091  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4092  * are mapped to the output.  The remaining input dimensions are projected
4093  * out and the corresponding output dimensions are fixed to 0.
4094  */
4095 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4096         __isl_take isl_space *space, int pos, int val)
4097 {
4098         int i, n;
4099         isl_map *proj;
4100
4101         space = isl_space_set_from_params(space);
4102         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4103         space = isl_space_map_from_set(space);
4104         proj = isl_map_identity(space);
4105         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4106         n = gen->shared_len - gen->tile_first;
4107         for (i = 0; i <= n; ++i) {
4108                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4109                 if (i == pos)
4110                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4111                 else
4112                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4113         }
4114
4115         if (pos < 0)
4116                 return proj;
4117
4118         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4119                                 gen->shared_len - (gen->tile_first + pos));
4120         for (i = pos; i < n; ++i)
4121                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4122
4123         return proj;
4124 }
4125
4126 /* Given the AST context schedule "schedule" and the mapping from
4127  * domains to the shared tile loops "shared_sched", add a schedule
4128  * for a synchronization operation at position "val" of loop level "pos".
4129  *
4130  * schedule is of the form
4131  *
4132  *      D -> L
4133  *
4134  * (with D the iteration domains and L the already generated loops),
4135  * while shared_sched is of the form
4136  *
4137  *      D -> S
4138  *
4139  * We combine them into
4140  *
4141  *      L -> S
4142  *
4143  * apply a mapping
4144  *
4145  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4146  *
4147  * and use the result as a schedule for "sync".
4148  */
4149 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4150         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4151         __isl_keep isl_union_map *shared_sched, int pos, int val)
4152 {
4153         isl_space *space;
4154         isl_map *proj, *map;
4155
4156         shared_sched = isl_union_map_copy(shared_sched);
4157         schedule = isl_union_map_copy(schedule);
4158
4159         space = isl_union_map_get_space(shared_sched);
4160         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4161         map = isl_map_from_union_map(schedule);
4162
4163         proj = insert_even(gen, space, pos, val);
4164         map = isl_map_apply_range(map, proj);
4165         map = isl_map_from_range(isl_map_wrap(map));
4166         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4167
4168         res = isl_union_map_add_map(res, map);
4169
4170         return res;
4171 }
4172
4173 /* Given the AST context schedule "schedule" and the mapping from
4174  * domains to the shared tile loops "shared_sched", add a schedule
4175  * for copying an array reference group to/from shared/private memory.
4176  * "read" is set if data should be copied from global memory
4177  * to shared/private memory.
4178  * "k" represents the current group
4179  * "s" is the total number of groups
4180  *
4181  * We schedule an operation before or after the innermost loop
4182  * of "shared_sched" that affects the tile of the array reference group.
4183  *
4184  * schedule is of the form
4185  *
4186  *      D -> L
4187  *
4188  * (with D the iteration domains and L the already generated loops),
4189  * while shared_sched is of the form
4190  *
4191  *      D -> S
4192  *
4193  * We first compute the access relation for the reference group
4194  *
4195  *      D -> A
4196  *
4197  * and combine it with shared_sched into
4198  *
4199  *      D -> [S -> A]
4200  *
4201  * If this results in an empty relation, no copying needs to be performed
4202  * at this point.
4203  * Otherwise, we invert the relation and combine it with "schedule" into
4204  *
4205  *      [S -> A] -> L
4206  *
4207  * The actual additional piece of the schedule is obtained from combining
4208  *
4209  *      [S -> A] -> S
4210  *
4211  * with a mapping
4212  *
4213  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4214  *
4215  * The position of "val" corresponds to the innermost loop that affects
4216  * the tile and the value indicates where the copying is scheduled
4217  * with respect to the actual kernel code (at value 0).
4218  * Reads are scheduled before the code, writes to global memory from
4219  * private memory are scheduled at values 1 to s, writes to global
4220  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4221  *
4222  * If we are scheduling a read from global memory to shared memory,
4223  * we insert a synchronization before the kernel code (at the innermost
4224  * level).
4225  * If we are scheduling a write to global memory, then we add
4226  * a synchronization after all writes (at value 2 * s + 2).
4227  * However, there is no need for a synchronization after the outermost loop.
4228  * A write to global memory from private memory at the innermost level
4229  * does not require a synchronization, because it is covered by
4230  * the synchronization after the kernel inserted by body_schedule.
4231  */
4232 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4233         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4234         __isl_keep isl_union_map *shared_sched,
4235         struct gpu_array_ref_group *group, int read, int k, int s)
4236 {
4237         int n;
4238         int pos, val;
4239         isl_space *space;
4240         isl_union_map *access;
4241         isl_map *map, *proj, *access_map;
4242         isl_id *id;
4243
4244         access = group_access_relation(group, read, !read);
4245         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4246                                                 access);
4247
4248         if (isl_union_map_is_empty(access)) {
4249                 isl_union_map_free(access);
4250                 return res;
4251         }
4252
4253         access = isl_union_map_reverse(access);
4254         access = isl_union_map_apply_range(access,
4255                                             isl_union_map_copy(schedule));
4256         access_map = isl_map_from_union_map(access);
4257
4258         space = isl_space_copy(group->array->space);
4259         space = isl_space_from_range(space);
4260         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4261         map = isl_map_domain_map(isl_map_universe(space));
4262
4263         space = isl_union_map_get_space(schedule);
4264         pos = group->last_shared + 1 - gen->tile_first;
4265         assert(pos >= 0);
4266         if (read)
4267                 val = -2 - k;
4268         else if (group->private_tile)
4269                 val = 1 + k;
4270         else
4271                 val = 1 + s + 1 + k;
4272         proj = insert_even(gen, space, pos, val);
4273         map = isl_map_apply_range(map, proj);
4274
4275         access_map = isl_map_range_product(access_map, map);
4276
4277         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4278         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4279
4280         res = isl_union_map_add_map(res, access_map);
4281
4282         n = gen->shared_len - gen->tile_first;
4283         if (read) {
4284                 if (!group->private_tile)
4285                         res = add_sync_schedule(gen, res, schedule,
4286                                                 shared_sched, n, -1);
4287         } else {
4288                 if (pos == 0)
4289                         return res;
4290                 if (pos == n && group->private_tile)
4291                         return res;
4292                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4293                                         pos, 2 * s + 2);
4294         }
4295
4296         return res;
4297 }
4298
4299 /* Return a schedule for the shared tile loops based on the current
4300  * AST context schedule.
4301  *
4302  * We create a "shared_sched" that maps the domains to the first
4303  * shared_len dimensions of the computed schedule, project out the
4304  * first tile_first dimensions (as these are already covered by
4305  * the host code) and insert "statement-level" dimensions at even
4306  * positions so that we can schedule copy blocks and synchronization
4307  * before/after each level.
4308  *
4309  * In particular, copy blocks are inserted inside the innermost
4310  * level that affects the tile.  For the copying to global memory,
4311  * those from private memory are scheduled before those from shared
4312  * memory such that synchronization can be inserted between the two
4313  * at the innermost level.
4314  * Synchronization is inserted at the innermost level before the
4315  * actual kernel code if there is any copying from global memory
4316  * to shared memory.  It is inserted unconditionally at the innermost
4317  * level after the actual kernel code and the copying to global memory
4318  * from private memory (if any).  Finally, it is inserted after
4319  * any copying to global memory, except at the outermost level
4320  * and at the innermost level if there is no copying from shared
4321  * memory.  The copying from private memory is covered by the unconditional
4322  * synchronization at the innermost level.
4323  */
4324 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4325         __isl_take isl_union_map *schedule)
4326 {
4327         isl_space *space;
4328         isl_union_map *res;
4329         isl_union_map *shared_sched;
4330         isl_union_map *sched;
4331         isl_map *proj, *map;
4332         int i, j, k, s;
4333
4334         shared_sched = isl_union_map_copy(gen->tiled_sched);
4335         proj = projection(isl_union_map_get_space(shared_sched),
4336                                 gen->tiled_len, gen->shared_len);
4337         shared_sched = isl_union_map_apply_range(shared_sched,
4338                                 isl_union_map_from_map(proj));
4339         space = isl_union_map_get_space(shared_sched);
4340         proj = insert_even(gen, space, -1, 0);
4341         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4342                                 isl_union_map_from_map(proj));
4343
4344         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4345
4346         s = 0;
4347         for (i = 0; i < gen->prog->n_array; ++i)
4348                 s += gen->prog->array[i].n_group;
4349
4350         k = 0;
4351         for (i = 0; i < gen->prog->n_array; ++i) {
4352                 struct gpu_array_info *array = &gen->prog->array[i];
4353
4354                 for (j = 0; j < array->n_group; ++j) {
4355                         struct gpu_array_ref_group *group;
4356
4357                         group = array->groups[j];
4358                         if (!group->private_tile && !group->shared_tile)
4359                                 continue;
4360                         res = add_group_schedule(gen, res, schedule,
4361                                                 shared_sched, group, 0, k, s);
4362                         res = add_group_schedule(gen, res, schedule,
4363                                                 shared_sched, group, 1, k, s);
4364                         ++k;
4365                 }
4366         }
4367
4368         res = add_sync_schedule(gen, res, schedule, shared_sched,
4369                             gen->shared_len - gen->tile_first, 1 + s);
4370
4371         isl_union_map_free(shared_sched);
4372         isl_union_map_free(schedule);
4373
4374         return res;
4375 }
4376
4377 /* Generate code for "kernel" in the given "context".
4378  *
4379  * We first generate code for the shared tile loops (T1T, T1P and T2)
4380  * in a context that includes the block ids.
4381  * Within each iteration of these loops an additional code generation
4382  * is performed (within create_kernel_leaf) for the rest of the schedule
4383  * in a context that includes the thread ids.
4384  */
4385 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
4386         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
4387         __isl_keep isl_multi_pw_aff *grid_size)
4388 {
4389         isl_space *space;
4390         isl_set *set;
4391         isl_id_list *iterators;
4392         isl_union_map *schedule;
4393         isl_ast_node *tree;
4394         int sched_len;
4395
4396         schedule = isl_ast_build_get_schedule(build);
4397
4398         build = isl_ast_build_copy(build);
4399         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
4400         space = isl_ast_build_get_schedule_space(build);
4401         set = isl_set_universe(isl_space_copy(space));
4402         set = add_bounded_parameters_dynamic(set, grid_size, "b");
4403         build = isl_ast_build_restrict(build, set);
4404
4405         schedule = body_schedule(gen, schedule);
4406
4407         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
4408
4409         build = set_atomic_and_unroll(build, space, sched_len);
4410         iterators = generate_names(gen->ctx, sched_len, "g");
4411         build = isl_ast_build_set_iterators(build, iterators);
4412         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
4413         tree = isl_ast_build_ast_from_schedule(build, schedule);
4414         isl_ast_build_free(build);
4415
4416         return tree;
4417 }
4418
4419 /* Attach "id" to the given node.
4420  */
4421 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
4422         __isl_keep isl_ast_build *build, void *user)
4423 {
4424         isl_id *id = user;
4425
4426         node = isl_ast_node_set_annotation(node, id);
4427
4428         return node;
4429 }
4430
4431 /* Construct an AST node for performing a kernel launch and attach
4432  * the information about the kernel to that node.
4433  *
4434  * The kernel AST has been constructed in the context of the range
4435  * of "schedule".  In particular, the grid size has been computed
4436  * in the context.  We therefore still need to make sure that these
4437  * constraints are expressed in the code.  We do this by creating a schedule
4438  *
4439  *      kernel[] -> [S -> []]
4440  *
4441  * where S is the schedule domain, i.e., the range of "schedule".
4442  * The AST generation will then create a single call surrounded by
4443  * all the conditions in "S" that have not been expressed yet.
4444  *
4445  * The kernel information is attached to this node in attach_id.
4446  */
4447 static __isl_give isl_ast_node *construct_launch(
4448         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
4449         __isl_take struct ppcg_kernel *kernel)
4450 {
4451         isl_id *id;
4452         isl_ctx *ctx;
4453         isl_union_set *domain;
4454         isl_set *set;
4455         isl_map *map;
4456         isl_ast_node *node;
4457
4458         ctx = isl_ast_build_get_ctx(build);
4459
4460         id = isl_id_alloc(ctx, NULL, kernel);
4461         id = isl_id_set_free_user(id, &ppcg_kernel_free);
4462
4463         domain = isl_union_map_range(schedule);
4464         set = isl_set_from_union_set(domain);
4465         map = isl_map_from_domain(set);
4466         map = isl_map_from_range(isl_map_wrap(map));
4467         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
4468         schedule = isl_union_map_from_map(map);
4469
4470         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
4471         node = isl_ast_build_ast_from_schedule(build, schedule);
4472         isl_ast_build_free(build);
4473
4474         return node;
4475 }
4476
4477 /* This function is called for each leaf in the AST of the host code.
4478  * We first specialize the schedule to the site of the leaf, compute
4479  * the size of shared memory and then construct the body of the host code
4480  * and the associated kernel.
4481  *
4482  * The necessary information for printing the kernel launch is
4483  * stored in a struct ppcg_kernel and attached to the leaf node
4484  * created to represent the launch.
4485  */
4486 static __isl_give isl_ast_node *create_host_leaf(
4487         __isl_take isl_ast_build *build, void *user)
4488 {
4489         struct gpu_gen *gen = (struct gpu_gen *) user;
4490         isl_id *id;
4491         isl_ast_node *node;
4492         struct ppcg_kernel *kernel;
4493         isl_set *host_domain;
4494         isl_union_map *schedule;
4495         isl_union_map *local_sched;
4496         isl_union_map *access;
4497         isl_union_set *domain;
4498         int i;
4499
4500         schedule = isl_ast_build_get_schedule(build);
4501
4502         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
4503         read_sizes(gen);
4504
4505         domain = isl_union_map_domain(isl_union_map_copy(schedule));
4506
4507         local_sched = isl_union_map_copy(gen->sched);
4508         local_sched = isl_union_map_intersect_domain(local_sched, domain);
4509         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
4510                                      isl_union_map_copy(gen->prog->write));
4511         access = isl_union_map_apply_domain(access,
4512                                             isl_union_map_copy(local_sched));
4513
4514         gen->tiled_sched = tile_schedule(gen, local_sched);
4515         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
4516         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
4517
4518         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
4519         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
4520         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
4521
4522         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
4523         if (!kernel)
4524                 goto error;
4525
4526         kernel->id = gen->kernel_id++;
4527         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
4528         kernel->grid_size = extract_grid_size(gen, kernel);
4529         extract_block_size(gen, kernel);
4530         kernel->arrays = isl_union_map_range(access);
4531         kernel->space = isl_ast_build_get_schedule_space(build);
4532
4533         gen->private_access = NULL;
4534         compute_shared_sched(gen);
4535         gen->privatization = compute_privatization(gen);
4536         group_references(gen);
4537         compute_private_access(gen);
4538         check_shared_memory_bound(gen);
4539         compute_group_tilings(gen);
4540         host_domain = isl_set_from_union_set(isl_union_map_range(
4541                                                 isl_union_map_copy(schedule)));
4542         localize_bounds(gen, kernel, host_domain);
4543
4544         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
4545
4546         kernel->tree = generate_kernel(gen, build, host_domain,
4547                                         kernel->grid_size);
4548         create_kernel_vars(gen, kernel);
4549
4550         free_local_array_info(gen);
4551         isl_map_free(gen->privatization);
4552         isl_union_map_free(gen->private_access);
4553         isl_union_map_free(gen->local_sched);
4554         isl_union_map_free(gen->tiled_sched);
4555         isl_union_map_free(gen->shared_sched);
4556         isl_union_map_free(gen->shared_proj);
4557         isl_set_free(host_domain);
4558         free(gen->tile_size);
4559
4560         node = construct_launch(build, schedule, kernel);
4561
4562         return node;
4563 error:
4564         isl_union_map_free(schedule);
4565         return NULL;
4566 }
4567
4568 /* Use isl to generate code for the outer gen->tile_first loops
4569  * of the global schedule in gen->sched, resulting in the host code.
4570  * Within each iteration of this partial schedule, i.e., for each kernel
4571  * launch, create_host_leaf takes care of generating the kernel code.
4572  */
4573 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
4574 {
4575         isl_ast_build *build;
4576         isl_ast_node *tree;
4577         isl_union_map *sched;
4578         isl_map *proj;
4579         isl_id_list *iterators;
4580
4581         sched = isl_union_map_copy(gen->sched);
4582         proj = projection(isl_union_map_get_space(sched),
4583                             gen->untiled_len, gen->tile_first);
4584         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4585
4586         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
4587         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
4588         iterators = generate_names(gen->ctx, gen->tile_first, "h");
4589         build = isl_ast_build_set_iterators(build, iterators);
4590         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
4591         tree = isl_ast_build_ast_from_schedule(build, sched);
4592         isl_ast_build_free(build);
4593
4594         return tree;
4595 }
4596
4597 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4598 {
4599         if (!str)
4600                 return NULL;
4601         return isl_union_map_read_from_str(ctx, str);
4602 }
4603
4604 /* Information about the outermost tilable bands in the forest of bands.
4605  *
4606  * tile_len and n_parallel are only set on band_info structures
4607  * that correspond to outermost bands.  For other bands (in particular,
4608  * ancestors of the outermost bands), n_parallel is set to 0.
4609  *
4610  * prefix is the (padded) schedule leading up to the outermost tilable bands.
4611  *
4612  * tile_first is the number of schedule dimensions in prefix.
4613  *
4614  * suffix is the schedule of the outermost tilable bands and their descendants.
4615  */
4616 struct band_info {
4617         struct gpu_gen *gen;
4618         int tile_first;
4619         int tile_len;
4620         int n_parallel;
4621         isl_union_map *prefix;
4622         isl_union_map *suffix;
4623 };
4624
4625 /* Set tile_len and n_parallel of the statement to that of
4626  * their outermost band, recorded in the band_info.
4627  */
4628 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4629 {
4630         struct band_info *info = user;
4631         struct gpu_stmt *stmt;
4632         isl_id *id;
4633
4634         id = isl_map_get_tuple_id(map, isl_dim_in);
4635         stmt = find_stmt(info->gen->prog, id);
4636         isl_id_free(id);
4637
4638         stmt->tile_len = info->tile_len;
4639         stmt->n_parallel = info->n_parallel;
4640
4641         isl_map_free(map);
4642
4643         return 0;
4644 }
4645
4646 static void list_select_outer_band(struct gpu_gen *gen,
4647         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4648
4649 /* Check if this band has any parallel loops.  If so, take it as
4650  * the outermost tilable band.  If not, continue looking for the
4651  * outermost tilable band in the children of the current band.
4652  */
4653 static void band_select_outer_band(struct gpu_gen *gen,
4654         __isl_take isl_band *band, int pos, struct band_info *info)
4655 {
4656         int n = isl_band_n_member(band);
4657         int n_parallel;
4658
4659         for (n_parallel = 0; n_parallel < n; ++n_parallel)
4660                 if (!isl_band_member_is_zero_distance(band, n_parallel))
4661                         break;
4662
4663         info->n_parallel = n_parallel;
4664         if (n_parallel) {
4665                 gen->any_parallelism = 1;
4666                 info->gen = gen;
4667                 info->tile_first = pos;
4668                 info->tile_len = n;
4669                 info->prefix = isl_band_get_prefix_schedule(band);
4670                 info->suffix = isl_union_map_flat_range_product(
4671                                 isl_band_get_partial_schedule(band),
4672                                 isl_band_get_suffix_schedule(band));
4673                 isl_union_map_foreach_map(info->prefix,
4674                                             &set_stmt_tile_len, info);
4675         } else if (isl_band_has_children(band)) {
4676                 isl_band_list *children;
4677                 children = isl_band_get_children(band);
4678                 list_select_outer_band(gen, children, pos + n, info);
4679         } else {
4680                 info->gen = gen;
4681                 info->tile_first = pos + n;
4682                 info->tile_len = 0;
4683                 info->prefix = isl_union_map_flat_range_product(
4684                                 isl_band_get_prefix_schedule(band),
4685                                 isl_band_get_partial_schedule(band));
4686                 info->suffix = isl_band_get_suffix_schedule(band);
4687                 isl_union_map_foreach_map(info->prefix,
4688                                             &set_stmt_tile_len, info);
4689         }
4690
4691         isl_band_free(band);
4692 }
4693
4694 /* Comparison function that returns a non-zero value for band_infos
4695  * with different tile_len fields or different n_parallel fields.
4696  */
4697 static int cmp_band(const void *p1, const void *p2)
4698 {
4699         const struct band_info *info1 = p1;
4700         const struct band_info *info2 = p2;
4701
4702         if (info1->tile_len != info2->tile_len)
4703                 return info1->tile_len - info2->tile_len;
4704
4705         return info1->n_parallel - info2->n_parallel;
4706 }
4707
4708 /* Extend "umap" with coordinates with fixed value "val"
4709  * to a total length of "dst_len", assuming the original dimension is "src_len".
4710  */
4711 static __isl_give isl_union_map *extend_range(
4712         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
4713 {
4714         isl_space *dim;
4715         isl_map *map;
4716         int i;
4717
4718         dim = isl_union_map_get_space(umap);
4719         map = isl_map_reverse(projection(dim, dst_len, src_len));
4720         for (i = src_len; i < dst_len; ++i)
4721                 map = isl_map_fix_si(map, isl_dim_out, i, val);
4722
4723         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4724
4725         return umap;
4726 }
4727
4728 /* Group bands with the same values for tile_len and n_parallel.
4729  * The prefix schedule is then extended with a fixed coordinate that
4730  * is different for each such group.
4731  * Note that the actual values for this coordinate are not important.
4732  * The bands have already been effectively separated at a higher level
4733  * or they are independent and may be executed in parallel.
4734  * The list of band_info has been sorted before this function is called.
4735  */
4736 static void separate_bands(struct band_info *info, int n)
4737 {
4738         int i;
4739         int j = 0;
4740
4741         for (i = 0; i < n; ++i) {
4742                 int l = info[i].tile_first;
4743
4744                 if (i &&
4745                     (info[i].tile_len != info[i - 1].tile_len ||
4746                      info[i].n_parallel != info[i - 1].n_parallel))
4747                         j++;
4748
4749                 info[i].prefix = extend_range(info[i].prefix,
4750                                                 l, l + 1, j);
4751                 info[i].tile_first = l + 1;
4752         }
4753 }
4754
4755 /* Select the outermost bands in the elements of the list, align
4756  * their prefix schedules, separate bands with different values
4757  * for tile_len and/or n_parallel and then combine the resulting
4758  * prefix and suffix schedules into a single pair of prefix and
4759  * suffix schedules for the entire list.
4760  */
4761 static void list_select_outer_band(struct gpu_gen *gen,
4762         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4763 {
4764         isl_band *band;
4765         int i;
4766         int n = isl_band_list_n_band(list);
4767         isl_ctx *ctx = isl_band_list_get_ctx(list);
4768         struct band_info *info;
4769         int max_tile_first;
4770         isl_union_map *prefix;
4771         isl_union_map *suffix;
4772
4773         assert(n >= 1);
4774         info = isl_calloc_array(ctx, struct band_info, n);
4775         assert(info);
4776
4777         max_tile_first = 0;
4778         for (i = 0; i < n; ++i) {
4779                 band = isl_band_list_get_band(list, i);
4780                 band_select_outer_band(gen, band, pos, &info[i]);
4781                 if (info[i].tile_first > max_tile_first)
4782                         max_tile_first = info[i].tile_first;
4783         }
4784
4785         for (i = 0; i < n; ++i) {
4786                 if (info[i].tile_first == max_tile_first)
4787                         continue;
4788                 info[i].prefix = extend_range(info[i].prefix,
4789                                         info[i].tile_first, max_tile_first, 0);
4790                 info[i].tile_first = max_tile_first;
4791         }
4792
4793         qsort(info, n, sizeof(struct band_info), &cmp_band);
4794
4795         for (i = 0; i < n - 1; ++i)
4796                 if (info[i].tile_len != info[i + 1].tile_len ||
4797                     info[i].n_parallel != info[i + 1].n_parallel)
4798                         break;
4799
4800         if (i < n -1)
4801                 separate_bands(info, n);
4802
4803         prefix = info[0].prefix;
4804         suffix = info[0].suffix;
4805
4806         for (i = 1; i < n; ++i) {
4807                 prefix = isl_union_map_union(prefix, info[i].prefix);
4808                 suffix = isl_union_map_union(suffix, info[i].suffix);
4809         }
4810
4811         list_info->tile_first = info[0].tile_first;
4812         list_info->tile_len = -1;
4813         list_info->prefix = prefix;
4814         list_info->suffix = suffix;
4815
4816         isl_band_list_free(list);
4817         free(info);
4818 }
4819
4820 /* Select the outermost tilable band that (by construction)
4821  * has at least one parallel loop.
4822  * The starting position of the aligned band is stored in
4823  * gen->tile_first.
4824  * The sizes and number of parallel loops may be different in different
4825  * parts of the band forest and are therefore stored in the gpu_stmts.
4826  *
4827  * Return the complete schedule, with the tilable bands aligned
4828  * at gen->tile_first and padded with zero, if needed.
4829  */
4830 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
4831         __isl_keep isl_schedule *schedule)
4832 {
4833         isl_band_list *list;
4834         struct band_info info;
4835
4836         gen->n_parallel = 0;
4837         gen->tile_len = -1;
4838
4839         list = isl_schedule_get_band_forest(schedule);
4840
4841         if (isl_band_list_n_band(list) == 0) {
4842                 isl_band_list_free(list);
4843                 return isl_schedule_get_map(schedule);
4844         }
4845
4846         list_select_outer_band(gen, list, 0, &info);
4847
4848         gen->tile_first = info.tile_first;
4849         info.suffix = align_range(info.suffix);
4850
4851         return isl_union_map_flat_range_product(info.prefix, info.suffix);
4852 }
4853
4854 /* Set gen->untiled_len to the number of scheduling dimensions
4855  * for the schedule of the first domain.
4856  * We assume here that this number is the same for all domains.
4857  */
4858 static int set_untiled_len(__isl_take isl_map *map, void *user)
4859 {
4860         unsigned *untiled_len = user;
4861
4862         *untiled_len = isl_map_dim(map, isl_dim_out);
4863
4864         isl_map_free(map);
4865         return -1;
4866 }
4867
4868 /* Compute an appropriate schedule based on the accesses in
4869  * gen->read and gen->write.
4870  *
4871  * We use the dependences in gen->prog->scop to compute
4872  * a schedule that has a parallel loop in each tilable band.
4873  * Finally, we select the outermost tilable band.
4874  */
4875 static void compute_schedule(struct gpu_gen *gen)
4876 {
4877         isl_union_set *domain;
4878         isl_union_map *dep_raw, *dep;
4879         isl_union_map *sched;
4880         isl_schedule *schedule;
4881
4882         dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
4883
4884         dep = isl_union_map_copy(gen->prog->scop->dep_false);
4885         dep = isl_union_map_union(dep, dep_raw);
4886         dep = isl_union_map_coalesce(dep);
4887
4888         domain = isl_union_set_copy(gen->prog->scop->domain);
4889         domain = isl_union_set_intersect_params(domain,
4890                                 isl_set_copy(gen->prog->scop->context));
4891         schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4892                                 isl_union_map_copy(dep), dep);
4893         if (gen->options->debug->dump_schedule)
4894                 isl_schedule_dump(schedule);
4895
4896         sched = select_outer_tilable_band(gen, schedule);
4897
4898         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4899         sched = isl_union_map_intersect_domain(sched, domain);
4900         gen->sched = sched;
4901
4902         isl_schedule_free(schedule);
4903 }
4904
4905 /* Compute the sets of array elements that need to be copied in and out.
4906  *
4907  * In particular, for each array that is written anywhere in gen->prog and
4908  * that is visible outside the corresponding scop, we copy out its entire
4909  * extent.
4910  *
4911  * Any array elements that is read without first being written needs
4912  * to be copied in. Furthermore, if there are any array elements that
4913  * are copied out, but that are not written inside gen->prog, then
4914  * they also need to be copied in to ensure that the value after execution
4915  * is the same as the value before execution.
4916  * While computing the set of array elements that
4917  * are copied out but not written, we intersect both sets with the context.
4918  * This helps in those cases where the arrays are declared with a fixed size,
4919  * while the accesses are parametric and the context assigns a fixed value
4920  * to the parameters.
4921  */
4922 static void compute_copy_in_and_out(struct gpu_gen *gen)
4923 {
4924         int i;
4925         isl_union_set *write;
4926         isl_union_set *copy_in, *copy_out;
4927         isl_union_set *not_written;
4928         isl_union_map *uninitialized;
4929
4930         write = isl_union_map_range(isl_union_map_copy(gen->prog->write));
4931         write = isl_union_set_intersect_params(write,
4932                                             isl_set_copy(gen->prog->context));
4933         copy_out = isl_union_set_empty(isl_union_set_get_space(write));
4934
4935         for (i = 0; i < gen->prog->n_array; ++i) {
4936                 isl_space *space;
4937                 isl_set *write_i;
4938                 int empty;
4939
4940                 if (gen->prog->array[i].local)
4941                         continue;
4942
4943                 space = isl_space_copy(gen->prog->array[i].space);
4944                 write_i = isl_union_set_extract_set(write, space);
4945                 empty = isl_set_fast_is_empty(write_i);
4946                 isl_set_free(write_i);
4947                 if (empty)
4948                         continue;
4949
4950                 write_i = isl_set_copy(gen->prog->array[i].extent);
4951                 copy_out = isl_union_set_add_set(copy_out, write_i);
4952         }
4953
4954         copy_out = isl_union_set_intersect_params(copy_out,
4955                                             isl_set_copy(gen->prog->context));
4956
4957         gen->prog->copy_out = isl_union_set_copy(copy_out);
4958
4959         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
4960         copy_in = isl_union_map_range(uninitialized);
4961
4962         not_written = isl_union_set_subtract(copy_out, write);
4963         copy_in = isl_union_set_union(copy_in, not_written);
4964         gen->prog->copy_in = copy_in;
4965 }
4966
4967 static struct gpu_stmt_access **expr_extract_access(struct pet_expr *expr,
4968         struct gpu_stmt_access **next_access)
4969 {
4970         struct gpu_stmt_access *access;
4971         isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4972
4973         access = isl_alloc_type(ctx, struct gpu_stmt_access);
4974         assert(access);
4975         access->next = NULL;
4976         access->read = expr->acc.read;
4977         access->write = expr->acc.write;
4978         access->access = isl_map_copy(expr->acc.access);
4979         access->ref_id = isl_id_copy(expr->acc.ref_id);
4980
4981         *next_access = access;
4982         next_access = &(*next_access)->next;
4983         return next_access;
4984 }
4985
4986 static struct gpu_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4987         struct gpu_stmt_access **next_access)
4988 {
4989         int i;
4990
4991         for (i = 0; i < expr->n_arg; ++i)
4992                 next_access = expr_extract_accesses(expr->args[i],
4993                                                         next_access);
4994
4995         if (expr->type == pet_expr_access)
4996                 next_access = expr_extract_access(expr, next_access);
4997
4998         return next_access;
4999 }
5000
5001 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt)
5002 {
5003         struct gpu_stmt_access **next_access = &stmt->accesses;
5004
5005         stmt->accesses = NULL;
5006         expr_extract_accesses(stmt->stmt->body, next_access);
5007 }
5008
5009 /* Return an array of gpu_stmt representing the statements in "scop".
5010  */
5011 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5012         __isl_keep isl_set *context)
5013 {
5014         int i;
5015         struct gpu_stmt *stmts;
5016
5017         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->n_stmt);
5018         if (!stmts)
5019                 return NULL;
5020
5021         for (i = 0; i < scop->n_stmt; ++i) {
5022                 struct gpu_stmt *s = &stmts[i];
5023
5024                 s->id = isl_set_get_tuple_id(scop->stmts[i]->domain);
5025                 s->stmt = scop->stmts[i];
5026                 pet_stmt_extract_accesses(s);
5027         }
5028
5029         return stmts;
5030 }
5031
5032 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5033  */
5034 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5035 {
5036         struct gpu_gen *gen = user;
5037
5038         return gen->print(p, gen->prog, gen->tree, gen->print_user);
5039 }
5040
5041 /* Generate CUDA code for "scop" and print it to "p".
5042  * After generating an AST for the transformed scop as explained below,
5043  * we call "gen->print" to print the AST in the desired output format
5044  * to "p".
5045  *
5046  * If it turns out that it does not make sense to generate GPU code,
5047  * then we generate CPU code instead.
5048  *
5049  * The GPU code is generated in a context where at least one
5050  * statement instance is executed.  The corresponding guard (if any) is printed
5051  * around the entire generated GPU code, except for the declaration
5052  * of the arrays that are visible outside of the scop and that therefore
5053  * cannot be declared inside the body of any possible guard.
5054  *
5055  * We first compute a schedule that respects the dependences
5056  * of the original program and select the outermost band
5057  * of tilable dimensions that has at least one parallel loop.
5058  * We then have three blocks of dimensions
5059  *
5060  *      H               B                       G
5061  *
5062  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5063  * in
5064  *
5065  *      H       T               P               G
5066  *
5067  * For each iteration of the T loop and for each array, we compute
5068  * the array elements accessed by that iteration, construct a rectangular
5069  * box around it and shift it to the origin.  The result is used
5070  * as shared memory for the array.
5071  *
5072  * We then split off at most 2 parallel loops from the T loops and
5073  * at most 3 parallel loops from the P loops
5074  *
5075  *      H       T1      T2      P1      P2      G
5076  *
5077  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5078  * according to "grid"/"block" sizes.
5079  *
5080  *      H       T1T T1P T2      P1T P1P P2      G
5081  *
5082  * Finally, the T1P and P1P iterators are equated to the block and
5083  * thread dimensions respectively and so are effectively removed.
5084  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5085  * are run on the GPU.
5086  *
5087  * Code is generated in three stages.  We first generate code for the
5088  * host (the H loops), with iterators h%d.  Then, for each leaf node
5089  * of the resulting AST, we generate code for the shared loops (up to
5090  * and including T2), with iterators g%d and after equating the H loops
5091  * to h%d parameters and the T1P loops to the block dimensions.
5092  * Finally, we generate code for the remaining loops in a similar fashion.
5093  */
5094 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5095         struct gpu_gen *gen, struct ppcg_scop *scop,
5096         struct ppcg_options *options)
5097 {
5098         struct gpu_prog *prog;
5099         isl_ctx *ctx;
5100         isl_set *context, *guard;
5101
5102         if (!scop)
5103                 return isl_printer_free(p);
5104
5105         ctx = isl_printer_get_ctx(p);
5106         prog = gpu_prog_alloc(ctx, scop);
5107         if (!prog)
5108                 return isl_printer_free(p);
5109
5110         context = isl_set_copy(prog->context);
5111         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5112         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5113
5114         gen->prog = prog;
5115         gen->any_parallelism = 0;
5116         compute_schedule(gen);
5117
5118         if (!gen->any_parallelism) {
5119                 isl_set_free(context);
5120                 isl_set_free(guard);
5121                 p = print_cpu(p, scop, options);
5122         } else {
5123                 compute_copy_in_and_out(gen);
5124                 gen->tree = generate_host_code(gen);
5125                 p = ppcg_print_exposed_declarations(p, prog->scop);
5126                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5127                 isl_ast_node_free(gen->tree);
5128         }
5129
5130         isl_union_map_free(gen->sched);
5131
5132         gpu_prog_free(prog);
5133
5134         return p;
5135 }
5136
5137 /* Wrapper around generate for use as a ppcg_transform callback.
5138  */
5139 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5140         struct ppcg_scop *scop, void *user)
5141 {
5142         struct gpu_gen *gen = user;
5143
5144         return generate(p, gen, scop, gen->options);
5145 }
5146
5147 /* Transform the code in the file called "input" by replacing
5148  * all scops by corresponding GPU code and write the results to "out".
5149  */
5150 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5151         struct ppcg_options *options,
5152         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5153                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5154                 void *user), void *user)
5155 {
5156         struct gpu_gen gen;
5157         int r;
5158
5159         gen.ctx = ctx;
5160         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5161         gen.options = options;
5162         gen.kernel_id = 0;
5163         gen.print = print;
5164         gen.print_user = user;
5165
5166         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5167
5168         isl_union_map_free(gen.sizes);
5169
5170         return r;
5171 }
5172
5173 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5174 {
5175         struct gpu_prog *prog;
5176
5177         if (!scop)
5178                 return NULL;
5179
5180         prog = isl_calloc_type(ctx, struct gpu_prog);
5181         assert(prog);
5182
5183         prog->ctx = ctx;
5184         prog->scop = scop;
5185         prog->context = isl_set_copy(scop->context);
5186         prog->n_stmts = scop->n_stmt;
5187         prog->stmts = extract_stmts(ctx, scop, prog->context);
5188         prog->read = isl_union_map_copy(scop->reads);
5189         prog->write = isl_union_map_copy(scop->writes);
5190
5191         if (!prog->stmts)
5192                 return gpu_prog_free(prog);
5193
5194         if (collect_array_info(prog) < 0)
5195                 return gpu_prog_free(prog);
5196
5197         return prog;
5198 }
5199
5200 void *gpu_prog_free(struct gpu_prog *prog)
5201 {
5202         if (!prog)
5203                 return NULL;
5204         free_array_info(prog);
5205         free_stmts(prog->stmts, prog->n_stmts);
5206         isl_union_set_free(prog->copy_in);
5207         isl_union_set_free(prog->copy_out);
5208         isl_union_map_free(prog->read);
5209         isl_union_map_free(prog->write);
5210         isl_set_free(prog->context);
5211         free(prog);
5212         return NULL;
5213 }