gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
  33 /* The fields stride, shift and shift_map only contain valid information
  34  * if shift != NULL.
  35  * If so, they express that current index is such that if you add shift,
  36  * then the result is always a multiple of stride.
  37  * shift_map contains the mapping
  38  *
  39  *      i -> (i + shift)/stride
  40  *
  41  * Let D represent the initial shared_len dimensions of the computed schedule.
  42  * The spaces of "lb" and "shift" are of the form
  43  *
  44  *      D -> [b]
  45  *
  46  * "shift_map" is of the form
  47  *
  48  *      [D -> i] -> [D -> (i + shift(D))/stride]
  49  */
  50 struct gpu_array_bound {
  51         isl_val *size;
  52         isl_aff *lb;
  53
  54         isl_val *stride;
  55         isl_aff *shift;
  56         isl_basic_map *shift_map;
  57 };
  58
  59 /* A tile of an array.
  60  *
  61  * n is the dimension of the array.
  62  * bound is an array of size "n" representing the lower bound
  63  *      and size for each index.
  64  *
  65  * tiling maps a tile in the global array to the corresponding
  66  * shared/private memory tile and is of the form
  67  *
  68  *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
  69  *
  70  * where D represents the initial shared_len dimensions
  71  * of the computed schedule.
  72  */
  73 struct gpu_array_tile {
  74         int n;
  75         struct gpu_array_bound *bound;
  76         isl_multi_aff *tiling;
  77 };
  78
  79 struct gpu_array_info;
  80
  81 /* A group of array references in a kernel that should be handled together.
  82  * If private_tile is not NULL, then it is mapped to registers.
  83  * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
  84  * Otherwise, it is accessed from global memory.
  85  */
  86 struct gpu_array_ref_group {
  87         /* The references in this group access this array. */
  88         struct gpu_array_info *array;
  89         /* Position of this group in the list of reference groups of array. */
  90         int nr;
  91
  92         /* The following fields are use during the construction of the groups.
  93          * access is the combined access relation relative to the shared
  94          * memory tiling.  In particular, the domain of the map corresponds
  95          * to the first shared_len dimensions of the computed schedule.
  96          * write is set if any access in the group is a write.
  97          */
  98         isl_map *access;
  99         int write;
 100
 101         /* The shared memory tile, NULL if none. */
 102         struct gpu_array_tile *shared_tile;
 103
 104         /* The private memory tile, NULL if none. */
 105         struct gpu_array_tile *private_tile;
 106
 107         /* References in this group; point to elements of a linked list. */
 108         int n_ref;
 109         struct gpu_stmt_access **refs;
 110
 111         /* Last shared memory tile dimension that affects tile of this group. */
 112         int last_shared;
 113 };
 114
 115 struct gpu_gen {
 116         isl_ctx *ctx;
 117         struct ppcg_options *options;
 118
 119         /* Callback for printing of AST in appropriate format. */
 120         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
 121                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 122                 void *user);
 123         void *print_user;
 124
 125         struct gpu_prog *prog;
 126         /* The generated AST. */
 127         isl_ast_node *tree;
 128
 129         /* tile, grid and block sizes for each kernel */
 130         isl_union_map *sizes;
 131
 132         /* Identifier of current kernel. */
 133         int kernel_id;
 134         /* Pointer to the current kernel. */
 135         struct ppcg_kernel *kernel;
 136         /* Does the computed schedule exhibit any parallelism? */
 137         int any_parallelism;
 138
 139         /* First tile dimension. */
 140         int tile_first;
 141         /* Number of tile dimensions. */
 142         int tile_len;
 143         /* Number of initial parallel loops among tile dimensions. */
 144         int n_parallel;
 145
 146         /* Number of dimensions determining shared memory. */
 147         int shared_len;
 148
 149         /* Number of rows in the untiled schedule. */
 150         int untiled_len;
 151         /* Number of rows in the tiled schedule. */
 152         int tiled_len;
 153         /* Number of rows in schedule after tiling/wrapping over threads. */
 154         int thread_tiled_len;
 155
 156         /* Global untiled schedule. */
 157         isl_union_map *sched;
 158         /* Local (per kernel launch) tiled schedule. */
 159         isl_union_map *tiled_sched;
 160         /* Local schedule per shared memory tile loop iteration. */
 161         isl_union_map *local_sched;
 162
 163         /* Local tiled schedule projected onto the shared tile loops and
 164          * the loops that will be wrapped over the threads,
 165          * with all shared tile loops parametrized.
 166          */
 167         isl_union_map *shared_sched;
 168         /* Projects out the loops that will be wrapped over the threads
 169          * from shared_sched.
 170          */
 171         isl_union_map *shared_proj;
 172
 173         /* A map that takes the range of shared_sched as input,
 174          * wraps the appropriate loops over the threads and then projects
 175          * out these loops.
 176          */
 177         isl_map *privatization;
 178
 179         /* A map from the shared memory tile loops and the thread indices
 180          * (as parameters) to the set of accessed memory elements that
 181          * will be accessed through private copies.
 182          */
 183         isl_union_map *private_access;
 184
 185         /* The schedule for the current private/shared access
 186          * (within print_private_access or print_shared_access).
 187          */
 188         isl_map *copy_sched;
 189         /* The array reference group corresponding to copy_sched. */
 190         struct gpu_array_ref_group *copy_group;
 191
 192         /* First loop to unroll (or -1 if none) in the current part of the
 193          * schedule.
 194          */
 195         int first_unroll;
 196
 197         int n_grid;
 198         int n_block;
 199         /* Note: in the input file, the sizes of the grid and the blocks
 200          * are specified in the order x, y, z, but internally, the sizes
 201          * are stored in reverse order, so that the last element always
 202          * refers to the x dimension.
 203          */
 204         int grid_dim[2];
 205         int block_dim[3];
 206         int *tile_size;
 207 };
 208
 209 /* Print the name of the local copy of a given group of array references.
 210  */
 211 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 212         struct gpu_array_ref_group *group)
 213 {
 214         int global = 0;
 215
 216         if (group->private_tile)
 217                 p = isl_printer_print_str(p, "private_");
 218         else if (group->shared_tile)
 219                 p = isl_printer_print_str(p, "shared_");
 220         else
 221                 global = 1;
 222         p = isl_printer_print_str(p, group->array->name);
 223         if (!global && group->array->n_group > 1) {
 224                 p = isl_printer_print_str(p, "_");
 225                 p = isl_printer_print_int(p, group->nr);
 226         }
 227
 228         return p;
 229 }
 230
 231 /* Collect all references to the given array and store pointers to them
 232  * in array->refs.
 233  */
 234 static void collect_references(struct gpu_prog *prog,
 235         struct gpu_array_info *array)
 236 {
 237         int i;
 238         int n;
 239
 240         n = 0;
 241         for (i = 0; i < prog->n_stmts; ++i) {
 242                 struct gpu_stmt *stmt = &prog->stmts[i];
 243                 struct gpu_stmt_access *access;
 244
 245                 for (access = stmt->accesses; access; access = access->next) {
 246                         const char *name;
 247                         name = isl_map_get_tuple_name(access->access,
 248                                                       isl_dim_out);
 249                         if (name && !strcmp(array->name, name))
 250                                 n++;
 251                 }
 252         }
 253
 254         array->n_ref = n;
 255         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 256         assert(array->refs);
 257
 258         n = 0;
 259         for (i = 0; i < prog->n_stmts; ++i) {
 260                 struct gpu_stmt *stmt = &prog->stmts[i];
 261                 struct gpu_stmt_access *access;
 262
 263                 for (access = stmt->accesses; access; access = access->next) {
 264                         const char *name;
 265                         name = isl_map_get_tuple_name(access->access,
 266                                                       isl_dim_out);
 267                         if (!name || strcmp(array->name, name))
 268                                 continue;
 269
 270                         array->refs[n++] = access;
 271                 }
 272         }
 273 }
 274
 275 /* Create a gpu_array_tile for an array of dimension "n_index".
 276  */
 277 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 278 {
 279         int i;
 280         struct gpu_array_tile *tile;
 281
 282         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 283         assert(tile);
 284
 285         tile->n = n_index;
 286
 287         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 288         assert(tile->bound);
 289
 290         for (i = 0; i < n_index; ++i) {
 291                 tile->bound[i].size = NULL;
 292                 tile->bound[i].lb = NULL;
 293                 tile->bound[i].stride = NULL;
 294                 tile->bound[i].shift = NULL;
 295                 tile->bound[i].shift_map = NULL;
 296         }
 297
 298         return tile;
 299 }
 300
 301 static void *free_tile(struct gpu_array_tile *tile)
 302 {
 303         int j;
 304
 305         if (!tile)
 306                 return NULL;
 307
 308         for (j = 0; j < tile->n; ++j) {
 309                 isl_val_free(tile->bound[j].size);
 310                 isl_val_free(tile->bound[j].stride);
 311                 isl_aff_free(tile->bound[j].lb);
 312                 isl_aff_free(tile->bound[j].shift);
 313                 isl_basic_map_free(tile->bound[j].shift_map);
 314         }
 315         free(tile->bound);
 316         isl_multi_aff_free(tile->tiling);
 317         free(tile);
 318
 319         return NULL;
 320 }
 321
 322 static struct pet_array *find_array(struct ppcg_scop *scop,
 323         __isl_keep isl_set *accessed)
 324 {
 325         int i;
 326         isl_id *id;
 327
 328         id = isl_set_get_tuple_id(accessed);
 329
 330         for (i = 0; i < scop->n_array; ++i) {
 331                 isl_id *id_i;
 332
 333                 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
 334                 isl_id_free(id_i);
 335                 if (id == id_i)
 336                         break;
 337         }
 338         isl_id_free(id);
 339
 340         return i < scop->n_array ? scop->arrays[i] : NULL;
 341 }
 342
 343 /* Compute and return the extent of "array", taking into account the set of
 344  * accessed elements.
 345  *
 346  * In particular, the extent in the outer dimension is taken
 347  * from "accessed", while then extent in the remaing dimensions
 348  * are taken from array->extent.
 349  *
 350  * The extent in the outer dimension cannot be taken from array->extent
 351  * because that may be unbounded.  Furthermore, even if it is bounded,
 352  * it may be larger than the piece of the array that is being accessed.
 353  */
 354 static __isl_give isl_set *compute_extent(struct pet_array *array,
 355         __isl_keep isl_set *accessed)
 356 {
 357         int n_index;
 358         isl_id *id;
 359         isl_set *outer;
 360         isl_set *extent;
 361
 362         extent = isl_set_copy(array->extent);
 363
 364         n_index = isl_set_dim(accessed, isl_dim_set);
 365         if (n_index == 0)
 366                 return extent;
 367
 368         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 369         outer = isl_set_copy(accessed);
 370         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 371         extent = isl_set_flat_product(outer, extent);
 372         id = isl_set_get_tuple_id(accessed);
 373         extent = isl_set_set_tuple_id(extent, id);
 374
 375         return extent;
 376 }
 377
 378 /* Is the array "array" being extracted a read-only scalar?
 379  *
 380  * That is, is "array" a scalar that is never written to.
 381  */
 382 static int is_read_only_scalar(struct gpu_array_info *array,
 383         struct gpu_prog *prog)
 384 {
 385         isl_set *space;
 386         isl_union_map *write;
 387         int empty;
 388
 389         if (array->n_index != 0)
 390                 return 0;
 391
 392         write = isl_union_map_copy(prog->write);
 393         space = isl_set_universe(isl_space_copy(array->space));
 394         write = isl_union_map_intersect_range(write,
 395                                                 isl_union_set_from_set(space));
 396         empty = isl_union_map_is_empty(write);
 397         isl_union_map_free(write);
 398
 399         return empty;
 400 }
 401
 402 /* Compute bounds on the host arrays based on the accessed elements
 403  * and collect all references to the array.
 404  *
 405  * If the array is zero-dimensional, i.e., a scalar, we check
 406  * whether it is read-only.
 407  */
 408 static int extract_array_info(__isl_take isl_set *array, void *user)
 409 {
 410         int i;
 411         struct gpu_prog *prog = (struct gpu_prog *)user;
 412         const char *name;
 413         int n_index;
 414         isl_pw_aff **bounds;
 415         struct pet_array *pa;
 416         struct gpu_array_info *info;
 417         isl_set *extent;
 418
 419         info = &prog->array[prog->n_array];
 420         prog->n_array++;
 421
 422         n_index = isl_set_dim(array, isl_dim_set);
 423         name = isl_set_get_tuple_name(array);
 424         bounds = isl_alloc_array(isl_set_get_ctx(array),
 425                                  isl_pw_aff *, n_index);
 426         if (!bounds)
 427                 goto error;
 428
 429         info->space = isl_set_get_space(array);
 430         info->name = strdup(name);
 431         info->n_index = n_index;
 432         info->bound = bounds;
 433
 434         pa = find_array(prog->scop, array);
 435         if (!pa)
 436                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 437                         "unable to find array in scop", goto error);
 438
 439         info->type = strdup(pa->element_type);
 440         info->size = pa->element_size;
 441         info->local = pa->declared && !pa->exposed;
 442         info->read_only_scalar = is_read_only_scalar(info, prog);
 443
 444         extent = compute_extent(pa, array);
 445         for (i = 0; i < n_index; ++i) {
 446                 isl_set *dom;
 447                 isl_local_space *ls;
 448                 isl_aff *one;
 449                 isl_pw_aff *bound;
 450
 451                 bound = isl_set_dim_max(isl_set_copy(extent), i);
 452                 assert(bound);
 453                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 454                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 455                 one = isl_aff_zero_on_domain(ls);
 456                 one = isl_aff_add_constant_si(one, 1);
 457                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 458                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 459
 460                 bounds[i] = bound;
 461         }
 462         info->extent = extent;
 463
 464         collect_references(prog, info);
 465
 466         isl_set_free(array);
 467         return 0;
 468 error:
 469         isl_set_free(array);
 470         return -1;
 471 }
 472
 473 /* Construct a gpu_array_info for each array accessed by "prog" and
 474  * collect them in prog->array.
 475  */
 476 static int collect_array_info(struct gpu_prog *prog)
 477 {
 478         int r;
 479         isl_union_set *arrays;
 480
 481         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 482         arrays = isl_union_set_union(arrays,
 483                         isl_union_map_range(isl_union_map_copy(prog->write)));
 484         arrays = isl_union_set_coalesce(arrays);
 485
 486         prog->n_array = isl_union_set_n_set(arrays);
 487         prog->array = isl_alloc_array(prog->ctx,
 488                                      struct gpu_array_info, prog->n_array);
 489         assert(prog->array);
 490         prog->n_array = 0;
 491         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 492         isl_union_set_free(arrays);
 493
 494         return r;
 495 }
 496
 497 static void free_array_info(struct gpu_prog *prog)
 498 {
 499         int i, j;
 500
 501         for (i = 0; i < prog->n_array; ++i) {
 502                 int n_index = prog->array[i].n_index;
 503                 free(prog->array[i].type);
 504                 free(prog->array[i].name);
 505                 for (j = 0; j < n_index; ++j)
 506                         isl_pw_aff_free(prog->array[i].bound[j]);
 507                 isl_space_free(prog->array[i].space);
 508                 isl_set_free(prog->array[i].extent);
 509                 free(prog->array[i].bound);
 510                 free(prog->array[i].refs);
 511         }
 512         free(prog->array);
 513 }
 514
 515 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 516  * as an array or through a pointer reference, but as single data element.  At
 517  * the moment, scalars are represented as zero dimensional arrays.
 518  */
 519 int gpu_array_is_scalar(struct gpu_array_info *array)
 520 {
 521         return (array->n_index == 0);
 522 }
 523
 524 /* Is "array" a read-only scalar?
 525  */
 526 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 527 {
 528         return array->read_only_scalar;
 529 }
 530
 531 /* Internal data structure for extract_size_of_type.
 532  * "type" specifies the name of the space that we want to extract.
 533  * "res" is used to store the subset of that space.
 534  */
 535 struct ppcg_extract_size_data {
 536         const char *type;
 537         isl_set *res;
 538 };
 539
 540 /* This function is called for each set in a union_set.
 541  * If the name of the set matches data->type, we store the
 542  * set in data->res.
 543  */
 544 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 545 {
 546         struct ppcg_extract_size_data *data = user;
 547         const char *name;
 548
 549         name = isl_set_get_tuple_name(size);
 550         if (name && !strcmp(name, data->type)) {
 551                 data->res = size;
 552                 return -1;
 553         }
 554
 555         isl_set_free(size);
 556         return 0;
 557 }
 558
 559 /* Given a union map { kernel[i] -> *[...] },
 560  * return the range in the space called "type" for the kernel with
 561  * sequence number "id".
 562  */
 563 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 564         const char *type, int id)
 565 {
 566         isl_space *space;
 567         isl_set *dom;
 568         isl_union_set *local_sizes;
 569         struct ppcg_extract_size_data data = { type, NULL };
 570
 571         if (!sizes)
 572                 return NULL;
 573
 574         space = isl_union_map_get_space(sizes);
 575         space = isl_space_set_from_params(space);
 576         space = isl_space_add_dims(space, isl_dim_set, 1);
 577         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 578         dom = isl_set_universe(space);
 579         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 580
 581         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 582                                         isl_union_map_copy(sizes));
 583         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 584         isl_union_set_free(local_sizes);
 585         return data.res;
 586 }
 587
 588 /* Given a singleton set, extract the first (at most *len) elements
 589  * of the single integer tuple into *sizes and update *len if needed.
 590  */
 591 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 592 {
 593         int i;
 594         int dim;
 595
 596         if (!set)
 597                 return;
 598
 599         dim = isl_set_dim(set, isl_dim_set);
 600         if (dim < *len)
 601                 *len = dim;
 602
 603         for (i = 0; i < *len; ++i) {
 604                 isl_val *v;
 605
 606                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 607                 assert(v);
 608
 609                 sizes[i] = isl_val_get_num_si(v);
 610                 isl_val_free(v);
 611         }
 612
 613         isl_set_free(set);
 614 }
 615
 616 /* Extract user specified "tile" sizes from the "sizes" command line option,
 617  * defaulting to option->tile_size in each dimension.
 618  */
 619 static void read_tile_sizes(struct gpu_gen *gen)
 620 {
 621         int n;
 622         isl_set *size;
 623
 624         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 625         assert(gen->tile_size);
 626         for (n = 0; n < gen->tile_len; ++n)
 627                 gen->tile_size[n] = gen->options->tile_size;
 628
 629         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 630         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 631
 632         if (gen->n_parallel > gen->tile_len)
 633                 gen->n_parallel = gen->tile_len;
 634 }
 635
 636 /* Extract user specified "block" sizes from the "sizes" command line option,
 637  * after filling in some potentially useful defaults.
 638  */
 639 static void read_block_sizes(struct gpu_gen *gen)
 640 {
 641         int n;
 642         isl_set *size;
 643
 644         n = gen->n_parallel;
 645         gen->n_block = (n <= 3) ? n : 3;
 646         switch (gen->n_block) {
 647         case 1:
 648                 gen->block_dim[0] = 512;
 649                 break;
 650         case 2:
 651                 gen->block_dim[0] = 32;
 652                 gen->block_dim[1] = 16;
 653                 break;
 654         default:
 655                 gen->block_dim[0] = 32;
 656                 gen->block_dim[1] = 4;
 657                 gen->block_dim[2] = 4;
 658                 break;
 659         }
 660
 661         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 662         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 663 }
 664
 665 /* Extract user specified "grid" sizes from the "sizes" command line option,
 666  * after filling in some potentially useful defaults.
 667  */
 668 static void read_grid_sizes(struct gpu_gen *gen)
 669 {
 670         int n = gen->n_parallel;
 671         isl_set *size;
 672
 673         gen->n_grid = (n <= 2) ? n : 2;
 674         switch (gen->n_grid) {
 675         case 1:
 676                 gen->grid_dim[0] = 32768;
 677                 break;
 678         default:
 679                 gen->grid_dim[0] = 256;
 680                 gen->grid_dim[1] = 256;
 681                 break;
 682         }
 683
 684         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 685         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 686 }
 687
 688 /* Extract user specified sizes from the "sizes" command line option
 689  * after filling in some potentially useful defaults.
 690  */
 691 static void read_sizes(struct gpu_gen *gen)
 692 {
 693         read_tile_sizes(gen);
 694         read_block_sizes(gen);
 695         read_grid_sizes(gen);
 696 }
 697
 698 static void *free_stmts(struct gpu_stmt *stmts, int n)
 699 {
 700         int i;
 701
 702         if (!stmts)
 703                 return NULL;
 704
 705         for (i = 0; i < n; ++i) {
 706                 struct gpu_stmt_access *access, *next;
 707
 708                 for (access = stmts[i].accesses; access; access = next) {
 709                         next = access->next;
 710                         isl_id_free(access->ref_id);
 711                         isl_map_free(access->access);
 712                         free(access);
 713                 }
 714
 715                 isl_id_free(stmts[i].id);
 716         }
 717         free(stmts);
 718
 719         return NULL;
 720 }
 721
 722 /* Construct a map from a domain of dimensionality "len"
 723  * to a domain of dimensionality "len" + "tile_len" that tiles
 724  * the "tile_len" coordinates starting at "first".
 725  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 726  * "dim" prescribes the parameters.
 727  */
 728 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 729         int first, int tile_len, int *tile_size)
 730 {
 731         int i;
 732         isl_basic_map *bmap;
 733         isl_constraint *c;
 734         isl_local_space *ls;
 735
 736         dim = isl_space_add_dims(dim, isl_dim_in, len);
 737         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 738         bmap = isl_basic_map_universe(isl_space_copy(dim));
 739         ls = isl_local_space_from_space(dim);
 740
 741         for (i = 0; i < len - tile_len; ++i) {
 742                 int j = i < first ? i : i + tile_len;
 743                 int k = i < first ? i : i + 2 * tile_len;
 744
 745                 c = isl_equality_alloc(isl_local_space_copy(ls));
 746                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 747                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 748                 bmap = isl_basic_map_add_constraint(bmap, c);
 749         }
 750
 751         for (i = 0; i < tile_len; ++i) {
 752                 c = isl_equality_alloc(isl_local_space_copy(ls));
 753                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 754                                                 first + i, -1);
 755                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 756                                                 first + i, tile_size[i]);
 757                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 758                                                 first + i + tile_len, 1);
 759                 bmap = isl_basic_map_add_constraint(bmap, c);
 760
 761                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 762                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 763                                                    first + i + tile_len, 1);
 764                 bmap = isl_basic_map_add_constraint(bmap, c);
 765
 766                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 767                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 768                                                    first + i + tile_len, -1);
 769                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 770                 bmap = isl_basic_map_add_constraint(bmap, c);
 771         }
 772
 773         isl_local_space_free(ls);
 774
 775         return isl_map_from_basic_map(bmap);
 776 }
 777
 778 /* Construct a map from a domain of dimensionality "len"
 779  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 780  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 781  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 782  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 783  * that are projected out at the end.
 784  * "dim" prescribes the parameters.
 785  */
 786 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 787         int first, int wrap_len, int *wrap_size)
 788 {
 789         int i;
 790         isl_basic_map *bmap;
 791         isl_constraint *c;
 792         isl_local_space *ls;
 793
 794         dim = isl_space_add_dims(dim, isl_dim_in, len);
 795         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 796         bmap = isl_basic_map_universe(isl_space_copy(dim));
 797         ls = isl_local_space_from_space(dim);
 798
 799         for (i = 0; i < len; ++i) {
 800                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 801
 802                 c = isl_equality_alloc(isl_local_space_copy(ls));
 803                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 804                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 805                 bmap = isl_basic_map_add_constraint(bmap, c);
 806         }
 807
 808         for (i = 0; i < wrap_len; ++i) {
 809                 c = isl_equality_alloc(isl_local_space_copy(ls));
 810                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 811                                                     first + i, -1);
 812                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 813                                                     first + wrap_len + i, 1);
 814                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 815                                     first + 2 * wrap_len + i, wrap_size[i]);
 816                 bmap = isl_basic_map_add_constraint(bmap, c);
 817
 818                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 819                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 820                                                     first + wrap_len + i, 1);
 821                 bmap = isl_basic_map_add_constraint(bmap, c);
 822
 823                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 824                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 825                                                     first + wrap_len + i, -1);
 826                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 827                 bmap = isl_basic_map_add_constraint(bmap, c);
 828         }
 829
 830         isl_local_space_free(ls);
 831
 832         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 833                                 first + 2 * wrap_len, wrap_len);
 834
 835         return isl_map_from_basic_map(bmap);
 836 }
 837
 838 /* Add "n" parameters named prefix%d.
 839  */
 840 static __isl_give isl_set *add_params( __isl_take isl_set *set,
 841         int n, const char *prefix)
 842 {
 843         int i;
 844         unsigned nparam;
 845         char name[20];
 846
 847         nparam = isl_set_dim(set, isl_dim_param);
 848         set = isl_set_add_dims(set, isl_dim_param, n);
 849
 850         for (i = 0; i < n; ++i) {
 851                 snprintf(name, sizeof(name), "%s%d", prefix, i);
 852                 set = isl_set_set_dim_name(set, isl_dim_param,
 853                                             nparam + i, name);
 854         }
 855
 856         return set;
 857 }
 858
 859 /* Equate the "n" dimensions of "set" starting at "first" to
 860  * freshly created parameters named prefix%d.
 861  */
 862 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
 863         int first, int n, const char *prefix)
 864 {
 865         int i;
 866         unsigned nparam;
 867
 868         nparam = isl_set_dim(set, isl_dim_param);
 869
 870         set = add_params(set, n, prefix);
 871
 872         for (i = 0; i < n; ++i)
 873                 set = isl_set_equate(set, isl_dim_param, nparam + i,
 874                                         isl_dim_set, first + i);
 875
 876         return set;
 877 }
 878
 879 /* Given a parameter space "space", create a set of dimension "len"
 880  * of which the "n" dimensions starting at "first" are equated to
 881  * freshly created parameters named prefix%d.
 882  */
 883 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
 884         int len, int first, int n, const char *prefix)
 885 {
 886         isl_set *set;
 887
 888         space = isl_space_set_from_params(space);
 889         space = isl_space_add_dims(space, isl_dim_set, len);
 890         set = isl_set_universe(space);
 891
 892         return parametrize(set, first, n, prefix);
 893 }
 894
 895 /* Tile the B loops over the tile sizes and then tile/wrap
 896  * the T1 loops over the blocks.
 897  */
 898 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 899         __isl_take isl_union_map *sched)
 900 {
 901         isl_space *dim;
 902         isl_map *tiling, *block_tiling;
 903
 904         dim = isl_union_map_get_space(sched);
 905         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 906                       gen->tile_first, gen->tile_len, gen->tile_size);
 907
 908         if (gen->options->wrap)
 909                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
 910                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 911         else
 912                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
 913                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 914
 915         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
 916
 917         tiling = isl_map_apply_range(tiling, block_tiling);
 918
 919         sched = isl_union_map_apply_range(sched,
 920                                              isl_union_map_from_map(tiling));
 921
 922         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 923
 924         return sched;
 925 }
 926
 927 /* Equate the "T1P" iterators in the tiled schedule "sched"
 928  * to the block dimensions.
 929  */
 930 static __isl_give isl_union_map *parametrize_tiled_schedule(
 931         struct gpu_gen *gen, __isl_take isl_union_map *sched)
 932 {
 933         isl_space *dim;
 934         isl_set *par;
 935
 936         dim = isl_union_map_get_space(sched);
 937         par = parametrization(dim, gen->tiled_len,
 938                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
 939         sched = isl_union_map_intersect_range(sched,
 940                                                 isl_union_set_from_set(par));
 941
 942         return sched;
 943 }
 944
 945 /* Tile/wrap the P1 loops over the threads.
 946  */
 947 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
 948         __isl_take isl_union_map *sched)
 949 {
 950         isl_space *dim;
 951         isl_map *tiling;
 952         isl_set *par;
 953
 954         dim = isl_union_map_get_space(sched);
 955
 956         if (gen->options->wrap)
 957                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
 958                                 gen->shared_len, gen->n_block, gen->block_dim);
 959         else
 960                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
 961                                 gen->shared_len, gen->n_block, gen->block_dim);
 962         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
 963
 964         sched = isl_union_map_apply_range(sched,
 965                                              isl_union_map_from_map(tiling));
 966
 967         par = parametrization(dim, gen->thread_tiled_len,
 968                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
 969                 gen->n_block, "t");
 970         sched = isl_union_map_intersect_range(sched,
 971                                                 isl_union_set_from_set(par));
 972
 973         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 974
 975         return sched;
 976 }
 977
 978 /* If the user asked for it, scale the shared memory tile loops
 979  * (T1T and T2) of "sched" by gen->tile_size[i].
 980  * If we are not performing "wrapping", then additionally scale the T1P
 981  * loops by gen->grid_dim[i].
 982  */
 983 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
 984         __isl_take isl_union_map *sched)
 985 {
 986         int i;
 987         isl_space *dim;
 988         isl_basic_map *scale;
 989         isl_constraint *c;
 990         isl_local_space *ls;
 991
 992         if (!gen->options->scale_tile_loops)
 993                 return sched;
 994
 995         dim = isl_union_map_get_space(sched);
 996         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
 997         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
 998         scale = isl_basic_map_universe(isl_space_copy(dim));
 999         ls = isl_local_space_from_space(dim);
1000
1001         for (i = 0; i < gen->tiled_len; ++i) {
1002                 int f = 1;
1003
1004                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1005                         f = gen->tile_size[i - gen->tile_first];
1006                         if (!gen->options->wrap)
1007                                 f *= gen->grid_dim[i - gen->tile_first];
1008                 } else if (i >= gen->tile_first + gen->n_grid &&
1009                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1010                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1011                 }
1012
1013                 c = isl_equality_alloc(isl_local_space_copy(ls));
1014                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1015                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1016                 scale = isl_basic_map_add_constraint(scale, c);
1017         }
1018
1019         isl_local_space_free(ls);
1020
1021         sched = isl_union_map_apply_range(sched,
1022                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1023
1024         return sched;
1025 }
1026
1027 /* If we are not performing "wrapping" and if the user asked for it,
1028  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1029  */
1030 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1031         __isl_take isl_union_map *sched)
1032 {
1033         int i;
1034         isl_space *dim;
1035         isl_basic_map *scale;
1036         isl_constraint *c;
1037         isl_local_space *ls;
1038
1039         if (gen->options->wrap)
1040                 return sched;
1041         if (!gen->options->scale_tile_loops)
1042                 return sched;
1043
1044         dim = isl_union_map_get_space(sched);
1045         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1046         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1047         scale = isl_basic_map_universe(isl_space_copy(dim));
1048         ls = isl_local_space_from_space(dim);
1049
1050         for (i = 0; i < gen->thread_tiled_len; ++i) {
1051                 int f = 1;
1052
1053                 if (i >= gen->shared_len &&
1054                     i < gen->shared_len + gen->n_block)
1055                         f = gen->block_dim[i - gen->shared_len];
1056
1057                 c = isl_equality_alloc(isl_local_space_copy(ls));
1058                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1059                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1060                 scale = isl_basic_map_add_constraint(scale, c);
1061         }
1062
1063         isl_local_space_free(ls);
1064
1065         sched = isl_union_map_apply_range(sched,
1066                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1067
1068         return sched;
1069 }
1070
1071 /* If we are not performing "wrapping" and if the user asked for it,
1072  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1073  */
1074 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1075         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1076 {
1077         int i;
1078         isl_space *dim;
1079         isl_basic_map *scale;
1080         isl_constraint *c;
1081         isl_local_space *ls;
1082
1083         if (gen->options->wrap)
1084                 return sched;
1085         if (!gen->options->scale_tile_loops)
1086                 return sched;
1087
1088         dim = isl_union_map_get_space(sched);
1089         dim = isl_space_add_dims(dim, isl_dim_in, len);
1090         dim = isl_space_add_dims(dim, isl_dim_out, len);
1091         scale = isl_basic_map_universe(isl_space_copy(dim));
1092         ls = isl_local_space_from_space(dim);
1093
1094         for (i = 0; i < len; ++i) {
1095                 int f = 1;
1096
1097                 if (i >= first && i < first + n_tile)
1098                         f = gen->kernel->block_dim[i - first];
1099
1100                 c = isl_equality_alloc(isl_local_space_copy(ls));
1101                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1102                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1103                 scale = isl_basic_map_add_constraint(scale, c);
1104         }
1105
1106         isl_local_space_free(ls);
1107
1108         sched = isl_union_map_apply_range(sched,
1109                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1110
1111         return sched;
1112 }
1113
1114 /* Add "len" parameters p[i] called prefix%d,
1115  * with bounds to 0 <= p[i] < size[i].
1116  */
1117 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1118         int len, int *size, const char *prefix)
1119 {
1120         int i;
1121         unsigned nparam;
1122         isl_space *dim;
1123         isl_basic_set *bset;
1124         isl_constraint *c;
1125         isl_local_space *ls;
1126         char name[20];
1127
1128         nparam = isl_set_dim(set, isl_dim_param);
1129         set = isl_set_add_dims(set, isl_dim_param, len);
1130
1131         for (i = 0; i < len; ++i) {
1132                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1133                 set = isl_set_set_dim_name(set, isl_dim_param,
1134                                             nparam + i, name);
1135         }
1136
1137         dim = isl_set_get_space(set);
1138         bset = isl_basic_set_universe(isl_space_copy(dim));
1139         ls = isl_local_space_from_space(dim);
1140
1141         for (i = 0; i < len; ++i) {
1142                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1143                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1144                                                         nparam + i, 1);
1145                 bset = isl_basic_set_add_constraint(bset, c);
1146
1147                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1148                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1149                                                         nparam + i, -1);
1150                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1151                 bset = isl_basic_set_add_constraint(bset, c);
1152         }
1153
1154         isl_local_space_free(ls);
1155
1156         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1157 }
1158
1159 /* Add "len" parameters p[i] called prefix%d,
1160  * with bounds to 0 <= p[i] < size[i].
1161  */
1162 static __isl_give isl_set *add_bounded_parameters_dynamic(
1163         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1164         const char *prefix)
1165 {
1166         int i, len;
1167         unsigned nparam;
1168         isl_space *space;
1169         isl_local_space *ls;
1170         char name[20];
1171
1172         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1173         nparam = isl_set_dim(set, isl_dim_param);
1174         set = isl_set_add_dims(set, isl_dim_param, len);
1175
1176         for (i = 0; i < len; ++i) {
1177                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1178                 set = isl_set_set_dim_name(set, isl_dim_param,
1179                                             nparam + i, name);
1180         }
1181
1182         space = isl_space_params(isl_set_get_space(set));
1183         ls = isl_local_space_from_space(space);
1184         for (i = 0; i < len; ++i) {
1185                 isl_pw_aff *param, *size_i, *zero;
1186                 isl_set *bound;
1187
1188                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1189                                                 isl_dim_param, nparam + i);
1190
1191                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1192                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1193                 set = isl_set_intersect_params(set, bound);
1194
1195                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1196                 bound = isl_pw_aff_ge_set(param, zero);
1197                 set = isl_set_intersect_params(set, bound);
1198         }
1199         isl_local_space_free(ls);
1200
1201         return set;
1202 }
1203
1204 /* Construct a map from an access to group->array to the corresponding
1205  * shared/private memory tile.
1206  * The map is of the form
1207  *
1208  *      { [D[i] -> A[a]] -> T[t] }
1209  *
1210  * where D represents the initial shared_len dimensions
1211  * of the computed schedule.
1212  */
1213 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1214 {
1215         struct gpu_array_tile *tile;
1216         isl_multi_aff *tiling;
1217
1218         tile = group->private_tile;
1219         if (!tile)
1220                 tile = group->shared_tile;
1221
1222         tiling = isl_multi_aff_copy(tile->tiling);
1223
1224         return isl_map_from_multi_aff(tiling);
1225 }
1226
1227 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1228  */
1229 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1230         unsigned pos)
1231 {
1232         isl_val *v;
1233         int fixed;
1234
1235         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1236         if (!v)
1237                 return -1;
1238         fixed = isl_val_is_int(v);
1239         isl_val_free(v);
1240
1241         return fixed;
1242 }
1243
1244 /* Given a schedule that iterates over all elements in a piece of an array,
1245  * perform tiling/wrapping over the threads.
1246  *
1247  * In particular, we tile the final iterators so that the final thread
1248  * dimension runs over the final array dimension.
1249  * However, if those final iterators have only a single iteration,
1250  * we try to tile earlier iterators instead.
1251  */
1252 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1253         __isl_take isl_map *sched)
1254 {
1255         isl_space *dim;
1256         isl_union_map *usched;
1257         isl_map *tiling;
1258         isl_set *par;
1259         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1260         int n_tile;
1261         int first;
1262
1263         n_tile = gen->kernel->n_block;
1264         if (n_tile > nvar) {
1265                 int i;
1266                 sched = isl_map_insert_dims(sched,
1267                                                 isl_dim_out, 0, n_tile - nvar);
1268                 for (i = 0; i < n_tile - nvar; ++i)
1269                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1270                 nvar = n_tile;
1271         }
1272
1273         first = nvar - n_tile;
1274
1275         for (; first > 0; first --)
1276                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1277                         break;
1278
1279         dim = isl_map_get_space(sched);
1280         dim = isl_space_params(dim);
1281         if (gen->options->wrap)
1282                 tiling = wrap(isl_space_copy(dim), nvar, first,
1283                                 n_tile, gen->kernel->block_dim);
1284         else
1285                 tiling = tile(isl_space_copy(dim), nvar, first,
1286                                 n_tile, gen->kernel->block_dim);
1287         sched = isl_map_apply_range(sched, tiling);
1288
1289         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1290         sched = isl_map_intersect_range(sched, par);
1291
1292         usched = isl_union_map_from_map(sched);
1293         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1294                                          first, n_tile);
1295         sched = isl_map_from_union_map(usched);
1296
1297         return sched;
1298 }
1299
1300 /* Return the union of all read (read = 1) and/or write (write = 1)
1301  * access relations in the group.
1302  */
1303 static __isl_give isl_union_map *group_access_relation(
1304         struct gpu_array_ref_group *group, int read, int write)
1305 {
1306         int i;
1307         isl_union_map *access;
1308
1309         access = isl_union_map_empty(isl_map_get_space(group->access));
1310         for (i = 0; i < group->n_ref; ++i) {
1311                 isl_map *map_i;
1312
1313                 if (!((read && group->refs[i]->read) ||
1314                      (write && group->refs[i]->write)))
1315                         continue;
1316                 map_i = isl_map_copy(group->refs[i]->access);
1317                 access = isl_union_map_union(access,
1318                                             isl_union_map_from_map(map_i));
1319         }
1320
1321         return access;
1322 }
1323
1324 /* Return the extent of "array", recomputed from the bounds.
1325  * The recomputed extent may be simpler than the original extent.
1326  */
1327 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1328 {
1329         int i;
1330         isl_id *id;
1331         isl_space *space;
1332         isl_local_space *ls;
1333         isl_set *extent;
1334
1335         id = isl_set_get_tuple_id(array->extent);
1336         space = isl_set_get_space(array->extent);
1337         extent = isl_set_universe(isl_space_copy(space));
1338         ls = isl_local_space_from_space(space);
1339         for (i = 0; i < array->n_index; ++i) {
1340                 isl_pw_aff *bound;
1341                 isl_aff *aff;
1342                 isl_pw_aff *index;
1343                 isl_set *lt;
1344
1345                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1346
1347                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1348                                                 isl_dim_set, i);
1349                 index = isl_pw_aff_from_aff(aff);
1350                 bound = isl_pw_aff_copy(array->bound[i]);
1351                 bound = isl_pw_aff_from_range(bound);
1352                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1353                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1354                                                 isl_id_copy(id));
1355                 lt = isl_pw_aff_lt_set(index, bound);
1356                 extent = isl_set_intersect(extent, lt);
1357         }
1358         isl_local_space_free(ls);
1359         isl_id_free(id);
1360
1361         return extent;
1362 }
1363
1364 /* Return a map from the first shared_len dimensions of the computed
1365  * schedule to the array tile in
1366  * global memory that corresponds to the shared memory copy.
1367  *
1368  * In particular, return a map
1369  *
1370  *      { D[i] -> A[a] }
1371  *
1372  * with constraints
1373  *
1374  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1375  *
1376  * and
1377  *
1378  *      0 <= a <= array_size - 1                                        (2)
1379  *
1380  * Note that if some stride has been detected (i.e., when
1381  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1382  * to the shifted and scaled down version.
1383  *
1384  * Constraints (1) are obtained by mapping the size constraints on the
1385  * shared/private memory tile back to the access relation.
1386  * Constraints (2) are obtained from the (recomputed) extent.
1387  */
1388 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1389 {
1390         int i;
1391         int n_index = group->array->n_index;
1392         isl_map *tile;
1393         isl_space *space;
1394         isl_set *local;
1395         isl_set *extent;
1396
1397         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1398         space = isl_space_range(space);
1399         local = isl_set_universe(space);
1400         for (i = 0; i < n_index; ++i) {
1401                 isl_val *bound;
1402
1403                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1404                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1405                 bound = isl_val_sub_ui(bound, 1);
1406                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1407         }
1408         local = isl_set_preimage_multi_aff(local,
1409                                 isl_multi_aff_copy(group->shared_tile->tiling));
1410         tile = isl_set_unwrap(local);
1411         extent = array_extent(group->array);
1412         tile = isl_map_intersect_range(tile, extent);
1413
1414         return tile;
1415 }
1416
1417 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1418  * return the corresponding mapping from the AST schedule to
1419  * to the first shared_len dimensions of the schedule computed by PPCG.
1420  */
1421 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1422         __isl_take isl_pw_multi_aff *iterator_map)
1423 {
1424         isl_union_map *umap;
1425         isl_space *space;
1426         isl_map *map, *sched;;
1427
1428         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1429         space = isl_space_from_domain(space);
1430         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1431
1432         umap = isl_union_map_copy(gen->shared_sched);
1433         umap = isl_union_map_apply_range(umap,
1434                         isl_union_map_copy(gen->shared_proj));
1435         map = isl_union_map_extract_map(umap, space);
1436         isl_union_map_free(umap);
1437
1438         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1439         sched = isl_map_detect_equalities(sched);
1440
1441         return isl_pw_multi_aff_from_map(sched);
1442 }
1443
1444 /* Set unroll[j] if the input dimension j is involved in
1445  * the index expression represented by ma.
1446  */
1447 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1448         void *user)
1449 {
1450         int i, j;
1451         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1452         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1453         int *unroll = user;
1454
1455         for (i = 0; i < n_out; ++i) {
1456                 isl_aff *aff;
1457
1458                 aff = isl_multi_aff_get_aff(ma, i);
1459                 for (j = 0; j < n_in; ++j)
1460                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1461                                 unroll[j] = 1;
1462                 isl_aff_free(aff);
1463         }
1464
1465         isl_set_free(set);
1466         isl_multi_aff_free(ma);
1467         return 0;
1468 }
1469
1470 /* Given an array pos mapping input dimensions to the corresponding
1471  * output dimension, construct the corresponding map.
1472  */
1473 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1474         int *pos, int len)
1475 {
1476         int i;
1477         isl_constraint *c;
1478         isl_basic_map *bmap;
1479         isl_local_space *ls;
1480
1481         dim = isl_space_add_dims(dim, isl_dim_in, len);
1482         dim = isl_space_add_dims(dim, isl_dim_out, len);
1483         bmap = isl_basic_map_universe(isl_space_copy(dim));
1484         ls = isl_local_space_from_space(dim);
1485
1486         for (i = 0; i < len; ++i) {
1487                 c = isl_equality_alloc(isl_local_space_copy(ls));
1488                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1489                                                       -1);
1490                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1491                                                       1);
1492                 bmap = isl_basic_map_add_constraint(bmap, c);
1493         }
1494         isl_local_space_free(ls);
1495
1496         return isl_map_from_basic_map(bmap);
1497 }
1498
1499 /* Find all loops involved in any of the index expressions for any of
1500  * the private accesses, move them innermost and then mark them as
1501  * requiring unrolling by setting gen->first_unroll.
1502  * The loops involved should all be parallel because of the checks
1503  * we performed in check_private_group_access.  Moving them innermost
1504  * is therefore a valid transformation.
1505  *
1506  * Loops up to gen->shared_len are generated before the mapping to
1507  * threads is applied.  They should therefore be ignored.
1508  *
1509  * We compute the hidden equalities of the schedule first
1510  * since we will need them in our calls to isl_pw_multi_aff_from_map
1511  * and because we want to make sure that the same equalities
1512  * are also available to the code generator.
1513  */
1514 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1515         __isl_take isl_union_map *sched)
1516 {
1517         int i, j;
1518         int unroll[gen->thread_tiled_len];
1519         int perm[gen->thread_tiled_len];
1520         isl_space *dim;
1521         isl_map *permute;
1522         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1523
1524         gen->first_unroll = -1;
1525
1526         sched = isl_union_map_detect_equalities(sched);
1527         for (i = 0; i < gen->thread_tiled_len; ++i)
1528                 unroll[i] = 0;
1529         for (i = 0; i < gen->prog->n_array; ++i) {
1530                 struct gpu_array_info *array = &gen->prog->array[i];
1531
1532                 for (j = 0; j < array->n_group; ++j) {
1533                         isl_union_map *access;
1534                         isl_map *acc;
1535                         isl_pw_multi_aff *pma;
1536
1537                         if (!array->groups[j]->private_tile)
1538                                 continue;
1539
1540                         access = group_access_relation(array->groups[j], 1, 1);
1541                         access = isl_union_map_apply_domain(access,
1542                                                 isl_union_map_copy(sched));
1543
1544                         acc = isl_map_from_union_map(access);
1545                         pma = isl_pw_multi_aff_from_map(acc);
1546                         isl_pw_multi_aff_foreach_piece(pma,
1547                                                         &check_unroll, unroll);
1548
1549                         isl_pw_multi_aff_free(pma);
1550                 }
1551         }
1552
1553         for (i = gen->shared_len; i < len; ++i)
1554                 if (unroll[i])
1555                         break;
1556
1557         if (i >= len)
1558                 return sched;
1559
1560         for (i = len; i < gen->thread_tiled_len; ++i)
1561                 if (unroll[i])
1562                         return sched;
1563
1564         j = 0;
1565         for (i = 0; i < gen->shared_len; ++i)
1566                 perm[i] = j++;
1567         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1568                 if (!unroll[i])
1569                         perm[i] = j++;
1570         gen->first_unroll = j - gen->shared_len;
1571         for (i = gen->shared_len; i < len; ++i)
1572                 if (unroll[i])
1573                         perm[i] = j++;
1574
1575         dim = isl_union_map_get_space(sched);
1576         permute = permutation(dim, perm, gen->thread_tiled_len);
1577         sched = isl_union_map_apply_range(sched,
1578                                           isl_union_map_from_map(permute));
1579
1580         return sched;
1581 }
1582
1583 /* Given a constraint
1584  *
1585  *              a(p,i) + j = g f(e)
1586  *
1587  * or -a(p,i) - j = g f(e) if sign < 0,
1588  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1589  * a(p,i) is assumed to be an expression in only the parameters
1590  * and the input dimensions.
1591  */
1592 static void extract_stride(__isl_keep isl_constraint *c,
1593         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1594 {
1595         int i;
1596         isl_val *v;
1597         isl_space *space;
1598         unsigned nparam;
1599         unsigned nvar;
1600         isl_aff *aff;
1601
1602         isl_val_free(bound->stride);
1603         bound->stride = isl_val_copy(stride);
1604
1605         space = isl_constraint_get_space(c);
1606         space = isl_space_domain(space);
1607
1608         nparam = isl_space_dim(space, isl_dim_param);
1609         nvar = isl_space_dim(space, isl_dim_set);
1610
1611         v = isl_constraint_get_constant_val(c);
1612         if (sign < 0)
1613                 v = isl_val_neg(v);
1614         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1615         aff = isl_aff_set_constant_val(aff, v);
1616
1617         for (i = 0; i < nparam; ++i) {
1618                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1619                         continue;
1620                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1621                 if (sign < 0)
1622                         v = isl_val_neg(v);
1623                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1624         }
1625
1626         for (i = 0; i < nvar; ++i) {
1627                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1628                         continue;
1629                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1630                 if (sign < 0)
1631                         v = isl_val_neg(v);
1632                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1633         }
1634
1635         bound->shift = aff;
1636 }
1637
1638 /* Given an equality constraint of a map with a single output dimension j,
1639  * check if the constraint is of the form
1640  *
1641  *              a(p,i) + j = g f(e)
1642  *
1643  * with a(p,i) an expression in the parameters and input dimensions
1644  * and f(e) an expression in the existentially quantified variables.
1645  * If so, and if g is larger than any such g from a previously considered
1646  * constraint, then call extract_stride to record the stride information
1647  * in bound.
1648  */
1649 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1650 {
1651         int i;
1652         isl_ctx *ctx;
1653         isl_val *v;
1654         unsigned n_div;
1655         struct gpu_array_bound *bound = user;
1656
1657         ctx = isl_constraint_get_ctx(c);
1658         n_div = isl_constraint_dim(c, isl_dim_div);
1659         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1660
1661         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1662                 int s = isl_val_sgn(v);
1663                 isl_val *stride = isl_val_zero(ctx);
1664
1665                 isl_val_free(v);
1666                 for (i = 0; i < n_div; ++i) {
1667                         v = isl_constraint_get_coefficient_val(c,
1668                                                                 isl_dim_div, i);
1669                         stride = isl_val_gcd(stride, v);
1670                 }
1671                 if (!isl_val_is_zero(stride) &&
1672                     isl_val_gt(stride, bound->stride))
1673                         extract_stride(c, bound, stride, s);
1674
1675                 isl_val_free(stride);
1676         } else
1677                 isl_val_free(v);
1678
1679         isl_constraint_free(c);
1680         return 0;
1681 }
1682
1683 /* Given contraints on an array index i, check if we can find
1684  * a shift a(p) and a stride g such that
1685  *
1686  *      a(p) + i = 0 mod g
1687  *
1688  * If so, record the information in bound and apply the mapping
1689  * i -> (i + a(p))/g to the array index in bounds and return
1690  * the new constraints.
1691  * If not, simply return the original constraints.
1692  *
1693  * If bounds is a subset of the space
1694  *
1695  *      D -> i
1696  *
1697  * then the bound recorded in bound->shift is of the form
1698  *
1699  *      D -> s(D)
1700  *
1701  * with s(D) equal to a(p) above.
1702  * The mapping recorded in bound->shift_map is of the form
1703  *
1704  *      [D -> i] -> [D -> (i + S(D))/g]
1705  *
1706  * This mapping is computed as follows.
1707  * We first introduce "i" in the domain through precomposition
1708  * with [D -> i] -> D obtaining
1709  *
1710  *      [D -> i] -> s(D)
1711  *
1712  * Adding [D -> i] -> i produces
1713  *
1714  *      [D -> i] -> i + s(D)
1715  *
1716  * and the domain product with [D -> i] -> D yields
1717  *
1718  *      [D -> i] -> [D -> i + s(D)]
1719  *
1720  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1721  */
1722 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1723         __isl_take isl_basic_map *bounds)
1724 {
1725         isl_space *space;
1726         isl_basic_map *hull;
1727         isl_basic_map *shift, *id, *bmap, *scale;
1728         isl_basic_set *bset;
1729         isl_aff *aff;
1730
1731         bound->stride = NULL;
1732
1733         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1734
1735         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1736
1737         isl_basic_map_free(hull);
1738
1739         if (!bound->stride)
1740                 return bounds;
1741
1742         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1743         space = isl_basic_map_get_space(bounds);
1744         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1745         shift = isl_basic_map_apply_range(bmap, shift);
1746         space = isl_basic_map_get_space(bounds);
1747         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1748         shift = isl_basic_map_sum(id, shift);
1749         space = isl_basic_map_get_space(bounds);
1750         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1751         shift = isl_basic_map_range_product(id, shift);
1752
1753         space = isl_space_domain(isl_basic_map_get_space(bounds));
1754         id = isl_basic_map_identity(isl_space_map_from_set(space));
1755         space = isl_space_range(isl_basic_map_get_space(bounds));
1756         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1757         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1758         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1759         scale = isl_basic_map_from_aff(aff);
1760         scale = isl_basic_map_product(id, scale);
1761
1762         bound->shift_map = isl_basic_map_apply_range(shift, scale);
1763         bmap = isl_basic_map_copy(bound->shift_map);
1764         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1765         bounds = isl_basic_set_unwrap(bset);
1766
1767         return bounds;
1768 }
1769
1770 /* Data used in compute_array_dim_size and compute_size_in_direction.
1771  *
1772  * pos is the position of the variable representing the array index,
1773  * i.e., the variable for which want to compute the size.  This variable
1774  * is also the last variable in the set.
1775  */
1776 struct gpu_size_info {
1777         isl_basic_set *bset;
1778         struct gpu_array_bound *bound;
1779         int pos;
1780 };
1781
1782 /* Given a constraint from the basic set describing the bounds on
1783  * an array index, check if it is a lower bound, say m i >= b(x), and,
1784  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1785  * upper bound.  If so, and if this bound is smaller than any bound
1786  * derived from earlier constraints, set the size to this bound on
1787  * the expression and the lower bound to ceil(b(x)/m).
1788  */
1789 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1790 {
1791         struct gpu_size_info *size = user;
1792         unsigned nparam;
1793         unsigned n_div;
1794         isl_val *v;
1795         isl_aff *aff;
1796         isl_aff *lb;
1797
1798         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
1799         n_div = isl_constraint_dim(c, isl_dim_div);
1800
1801         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
1802             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
1803                 isl_constraint_free(c);
1804                 return 0;
1805         }
1806
1807         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
1808         aff = isl_aff_ceil(aff);
1809
1810         lb = isl_aff_copy(aff);
1811
1812         aff = isl_aff_neg(aff);
1813         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
1814
1815         v = isl_basic_set_max_val(size->bset, aff);
1816         isl_aff_free(aff);
1817
1818         if (isl_val_is_int(v)) {
1819                 v = isl_val_add_ui(v, 1);
1820                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
1821                         isl_val_free(size->bound->size);
1822                         size->bound->size = isl_val_copy(v);
1823                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
1824                         isl_aff_free(size->bound->lb);
1825                         size->bound->lb = isl_aff_copy(lb);
1826                 }
1827         }
1828         isl_val_free(v);
1829         isl_aff_free(lb);
1830
1831         isl_constraint_free(c);
1832
1833         return 0;
1834 }
1835
1836 /* Given a basic map "bounds" that maps parameters and input dimensions
1837  * to a single output dimension, look for an expression in the parameters
1838  * and input dimensions such that the range of the output dimension shifted
1839  * by this expression is a constant.
1840  *
1841  * In particular, we currently only consider lower bounds on the output
1842  * dimension as candidate expressions.
1843  */
1844 static int compute_array_dim_size(struct gpu_array_bound *bound,
1845         __isl_take isl_basic_map *bounds)
1846 {
1847         struct gpu_size_info size;
1848
1849         bounds = isl_basic_map_detect_equalities(bounds);
1850         bounds = check_stride(bound, bounds);
1851
1852         bound->size = NULL;
1853         bound->lb = NULL;
1854
1855         size.bound = bound;
1856         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
1857         size.bset = isl_basic_map_wrap(bounds);
1858         size.bset = isl_basic_set_flatten(size.bset);
1859         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
1860         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
1861                                         &size);
1862         isl_basic_set_free(size.bset);
1863
1864         return bound->size ? 0 : -1;
1865 }
1866
1867 /* Check if we can find a memory tile for the given array
1868  * based on the given accesses, and if so, put the results in "tile".
1869  *
1870  * We project the accesses on each index in turn and look for a parametric
1871  * offset such that the size is constant.
1872  */
1873 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
1874 {
1875         int i;
1876
1877         for (i = 0; i < tile->n; ++i) {
1878                 isl_map *access_i;
1879                 isl_basic_map *hull;
1880
1881                 access_i = isl_map_copy(access);
1882                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
1883                 access_i = isl_map_project_out(access_i, isl_dim_out,
1884                                             1, tile->n - (i + 1));
1885                 access_i = isl_map_compute_divs(access_i);
1886                 hull = isl_map_simple_hull(access_i);
1887                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
1888                         return 0;
1889         }
1890
1891         return 1;
1892 }
1893
1894 /* Construct a map with input the shared tile loops and the loops that
1895  * will be wrapped around the threads that relates these later loops
1896  * to the thread indices and then projects them out.
1897  */
1898 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1899 {
1900         isl_map *priv;
1901         isl_map *tiling;
1902         isl_map *proj;
1903         isl_set *par;
1904         isl_space *dim;
1905
1906         dim = isl_union_map_get_space(gen->shared_sched);
1907
1908         if (gen->options->wrap)
1909                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
1910                                 gen->shared_len, gen->n_block, gen->block_dim);
1911         else
1912                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
1913                                 gen->shared_len, gen->n_block, gen->block_dim);
1914
1915         priv = tiling;
1916
1917         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
1918                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1919                 gen->n_block, "t");
1920
1921         priv = isl_map_align_params(priv, isl_set_get_space(par));
1922         priv = isl_map_intersect_range(priv, par);
1923
1924         dim = isl_map_get_space(priv);
1925         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
1926         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
1927         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
1928                           gen->shared_len);
1929
1930         priv = isl_map_apply_range(priv, proj);
1931
1932         return priv;
1933 }
1934
1935 /* Construct a map from domain_dim to domain_dim that increments
1936  * the dimension at position "pos" and leaves all other dimensions
1937  * constant.
1938  */
1939 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
1940 {
1941         int i;
1942         int len = isl_space_dim(domain_dim, isl_dim_set);
1943         isl_space *dim;
1944         isl_basic_map *next;
1945         isl_local_space *ls;
1946
1947         dim = isl_space_map_from_set(domain_dim);
1948         next = isl_basic_map_universe(isl_space_copy(dim));
1949         ls = isl_local_space_from_space(dim);
1950
1951         for (i = 0; i < len; ++i) {
1952                 isl_constraint *c;
1953
1954                 c = isl_equality_alloc(isl_local_space_copy(ls));
1955                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
1956                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1957                 if (i == pos)
1958                         c = isl_constraint_set_constant_si(c, 1);
1959                 next = isl_basic_map_add_constraint(next, c);
1960         }
1961
1962         isl_local_space_free(ls);
1963
1964         return isl_map_from_basic_map(next);
1965 }
1966
1967 /* Check if the given access is coalesced.
1968  * That is, check whether incrementing the dimension that will get
1969  * wrapped over the last thread index results in incrementing
1970  * the last array index.
1971  *
1972  * This function is only called for access relations without reuse.
1973  */
1974 static int access_is_coalesced(struct gpu_gen *gen,
1975         __isl_keep isl_union_map *access)
1976 {
1977         isl_space *dim;
1978         isl_map *access_map;
1979         isl_map *next_thread_x;
1980         isl_map *next_element;
1981         isl_map *map;
1982         int coalesced;
1983
1984         access = isl_union_map_copy(access);
1985         access = isl_union_map_apply_domain(access,
1986                                 isl_union_map_copy(gen->tiled_sched));
1987         access_map = isl_map_from_union_map(access);
1988
1989         dim = isl_map_get_space(access_map);
1990         dim = isl_space_domain(dim);
1991         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
1992
1993         dim = isl_map_get_space(access_map);
1994         dim = isl_space_range(dim);
1995         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
1996
1997         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
1998         map = isl_map_apply_range(map, access_map);
1999
2000         coalesced = isl_map_is_subset(map, next_element);
2001
2002         isl_map_free(next_element);
2003         isl_map_free(map);
2004
2005         return coalesced;
2006 }
2007
2008 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2009  * dimensions of the computed schedule, check if it is bijective for
2010  * fixed values of the first gen->shared_len dimensions.
2011  * We perform this check by equating these dimensions to parameters.
2012  */
2013 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2014 {
2015         int res;
2016         isl_set *par;
2017         isl_space *space;
2018
2019         access = isl_map_copy(access);
2020         space = isl_space_params(isl_map_get_space(access));
2021         par = parametrization(space, gen->shared_len + gen->n_block,
2022                                 0, gen->shared_len, "s");
2023         access = isl_map_intersect_domain(access, par);
2024         res = isl_map_is_bijective(access);
2025         isl_map_free(access);
2026
2027         return res;
2028 }
2029
2030 /* Look for the last shared tile loop that affects the offset of "tile"
2031  * and return the result.
2032  * If there is no such loop, then return the index of the loop
2033  * before the first shared tile loop, in particular gen->tile_first - 1.
2034  */
2035 static int compute_tile_last_shared(struct gpu_gen *gen,
2036         struct gpu_array_tile *tile)
2037 {
2038         int i, j;
2039
2040         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2041                 for (i = 0; i < tile->n; ++i) {
2042                         isl_aff *lb;
2043                         isl_aff *shift;
2044
2045                         lb = tile->bound[i].lb;
2046                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2047                                 break;
2048
2049                         shift = tile->bound[i].shift;
2050                         if (!shift)
2051                                 continue;
2052                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2053                                 break;
2054                 }
2055                 if (i < tile->n)
2056                         break;
2057         }
2058
2059         return j;
2060 }
2061
2062 /* Look for the last shared tile loop that affects the offset of the
2063  * shared or private tile and store the result in group->last_shared.
2064  * If there is no such loop, then group->last_shared is set to a value
2065  * before the first shared tile loop, in particular gen->tile_first - 1.
2066  * If there is no tile defined on the array reference group,
2067  * then set group->last_shared to gen->shared_len - 1.
2068  */
2069 static void set_last_shared(struct gpu_gen *gen,
2070         struct gpu_array_ref_group *group)
2071 {
2072         struct gpu_array_tile *tile;
2073
2074         group->last_shared = gen->shared_len - 1;
2075
2076         tile = group->private_tile;
2077         if (!tile)
2078                 tile = group->shared_tile;
2079         if (!tile)
2080                 return;
2081
2082         group->last_shared = compute_tile_last_shared(gen, tile);
2083 }
2084
2085 /* Compute a privatized copy of all access relations from reference groups that
2086  * are mapped to private memory and store the result in gen->privatization.
2087  */
2088 static void compute_private_access(struct gpu_gen *gen)
2089 {
2090         int i, j;
2091         isl_union_map *private;
2092
2093         if (!gen->options->use_private_memory)
2094                 return;
2095
2096         private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
2097
2098         for (i = 0; i < gen->prog->n_array; ++i) {
2099                 struct gpu_array_info *array = &gen->prog->array[i];
2100
2101                 if (gpu_array_is_read_only_scalar(array))
2102                         continue;
2103
2104                 for (j = 0; j < array->n_group; ++j) {
2105                         if (!array->groups[j]->private_tile)
2106                                 continue;
2107
2108                         private = isl_union_map_union(private,
2109                                 group_access_relation(array->groups[j], 1, 1));
2110                 }
2111         }
2112
2113         if (isl_union_map_is_empty(private))
2114                 isl_union_map_free(private);
2115         else {
2116                 isl_union_map *priv;
2117
2118                 private = isl_union_map_apply_domain(private,
2119                                         isl_union_map_copy(gen->shared_sched));
2120                 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2121                 private = isl_union_map_apply_domain(private, priv);
2122                 gen->private_access = private;
2123         }
2124 }
2125
2126 /* Compute the size of the tile specified by "tile"
2127  * in number of elements and return the result.
2128  */
2129 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2130 {
2131         int i;
2132         isl_val *size;
2133
2134         size = isl_val_one(ctx);
2135
2136         for (i = 0; i < tile->n; ++i)
2137                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2138
2139         return size;
2140 }
2141
2142 /* If max_shared_memory is not set to infinity (-1), then make
2143  * sure that the total amount of shared memory required by the
2144  * array reference groups mapped to shared memory is no larger
2145  * than this maximum.
2146  *
2147  * We apply a greedy approach and discard (keep in global memory)
2148  * those groups that would result in a total memory size that
2149  * is larger than the maximum.
2150  */
2151 static void check_shared_memory_bound(struct gpu_gen *gen)
2152 {
2153         int i, j;
2154         isl_val *left, *size;
2155
2156         if (gen->options->max_shared_memory < 0)
2157                 return;
2158
2159         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2160
2161         for (i = 0; i < gen->prog->n_array; ++i) {
2162                 struct gpu_array_info *array = &gen->prog->array[i];
2163
2164                 for (j = 0; j < array->n_group; ++j) {
2165                         struct gpu_array_ref_group *group;
2166
2167                         group = array->groups[j];
2168                         if (!group->shared_tile)
2169                                 continue;
2170
2171                         size = tile_size(gen->ctx, group->shared_tile);
2172                         size = isl_val_mul_ui(size, array->size);
2173
2174                         if (isl_val_le(size, left)) {
2175                                 left = isl_val_sub(left, size);
2176                                 continue;
2177                         }
2178                         isl_val_free(size);
2179
2180                         group->shared_tile = free_tile(group->shared_tile);
2181                 }
2182         }
2183
2184         isl_val_free(left);
2185 }
2186
2187 /* Given a description of an array tile "tile" and the "space"
2188  *
2189  *      { D -> A }
2190  *
2191  * where D represents the first shared_len schedule dimensions
2192  * and A represents the array, construct an isl_multi_aff
2193  *
2194  *      { [D[i] -> A[a]] -> A'[a'] }
2195  *
2196  * with A' a scaled down copy of A according to the shifts and strides
2197  * in "tile".  In particular,
2198  *
2199  *      a' = (a + shift(i))/stride
2200  *
2201  * "insert_array" represents
2202  *
2203  *      { [D -> A] -> D }
2204  *
2205  * and is used to insert A into the domain of functions that only
2206  * reference D.
2207  */
2208 static __isl_give isl_multi_aff *strided_tile(
2209         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2210         __isl_keep isl_multi_aff *insert_array)
2211 {
2212         int i;
2213         isl_ctx *ctx;
2214         isl_multi_aff *shift;
2215         isl_multi_val *stride;
2216         isl_space *space2;
2217         isl_local_space *ls;
2218         isl_multi_aff *tiling;
2219
2220         ctx = isl_space_get_ctx(space);
2221         space2 = isl_space_domain(isl_space_copy(space));
2222         ls = isl_local_space_from_space(space2);
2223         space2 = isl_space_range(isl_space_copy(space));
2224         stride = isl_multi_val_zero(space2);
2225         shift = isl_multi_aff_zero(isl_space_copy(space));
2226
2227         for (i = 0; i < tile->n; ++i) {
2228                 struct gpu_array_bound *bound = &tile->bound[i];
2229                 isl_val *stride_i;
2230                 isl_aff *shift_i;
2231
2232                 if (tile->bound[i].shift) {
2233                         stride_i = isl_val_copy(bound->stride);
2234                         shift_i = isl_aff_copy(bound->shift);
2235                 } else {
2236                         stride_i = isl_val_one(ctx);
2237                         shift_i = isl_aff_zero_on_domain(
2238                                         isl_local_space_copy(ls));
2239                 }
2240
2241                 stride = isl_multi_val_set_val(stride, i, stride_i);
2242                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2243         }
2244         isl_local_space_free(ls);
2245
2246         shift = isl_multi_aff_pullback_multi_aff(shift,
2247                                     isl_multi_aff_copy(insert_array));
2248
2249         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2250         tiling = isl_multi_aff_add(tiling, shift);
2251         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2252
2253         return tiling;
2254 }
2255
2256 /* Compute a tiling for the array reference group "group".
2257  *
2258  * The tiling is of the form
2259  *
2260  *      { [D[i] -> A[a]] -> T[t] }
2261  *
2262  * where D represents the first shared_len schedule dimensions,
2263  * A represents the global array and T represents the shared or
2264  * private memory tile.  The name of T is the name of the local
2265  * array.
2266  *
2267  * If there is any stride in the accesses, then the mapping is
2268  *
2269  *      t = (a + shift(i))/stride - lb(i)
2270  *
2271  * otherwise, it is simply
2272  *
2273  *      t = a - lb(i)
2274  */
2275 static void compute_group_tiling(struct gpu_array_ref_group *group)
2276 {
2277         int i;
2278         struct gpu_array_tile *tile;
2279         struct gpu_array_info *array = group->array;
2280         isl_space *space;
2281         isl_multi_aff *tiling, *lb, *insert_array;
2282         isl_printer *p;
2283         char *local_name;
2284
2285         tile = group->private_tile;
2286         if (!tile)
2287                 tile = group->shared_tile;
2288         if (!tile)
2289                 return;
2290
2291         space = isl_map_get_space(group->access);
2292         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2293
2294         for (i = 0; i < tile->n; ++i)
2295                 if (tile->bound[i].shift)
2296                         break;
2297
2298         if (i < tile->n)
2299                 tiling = strided_tile(tile, space, insert_array);
2300         else
2301                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2302
2303         lb = isl_multi_aff_zero(space);
2304         for (i = 0; i < tile->n; ++i) {
2305                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2306                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2307         }
2308         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2309
2310         tiling = isl_multi_aff_sub(tiling, lb);
2311
2312         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2313         p = print_array_name(p, group);
2314         local_name = isl_printer_get_str(p);
2315         isl_printer_free(p);
2316         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2317         free(local_name);
2318
2319         tile->tiling = tiling;
2320 }
2321
2322 /* Compute a tiling for all the array reference groups.
2323  */
2324 static void compute_group_tilings(struct gpu_gen *gen)
2325 {
2326         int i, j;
2327
2328         for (i = 0; i < gen->prog->n_array; ++i) {
2329                 struct gpu_array_info *array = &gen->prog->array[i];
2330
2331                 for (j = 0; j < array->n_group; ++j)
2332                         compute_group_tiling(array->groups[j]);
2333         }
2334 }
2335
2336 /* Fill up the groups array with singleton groups, i.e., one group
2337  * per reference, initializing the array, access, write, n_ref and refs fields.
2338  * In particular the access field is initialized to the scheduled
2339  * access relation of the array reference.
2340  *
2341  * Return the number of elements initialized, i.e., the number of
2342  * active references in the current kernel.
2343  */
2344 static int populate_array_references(struct gpu_array_info *array,
2345         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2346 {
2347         int i;
2348         int n;
2349         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2350
2351         n = 0;
2352         for (i = 0; i < array->n_ref; ++i) {
2353                 isl_union_map *umap;
2354                 isl_map *map;
2355                 struct gpu_array_ref_group *group;
2356                 struct gpu_stmt_access *access = array->refs[i];
2357
2358                 map = isl_map_copy(access->access);
2359                 umap = isl_union_map_from_map(map);
2360                 umap = isl_union_map_apply_domain(umap,
2361                                 isl_union_map_copy(sched));
2362
2363                 if (isl_union_map_is_empty(umap)) {
2364                         isl_union_map_free(umap);
2365                         continue;
2366                 }
2367
2368                 map = isl_map_from_union_map(umap);
2369                 map = isl_map_detect_equalities(map);
2370
2371                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2372                 assert(group);
2373                 group->array = array;
2374                 group->access = map;
2375                 group->write = access->write;
2376                 group->refs = &array->refs[i];
2377                 group->n_ref = 1;
2378
2379                 groups[n++] = group;
2380         }
2381
2382         return n;
2383 }
2384
2385 /* If group->n_ref == 1, then group->refs was set by
2386  * populate_array_references to point directly into
2387  * group->array->refs and should not be freed.
2388  * If group->n_ref > 1, then group->refs was set by join_groups
2389  * to point to a newly allocated array.
2390  */
2391 static void free_array_ref_group(struct gpu_array_ref_group *group)
2392 {
2393         if (!group)
2394                 return;
2395         free_tile(group->shared_tile);
2396         free_tile(group->private_tile);
2397         isl_map_free(group->access);
2398         if (group->n_ref > 1)
2399                 free(group->refs);
2400         free(group);
2401 }
2402
2403 /* Given a map where the input dimensions represent the tile loops,
2404  * eliminate the innermost of those that have a fixed value
2405  * until we reach one that does not (obviously) have a fixed value.
2406  */
2407 static __isl_give isl_map *eliminate_fixed_inner_loops(
2408         __isl_take isl_map *access)
2409 {
2410         int i, n;
2411
2412         n = isl_map_dim(access, isl_dim_in);
2413
2414         for (i = n - 1; i >= 0; --i) {
2415                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2416                         break;
2417                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2418         }
2419         return access;
2420 }
2421
2422 /* Check if the access relations of group1 and group2 overlap within
2423  * the innermost loop.  In particular, ignore any inner dimension
2424  * with a fixed value.
2425  * The copying to and from shared memory will be performed within
2426  * the innermost actual loop so we are only allowed to consider
2427  * the dimensions up to that innermost loop while checking whether
2428  * two access relations overlap.
2429  */
2430 static int accesses_overlap(struct gpu_array_ref_group *group1,
2431         struct gpu_array_ref_group *group2)
2432 {
2433         int empty;
2434         isl_map *access1, *access2;
2435
2436         access1 = isl_map_copy(group1->access);
2437         access1 = eliminate_fixed_inner_loops(access1);
2438         access2 = isl_map_copy(group2->access);
2439         access2 = eliminate_fixed_inner_loops(access2);
2440         access1 = isl_map_intersect(access1, access2);
2441         empty = isl_map_is_empty(access1);
2442         isl_map_free(access1);
2443
2444         return !empty;
2445 }
2446
2447 /* Combine the given two groups into a single group, containing
2448  * the references of both groups.
2449  */
2450 static struct gpu_array_ref_group *join_groups(
2451         struct gpu_array_ref_group *group1,
2452         struct gpu_array_ref_group *group2)
2453 {
2454         int i;
2455         isl_ctx *ctx;
2456         struct gpu_array_ref_group *group;
2457
2458         ctx = isl_map_get_ctx(group1->access);
2459         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2460         assert(group);
2461         group->array = group1->array;
2462         group->access = isl_map_union(isl_map_copy(group1->access),
2463                                         isl_map_copy(group2->access));
2464         group->write = group1->write || group2->write;
2465         group->n_ref = group1->n_ref + group2->n_ref;
2466         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2467                                         group->n_ref);
2468         assert(group->refs);
2469         for (i = 0; i < group1->n_ref; ++i)
2470                 group->refs[i] = group1->refs[i];
2471         for (i = 0; i < group2->n_ref; ++i)
2472                 group->refs[group1->n_ref + i] = group2->refs[i];
2473
2474         return group;
2475 }
2476
/* Combine the given two groups into a single group (see join_groups),
 * free the original two groups and return the combined group.
 */
static struct gpu_array_ref_group *join_groups_and_free(
	struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	struct gpu_array_ref_group *joined = join_groups(group1, group2);

	free_array_ref_group(group1);
	free_array_ref_group(group2);

	return joined;
}
2491
2492 /* Compute the private and/or shared memory tiles for the array
2493  * reference group "group" of array "array".
2494  *
2495  * If the array is a read-only scalar or if the user requested
2496  * not to use shared or private memory, then we do not need to do anything.
2497  *
2498  * We only try to compute a shared memory tile if there is any reuse
2499  * or if the access is not coalesced.
2500  *
2501  * For computing a private memory tile, we also require that there is
2502  * some reuse.  Moreover, we require that the access is private
2503  * to the thread.  That is, we check that any given array element
2504  * is only accessed by a single thread.
2505  * We compute an access relation that maps the shared tile loop iterators
2506  * and the shared point loop iterators that will be wrapped over the
2507  * threads to the array elements.
2508  * We actually check that those iterators that will be wrapped
2509  * partition the array space.  This check is stricter than necessary
2510  * since several iterations may be mapped onto the same thread
2511  * and then they could be allowed to access the same memory elements,
2512  * but our check does not allow this situation.
2513  *
2514  * We also check that the index expression only depends on parallel
2515  * loops.  That way, we can move those loops innermost and unroll them.
2516  * Again, we use a test that is stricter than necessary.
2517  * We actually check whether the index expression only depends
2518  * on the iterators that are wrapped over the threads.
2519  * These are necessarily parallel, but there may be more parallel loops.
2520  *
2521  * Combining the injectivity of the first test with the single-valuedness
2522  * of the second test, we simply test for bijectivity.
2523  *
2524  * If it turns out we can use registers, we compute the private memory
2525  * tile size using can_tile, after introducing a dependence
2526  * on the thread indices.
2527  */
2528 static void compute_group_bounds_core(struct gpu_gen *gen,
2529         struct gpu_array_ref_group *group)
2530 {
2531         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2532         isl_union_map *access;
2533         int n_index = group->array->n_index;
2534         int no_reuse;
2535         isl_map *acc;
2536         int use_shared = gen->options->use_shared_memory;
2537         int use_private = gen->options->use_private_memory;
2538
2539         if (!use_shared && !use_private)
2540                 return;
2541         if (gpu_array_is_read_only_scalar(group->array))
2542                 return;
2543
2544         access = group_access_relation(group, 1, 1);
2545         no_reuse = isl_union_map_is_injective(access);
2546
2547         if (use_shared && (!no_reuse || !access_is_coalesced(gen, access))) {
2548                 group->shared_tile = create_tile(ctx, group->array->n_index);
2549                 if (!can_tile(group->access, group->shared_tile))
2550                         group->shared_tile = free_tile(group->shared_tile);
2551         }
2552
2553         if (!use_private || no_reuse) {
2554                 isl_union_map_free(access);
2555                 return;
2556         }
2557
2558         access = isl_union_map_apply_domain(access,
2559                                         isl_union_map_copy(gen->shared_sched));
2560
2561         acc = isl_map_from_union_map(access);
2562
2563         if (!access_is_bijective(gen, acc)) {
2564                 isl_map_free(acc);
2565                 return;
2566         }
2567
2568         group->private_tile = create_tile(gen->ctx, n_index);
2569         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2570         if (!can_tile(acc, group->private_tile))
2571                 group->private_tile = free_tile(group->private_tile);
2572
2573         isl_map_free(acc);
2574 }
2575
/* Compute the private and/or shared memory tiles for the array
 * reference group "group" through compute_group_bounds_core and
 * subsequently let set_last_shared update the group.
 */
static void compute_group_bounds(struct gpu_gen *gen,
        struct gpu_array_ref_group *group)
{
        /* The tiles need to be in place before set_last_shared is called. */
        compute_group_bounds_core(gen, group);

        set_last_shared(gen, group);
}
2585
2586 /* If two groups have overlapping access relations (as determined by
2587  * the "overlap" function) and if one of them involves a write,
2588  * then merge the two groups into one.
2589  * If "compute_bounds" is set, then call compute_group_bounds
2590  * on the merged groups.
2591  *
2592  * Return the updated number of groups.
2593  */
2594 static int group_writes(struct gpu_gen *gen,
2595         int n, struct gpu_array_ref_group **groups,
2596         int (*overlap)(struct gpu_array_ref_group *group1,
2597                 struct gpu_array_ref_group *group2), int compute_bounds)
2598 {
2599         int i, j;
2600
2601         for (i = 0; i < n; ++i) {
2602                 for (j = n - 1; j > i; --j) {
2603                         if (!groups[i]->write && !groups[j]->write)
2604                                 continue;
2605
2606                         if (!overlap(groups[i], groups[j]))
2607                                 continue;
2608
2609                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2610                         if (compute_bounds)
2611                                 compute_group_bounds(gen, groups[i]);
2612                         if (j != n - 1)
2613                                 groups[j] = groups[n - 1];
2614                         n--;
2615                 }
2616         }
2617
2618         return n;
2619 }
2620
2621 /* If two groups have overlapping access relations (within the innermost
2622  * loop) and if one of them involves a write, then merge the two groups
2623  * into one.
2624  *
2625  * Return the updated number of groups.
2626  */
2627 static int group_overlapping_writes(struct gpu_gen *gen,
2628         int n, struct gpu_array_ref_group **groups)
2629 {
2630         return group_writes(gen, n, groups, &accesses_overlap, 0);
2631 }
2632
2633 /* Check if the access relations of group1 and group2 overlap within
2634  * the outermost min(group1->last_shared, group2->last_shared) loops.
2635  */
2636 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2637         struct gpu_array_ref_group *group2)
2638 {
2639         int last_shared;
2640         int dim;
2641         int empty;
2642         isl_map *map_i, *map_j, *map;
2643
2644         last_shared = group1->last_shared;
2645         if (group2->last_shared < last_shared)
2646                 last_shared = group2->last_shared;
2647         map_i = isl_map_copy(group1->access);
2648         dim = isl_map_dim(map_i, isl_dim_in);
2649         map_i = isl_map_eliminate(map_i, isl_dim_in,
2650                                 last_shared + 1, dim - (last_shared + 1));
2651         map_j = isl_map_copy(group2->access);
2652         map_j = isl_map_eliminate(map_j, isl_dim_in,
2653                                 last_shared + 1, dim - (last_shared + 1));
2654         map = isl_map_intersect(map_i, map_j);
2655         empty = isl_map_is_empty(map);
2656         isl_map_free(map);
2657
2658         return !empty;
2659 }
2660
2661 /* If two groups have overlapping access relations (within the outer
2662  * last_shared loops) and if one of them involves a write,
2663  * then merge the two groups into one.
2664  *
2665  * Return the updated number of groups.
2666  */
2667 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2668         struct gpu_array_ref_group **groups)
2669 {
2670         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2671 }
2672
2673 /* Is the size of the tile specified by "tile" smaller than the sum of
2674  * the sizes of the tiles specified by "tile1" and "tile2"?
2675  */
2676 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2677         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2678 {
2679         int smaller;
2680         isl_val *size, *size1, *size2;
2681
2682         size = tile_size(ctx, tile);
2683         size1 = tile_size(ctx, tile1);
2684         size2 = tile_size(ctx, tile2);
2685
2686         size = isl_val_sub(size, size1);
2687         size = isl_val_sub(size, size2);
2688         smaller = isl_val_is_neg(size);
2689
2690         isl_val_free(size);
2691
2692         return smaller;
2693 }
2694
2695 /* Given an initial grouping of array references and shared memory tiles
2696  * for each group that allows for a shared memory tile, merge two groups
2697  * if both have a shared memory tile, the merged group also has
2698  * a shared memory tile and the size of the tile for the merge group
2699  * is smaller than the sum of the tile sizes of the individual groups.
2700  *
2701  * If merging two groups decreases the "last_shared" dimension of
2702  * one or both of the two groups, then we need to check for overlapping
2703  * writes again.
2704  *
2705  * Return the number of groups after merging.
2706  */
2707 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2708         struct gpu_array_info *array, int n,
2709         struct gpu_array_ref_group **groups)
2710 {
2711         int i, j;
2712         int recompute_overlap = 0;
2713         isl_ctx *ctx = isl_space_get_ctx(array->space);
2714
2715         for (i = 0; i < n; ++i) {
2716                 if (!groups[i]->shared_tile)
2717                         continue;
2718                 for (j = n - 1; j > i; --j) {
2719                         isl_map *map;
2720                         int empty;
2721                         struct gpu_array_ref_group *group;
2722
2723                         if (!groups[j]->shared_tile)
2724                                 continue;
2725
2726                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
2727                                             isl_map_copy(groups[j]->access));
2728                         empty = isl_map_is_empty(map);
2729                         isl_map_free(map);
2730
2731                         if (empty)
2732                                 continue;
2733
2734                         group = join_groups(groups[i], groups[j]);
2735                         compute_group_bounds(gen, group);
2736                         if (!group->shared_tile ||
2737                             !smaller_tile(ctx, group->shared_tile,
2738                                         groups[i]->shared_tile,
2739                                         groups[j]->shared_tile)) {
2740                                 free_array_ref_group(group);
2741                                 continue;
2742                         }
2743
2744                         if (group->last_shared < groups[i]->last_shared ||
2745                             group->last_shared < groups[j]->last_shared)
2746                                 recompute_overlap = 1;
2747                         free_array_ref_group(groups[i]);
2748                         free_array_ref_group(groups[j]);
2749                         groups[i] = group;
2750                         if (j != n - 1)
2751                                 groups[j] = groups[n - 1];
2752                         n--;
2753                 }
2754         }
2755
2756         if (recompute_overlap)
2757                 n = group_last_shared_overlapping_writes(gen, n, groups);
2758         return n;
2759 }
2760
2761 /* Set array->n_group and array->groups to n and groups.
2762  *
2763  * Additionally, set the "nr" field of each group
2764  * and the "group" field of each reference in each group.
2765  */
2766 static void set_array_groups(struct gpu_array_info *array,
2767         int n, struct gpu_array_ref_group **groups)
2768 {
2769         int i, j;
2770
2771         array->n_group = n;
2772         array->groups = groups;
2773
2774         for (i = 0; i < n; ++i) {
2775                 groups[i]->nr = i;
2776
2777                 for (j = 0; j < groups[i]->n_ref; ++j)
2778                         groups[i]->refs[j]->group = i;
2779         }
2780 }
2781
2782 /* Group array references that should be considered together when
2783  * deciding whether to access them from private, shared or global memory.
2784  *
2785  * In particular, if two array references overlap and if one of them
2786  * is a write, then the two references are grouped together.
2787  * We first perform an initial grouping based only on the access relation.
2788  * After computing shared and private memory tiles, we check for
2789  * overlapping writes again, but this time taking into account
2790  * the "last_shared" property.
2791  *
2792  * Furthermore, if two groups admit a shared memory tile and if the
2793  * combination of the two also admits a shared memory tile, we merge
2794  * the two groups.
2795  */
2796 static void group_array_references(struct gpu_gen *gen,
2797         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
2798 {
2799         int i;
2800         int n;
2801         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2802         struct gpu_array_ref_group **groups;
2803
2804         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
2805                                         array->n_ref);
2806         assert(groups);
2807
2808         n = populate_array_references(array, sched, groups);
2809
2810         n = group_overlapping_writes(gen, n, groups);
2811
2812         for (i = 0; i < n; ++i)
2813                 compute_group_bounds(gen, groups[i]);
2814
2815         n = group_last_shared_overlapping_writes(gen, n, groups);
2816
2817         n = group_common_shared_memory_tile(gen, array, n, groups);
2818
2819         set_array_groups(array, n, groups);
2820 }
2821
2822 /* Take tiled_sched, project it onto the shared tile loops and
2823  * the loops that will be wrapped over the threads and
2824  * store the result in gen->shared_sched.
2825  * Also compute a projection that projects out the loops that will be
2826  * wrapped over the threads and store this projection in gen->shared_proj.
2827  */
2828 static void compute_shared_sched(struct gpu_gen *gen)
2829 {
2830         isl_space *dim;
2831         isl_map *proj;
2832         isl_set *par;
2833         isl_union_map *sched;
2834
2835         sched = isl_union_map_copy(gen->tiled_sched);
2836
2837         dim = isl_union_map_get_space(sched);
2838         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
2839         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2840
2841         dim = isl_union_map_get_space(sched);
2842         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
2843
2844         gen->shared_sched = sched;
2845         gen->shared_proj = isl_union_map_from_map(proj);
2846 }
2847
2848 /* Group references of all arrays in the program.
2849  */
2850 static void group_references(struct gpu_gen *gen)
2851 {
2852         int i;
2853         isl_union_map *sched;
2854
2855         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
2856                                           isl_union_map_copy(gen->shared_proj));
2857
2858         for (i = 0; i < gen->prog->n_array; ++i)
2859                 group_array_references(gen, &gen->prog->array[i], sched);
2860
2861         isl_union_map_free(sched);
2862 }
2863
2864 /* Free all array information that is local to the current kernel.
2865  */
2866 static void free_local_array_info(struct gpu_gen *gen)
2867 {
2868         int i, j;
2869
2870         for (i = 0; i < gen->prog->n_array; ++i) {
2871                 struct gpu_array_info *array = &gen->prog->array[i];
2872
2873                 for (j = 0; j < array->n_group; ++j)
2874                         free_array_ref_group(array->groups[j]);
2875                 free(array->groups);
2876         }
2877 }
2878
2879 /* Compute the size of a bounding box around the origin and "set",
2880  * where "set" is assumed to contain only non-negative elements.
2881  * In particular, compute the maximal value of "set" in each direction
2882  * and add one.
2883  */
2884 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
2885         __isl_keep isl_set *context)
2886 {
2887         int i, n;
2888         isl_multi_pw_aff *mpa;
2889
2890         n = isl_set_dim(set, isl_dim_set);
2891         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
2892         for (i = 0; i < n; ++i) {
2893                 isl_space *space;
2894                 isl_aff *one;
2895                 isl_pw_aff *bound;
2896
2897                 bound = isl_set_dim_max(isl_set_copy(set), i);
2898                 bound = isl_pw_aff_coalesce(bound);
2899                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
2900
2901                 space = isl_pw_aff_get_domain_space(bound);
2902                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
2903                 one = isl_aff_add_constant_si(one, 1);
2904                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
2905                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
2906         }
2907         isl_set_free(set);
2908
2909         return mpa;
2910 }
2911
2912 /* Compute the effective grid size as a list of the sizes in each dimension.
2913  *
2914  * The grid size specified by the user or set by default
2915  * in read_grid_sizes() and applied in tile_schedule(),
2916  * may be too large for the given code in the sense that
2917  * it may contain blocks that don't need to execute anything.
2918  * We therefore don't return this grid size, but instead the
2919  * smallest grid size that ensures that all blocks that actually
2920  * execute code are included in the grid.
2921  *
2922  * We first extract a description of the grid, i.e., the possible values
2923  * of the block ids, from gen->tiled_sched.
2924  * The block ids are parameters in gen->tiled_sched.
2925  * We simply need to change them into set dimensions.
2926  *
2927  * Then, for each block dimension, we compute the maximal value of the block id
2928  * and add one.
2929  */
2930 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
2931         struct ppcg_kernel *kernel)
2932 {
2933         int i;
2934         isl_set *grid;
2935
2936         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
2937         grid = isl_set_from_params(grid);
2938         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
2939         for (i = 0; i < gen->n_grid; ++i) {
2940                 int pos;
2941                 char name[20];
2942
2943                 snprintf(name, sizeof(name), "b%d", i);
2944                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
2945                 assert(pos >= 0);
2946                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
2947                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
2948         }
2949
2950         return extract_size(grid, kernel->context);
2951 }
2952
2953 /* Compute the size of a fixed bounding box around the origin and "set",
2954  * where "set" is assumed to contain only non-negative elements,
2955  * and store the results in "size".
2956  * In particular, compute the maximal value of "set" in each direction
2957  * and add one.
2958  */
2959 static void extract_fixed_size(__isl_take isl_set *set, int *size)
2960 {
2961         int i, n;
2962         isl_local_space *ls;
2963         isl_aff *obj;
2964
2965         n = isl_set_dim(set, isl_dim_set);
2966         ls = isl_local_space_from_space(isl_set_get_space(set));
2967         obj = isl_aff_zero_on_domain(ls);
2968         for (i = 0; i < n; ++i) {
2969                 isl_val *max;
2970
2971                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
2972                 max = isl_set_max_val(set, obj);
2973                 size[i] = isl_val_get_num_si(max) + 1;
2974                 isl_val_free(max);
2975                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
2976         }
2977         isl_aff_free(obj);
2978         isl_set_free(set);
2979 }
2980
2981 /* Compute the effective block size as a list of the sizes in each dimension
2982  * and store the sizes in kernel->block_dim.
2983  *
2984  * The block size specified by the user or set by default
2985  * in read_block_sizes() and applied in thread_tile_schedule(),
2986  * may be too large for the given code in the sense that
2987  * it may contain threads that don't need to execute anything.
2988  * We therefore don't store this block size in kernel->block_dim,
2989  * but instead the smallest block size that ensures that all threads
2990  * that actually execute code are included in the block.
2991  *
2992  * The current implementation eliminates all parameters, ensuring
2993  * that the size is a fixed constant in each dimension.
2994  * In principle we could also compute parametric sizes.
2995  * We would have to make sure to project out all b%d and t%d parameters,
2996  * however.
2997  */
2998 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
2999 {
3000         int i;
3001         int nparam;
3002         isl_set *block;
3003         isl_multi_pw_aff *mpa;
3004
3005         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3006         block = isl_set_from_params(block);
3007         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3008         kernel->n_block = gen->n_block;
3009         for (i = 0; i < gen->n_block; ++i) {
3010                 int pos;
3011                 char name[20];
3012
3013                 snprintf(name, sizeof(name), "t%d", i);
3014                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3015                 assert(pos >= 0);
3016                 block = isl_set_equate(block, isl_dim_param, pos,
3017                                         isl_dim_set, i);
3018         }
3019         nparam = isl_set_dim(block, isl_dim_param);
3020         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3021
3022         extract_fixed_size(block, kernel->block_dim);
3023 }
3024
3025 void ppcg_kernel_free(void *user)
3026 {
3027         struct ppcg_kernel *kernel = user;
3028         int i;
3029
3030         if (!kernel)
3031                 return;
3032
3033         isl_multi_pw_aff_free(kernel->grid_size);
3034         isl_set_free(kernel->context);
3035         isl_union_set_free(kernel->arrays);
3036         isl_space_free(kernel->space);
3037         isl_ast_node_free(kernel->tree);
3038
3039         for (i = 0; i < kernel->n_array; ++i)
3040                 isl_pw_aff_list_free(kernel->array[i].bound);
3041         free(kernel->array);
3042
3043         for (i = 0; i < kernel->n_var; ++i) {
3044                 free(kernel->var[i].name);
3045                 isl_vec_free(kernel->var[i].size);
3046         }
3047         free(kernel->var);
3048
3049         free(kernel);
3050 }
3051
3052 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3053         struct ppcg_kernel_var *var)
3054 {
3055         int j;
3056         struct gpu_array_tile *tile;
3057         isl_printer *p;
3058         char *name;
3059
3060         var->array = group->array;
3061
3062         tile = group->private_tile;
3063         var->type = ppcg_access_private;
3064         if (!tile) {
3065                 tile = group->shared_tile;
3066                 var->type = ppcg_access_shared;
3067         }
3068
3069         p = isl_printer_to_str(ctx);
3070         p = print_array_name(p, group);
3071         var->name = isl_printer_get_str(p);
3072         isl_printer_free(p);
3073
3074         var->size = isl_vec_alloc(ctx, group->array->n_index);
3075
3076         for (j = 0; j < group->array->n_index; ++j)
3077                 var->size = isl_vec_set_element_val(var->size, j,
3078                                             isl_val_copy(tile->bound[j].size));
3079 }
3080
3081 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3082 {
3083         int i, j, n;
3084
3085         n = 0;
3086         for (i = 0; i < gen->prog->n_array; ++i) {
3087                 struct gpu_array_info *array = &gen->prog->array[i];
3088
3089                 for (j = 0; j < array->n_group; ++j) {
3090                         struct gpu_array_ref_group *group = array->groups[j];
3091                         if (group->private_tile || group->shared_tile)
3092                                 ++n;
3093                 }
3094         }
3095
3096         kernel->n_var = n;
3097         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3098         assert(kernel->var);
3099
3100         n = 0;
3101         for (i = 0; i < gen->prog->n_array; ++i) {
3102                 struct gpu_array_info *array = &gen->prog->array[i];
3103
3104                 for (j = 0; j < array->n_group; ++j) {
3105                         struct gpu_array_ref_group *group = array->groups[j];
3106                         if (!group->private_tile && !group->shared_tile)
3107                                 continue;
3108                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3109                         ++n;
3110                 }
3111         }
3112 }
3113
3114 /* The sizes of the arrays on the host that have been computed by
3115  * extract_array_info may depend on the parameters.  Use the extra
3116  * constraints on the parameters that are valid at "host_domain"
3117  * to simplify these expressions and store the results in kernel->array.
3118  */
3119 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3120         __isl_keep isl_set *host_domain)
3121 {
3122         int i, j;
3123         isl_set *context;
3124
3125         kernel->array = isl_calloc_array(gen->ctx,
3126                             struct gpu_local_array_info, gen->prog->n_array);
3127         assert(kernel->array);
3128         kernel->n_array = gen->prog->n_array;
3129
3130         context = isl_set_copy(host_domain);
3131         context = isl_set_params(context);
3132
3133         for (i = 0; i < gen->prog->n_array; ++i) {
3134                 struct gpu_array_info *array = &gen->prog->array[i];
3135                 isl_pw_aff_list *local;
3136
3137                 if (array->n_group == 0)
3138                         continue;
3139
3140                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3141
3142                 for (j = 0; j < array->n_index; ++j) {
3143                         isl_pw_aff *pwaff;
3144
3145                         pwaff = isl_pw_aff_copy(array->bound[j]);
3146                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3147                         local = isl_pw_aff_list_add(local, pwaff);
3148                 }
3149
3150                 kernel->array[i].bound = local;
3151         }
3152         isl_set_free(context);
3153 }
3154
3155 /* Find the element in gen->stmt that has the given "id".
3156  * Return NULL if no such gpu_stmt can be found.
3157  */
3158 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3159 {
3160         int i;
3161
3162         for (i = 0; i < prog->n_stmts; ++i) {
3163                 if (id == prog->stmts[i].id)
3164                         break;
3165         }
3166
3167         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3168 }
3169
3170 /* Set gen->tile_len and gen->n_parallel to those of the statement
3171  * affected by the first map (part of the schedule)
3172  * on which this function is called.
3173  * Because of the way the schedule is constructed, the other statements
3174  * in the list, if any, should have the same values for these properties.
3175  */
3176 static int extract_tile_len(__isl_take isl_map *map, void *user)
3177 {
3178         struct gpu_gen *gen = (struct gpu_gen *) user;
3179         isl_id *id;
3180         struct gpu_stmt *stmt;
3181
3182         id = isl_map_get_tuple_id(map, isl_dim_in);
3183         stmt = find_stmt(gen->prog, id);
3184         isl_id_free(id);
3185
3186         isl_map_free(map);
3187
3188         if (!stmt)
3189                 isl_die(gen->ctx, isl_error_unknown,
3190                         "statement not found", return -1);
3191
3192         gen->tile_len = stmt->tile_len;
3193         gen->n_parallel = stmt->n_parallel;
3194
3195         return -1;
3196 }
3197
3198 void ppcg_kernel_stmt_free(void *user)
3199 {
3200         int i;
3201         struct ppcg_kernel_stmt *stmt = user;
3202
3203         if (!stmt)
3204                 return;
3205
3206         switch (stmt->type) {
3207         case ppcg_kernel_copy:
3208                 isl_ast_expr_free(stmt->u.c.index);
3209                 isl_ast_expr_free(stmt->u.c.local_index);
3210                 break;
3211         case ppcg_kernel_domain:
3212                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3213                 break;
3214         case ppcg_kernel_sync:
3215                 break;
3216         }
3217
3218         free(stmt);
3219 }
3220
3221 /* Set the options of "context" to
3222  *
3223  *      { space -> [x] : x >= first }
3224  */
3225 static __isl_give isl_ast_build *set_unroll(
3226         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3227         int first)
3228 {
3229         isl_ctx *ctx;
3230         isl_map *unroll;
3231         isl_union_map *opt;
3232
3233         ctx = isl_ast_build_get_ctx(build);
3234
3235         space = isl_space_from_domain(space);
3236         space = isl_space_add_dims(space, isl_dim_out, 1);
3237         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3238         unroll = isl_map_universe(space);
3239         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3240         opt = isl_union_map_from_map(unroll);
3241
3242         build = isl_ast_build_set_options(build, opt);
3243
3244         return build;
3245 }
3246
3247 /* Return a list of isl_ids of the form "prefix%d".
3248  */
3249 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3250         int n, const char *prefix)
3251 {
3252         int i;
3253         char name[10];
3254         isl_id_list *names;
3255
3256         names = isl_id_list_alloc(ctx, n);
3257         for (i = 0; i < n; ++i) {
3258                 isl_id *id;
3259
3260                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3261                 id = isl_id_alloc(ctx, name, NULL);
3262                 names = isl_id_list_add(names, id);
3263         }
3264
3265         return names;
3266 }
3267
3268 /* Extend the schedule "schedule" with the part of "extension"
3269  * starting at "first" up to "len".
3270  */
3271 static __isl_give isl_union_map *extend_schedule(
3272         __isl_take isl_union_map *schedule,
3273         __isl_take isl_union_map *extension, int first, int len)
3274 {
3275         isl_space *space;
3276         isl_map *proj;
3277         isl_union_map *umap;
3278         isl_set *set;
3279
3280         space = isl_union_map_get_space(schedule);
3281         space = isl_space_set_from_params(space);
3282         space = isl_space_add_dims(space, isl_dim_set, len);
3283         proj = isl_set_identity(isl_set_universe(space));
3284         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3285         extension = isl_union_map_apply_range(extension,
3286                                                 isl_union_map_from_map(proj));
3287
3288         schedule = isl_union_map_range_product(schedule, extension);
3289
3290         return schedule;
3291 }
3292
3293 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3294  * to "ref_id".
3295  */
3296 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3297         __isl_keep isl_id *ref_id)
3298 {
3299         struct gpu_stmt_access *access;
3300
3301         for (access = accesses; access; access = access->next)
3302                 if (access->ref_id == ref_id)
3303                         return access;
3304
3305         return NULL;
3306 }
3307
3308 /* Return the index of the array called "name" in the list of arrays.
3309  */
3310 static int find_array_index(struct gpu_gen *gen, const char *name)
3311 {
3312         int i;
3313
3314         for (i = 0; i < gen->prog->n_array; ++i)
3315                 if (!strcmp(name, gen->prog->array[i].name))
3316                         return i;
3317
3318         return -1;
3319 }
3320
/* Internal data structure for the index and AST expression transformation
 * callbacks for pet_stmt_build_ast_exprs.
 *
 * "gen" is the gpu_gen in which the accessed array is looked up
 * (by name in gen->prog->array, with the kernel specific information
 * at the same position in gen->kernel->array).
 * "accesses" is the list of gpu_stmt_access in the statement.
 * "iterator_map" expresses the statement iterators in terms of
 * the AST loop iterators.
 * "sched2shared" expresses the first shared_len dimensions of
 * the computed schedule in terms of the AST loop iterators.
 *
 * The following fields are set in transform_index and used in transform_expr.
 * "array" is the array that is being accessed.  It is reset to NULL
 * at the start of each call to transform_index.
 * "global" is set if the global array is accessed (rather than
 * shared/private memory).
 * "local_array" refers to information on the array specialized
 * to the current kernel.
 */
struct ppcg_transform_data {
	struct gpu_gen *gen;
	struct gpu_stmt_access *accesses;
	isl_pw_multi_aff *iterator_map;
	isl_pw_multi_aff *sched2shared;

	struct gpu_array_info *array;
	int global;
	struct gpu_local_array_info *local_array;
};
3347
3348 /* Index transformation callback for pet_stmt_build_ast_exprs.
3349  *
3350  * "index" expresses the array indices in terms of statement iterators
3351  *
3352  * We first reformulate "index" in terms of the AST loop iterators.
3353  * Then we check if we are accessing the global array or
3354  * a shared/private copy.  In the former case, we simply return
3355  * the updated index.  If "index" is an affine expression rather
3356  * than an array access, then we also return the updated index here.
3357  *
3358  * Otherwise, we apply the tiling to the index.
3359  * This tiling is of the form
3360  *
3361  *      [D -> A] -> T
3362  *
3363  * The index is of the form
3364  *
3365  *      L -> A
3366  *
3367  * We update the tiling to refer to the AST loop iteratos
3368  *
3369  *      [L -> A] -> T
3370  *
3371  * and modify index to keep track of those iterators
3372  *
3373  *      L -> [L -> A]
3374  *
3375  * Combining these two yields a tiled index expression in terms
3376  * of the AST loop iterators
3377  *
3378  *      L -> T
3379  */
3380 static __isl_give isl_multi_pw_aff *transform_index(
3381         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3382         void *user)
3383 {
3384         struct ppcg_transform_data *data = user;
3385         struct gpu_stmt_access *access;
3386         struct gpu_array_ref_group *group;
3387         struct gpu_array_tile *tile;
3388         isl_pw_multi_aff *iterator_map;
3389         int i;
3390         const char *name;
3391         isl_space *space;
3392         isl_multi_pw_aff *tiling;
3393         isl_pw_multi_aff *pma;
3394         isl_multi_pw_aff *mpa;
3395
3396         data->array = NULL;
3397
3398         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3399         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3400
3401         access = find_access(data->accesses, ref_id);
3402         if (!access)
3403                 return index;
3404         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3405                 return index;
3406
3407         name = isl_map_get_tuple_name(access->access, isl_dim_out);
3408         i = find_array_index(data->gen, name);
3409         if (i < 0)
3410                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3411                         "cannot find array reference group",
3412                         return isl_multi_pw_aff_free(index));
3413
3414         data->array = &data->gen->prog->array[i];
3415         data->local_array = &data->gen->kernel->array[i];
3416         group = data->array->groups[access->group];
3417         tile = group->private_tile;
3418         if (!tile)
3419                 tile = group->shared_tile;
3420         data->global = !tile;
3421         if (!tile)
3422                 return index;
3423
3424         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3425         space = isl_space_map_from_set(space);
3426         pma = isl_pw_multi_aff_identity(space);
3427         pma = isl_pw_multi_aff_product(
3428                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3429         tiling = isl_multi_pw_aff_from_multi_aff(
3430                                     isl_multi_aff_copy(tile->tiling));
3431         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3432
3433         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3434         space = isl_space_map_from_set(space);
3435         mpa = isl_multi_pw_aff_identity(space);
3436         index = isl_multi_pw_aff_range_product(mpa, index);
3437         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3438
3439         return index;
3440 }
3441
3442 /* Dereference "expr" by adding an index [0].
3443  * The original "expr" is assumed not to have any indices.
3444  */
3445 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3446 {
3447         isl_ctx *ctx;
3448         isl_ast_expr *res;
3449         isl_ast_expr_list *list;
3450
3451         ctx = isl_ast_expr_get_ctx(expr);
3452         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3453         list = isl_ast_expr_list_from_ast_expr(res);
3454         res = isl_ast_expr_get_op_arg(expr, 0);
3455         res = isl_ast_expr_access(res, list);
3456         isl_ast_expr_free(expr);
3457
3458         return res;
3459 }
3460
3461 /* Linearize the index expression "expr" based on the array bounds
3462  * of "array".
3463  *
3464  * That is, transform expression
3465  *
3466  *      A[i_0][i_1]...[i_n]
3467  *
3468  * to
3469  *
3470  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3471  *
3472  * where b_0, b_1, ..., b_n are the bounds on the array.
3473  */
3474 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3475         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3476 {
3477         int i, n;
3478         isl_ctx *ctx;
3479         isl_set *context;
3480         isl_ast_expr *res;
3481         isl_ast_expr_list *list;
3482         isl_ast_build *build;
3483
3484         ctx = isl_ast_expr_get_ctx(expr);
3485         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3486         build = isl_ast_build_from_context(context);
3487
3488         n = isl_ast_expr_get_op_n_arg(expr);
3489         res = isl_ast_expr_get_op_arg(expr, 1);
3490         for (i = 2; i < n; ++i) {
3491                 isl_pw_aff *bound_i;
3492                 isl_ast_expr *expr_i;
3493
3494                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i - 1);
3495                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3496                 res = isl_ast_expr_mul(res, expr_i);
3497                 expr_i = isl_ast_expr_get_op_arg(expr, i);
3498                 res = isl_ast_expr_add(res, expr_i);
3499         }
3500
3501         isl_ast_build_free(build);
3502
3503         list = isl_ast_expr_list_from_ast_expr(res);
3504         res = isl_ast_expr_get_op_arg(expr, 0);
3505         res = isl_ast_expr_access(res, list);
3506
3507         isl_ast_expr_free(expr);
3508
3509         return res;
3510 }
3511
3512 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3513  *
3514  * If the AST expression refers to a global scalar that is not
3515  * a read-only scalar, then its address was passed to the kernel and
3516  * we need to dereference it.
3517  *
3518  * If the AST expression refers to an access to a global array,
3519  * then we linearize the access exploiting the bounds in data->local_array.
3520  */
3521 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3522         __isl_keep isl_id *id, void *user)
3523 {
3524         struct ppcg_transform_data *data = user;
3525
3526         if (!data->array)
3527                 return expr;
3528         if (gpu_array_is_read_only_scalar(data->array))
3529                 return expr;
3530         if (!data->global)
3531                 return expr;
3532         if (data->array->n_index == 0)
3533                 return dereference(expr);
3534
3535         return gpu_local_array_info_linearize_index(data->local_array, expr);
3536 }
3537
3538 /* This function is called for each instance of a user statement
3539  * in the kernel.
3540  *
3541  * We attach a struct ppcg_kernel_stmt to the "node", containing
3542  * a computed AST expression for each access.
3543  * These AST expressions are computed from iterator_map,
3544  * which expresses the domain
3545  * elements in terms of the generated loops, and sched2shared,
3546  * which expresses the first shared_len dimensions of the schedule
3547  * computed by PPCG in terms of the generated loops.
3548  */
3549 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3550         __isl_keep isl_ast_build *build, void *user)
3551 {
3552         struct ppcg_transform_data data;
3553         struct gpu_gen *gen = (struct gpu_gen *) user;
3554         struct ppcg_kernel_stmt *stmt;
3555         isl_id *id;
3556         isl_pw_multi_aff *sched2shared;
3557         isl_map *map;
3558         isl_pw_multi_aff *iterator_map;
3559         isl_ast_expr *expr, *arg;
3560         isl_union_map *schedule;
3561         int i, n;
3562         struct gpu_stmt_access *access;
3563
3564         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3565         if (!stmt)
3566                 return isl_ast_node_free(node);
3567
3568         expr = isl_ast_node_user_get_expr(node);
3569         arg = isl_ast_expr_get_op_arg(expr, 0);
3570         id = isl_ast_expr_get_id(arg);
3571
3572         schedule = isl_ast_build_get_schedule(build);
3573         map = isl_map_reverse(isl_map_from_union_map(schedule));
3574         iterator_map = isl_pw_multi_aff_from_map(map);
3575         sched2shared = compute_sched_to_shared(gen,
3576                                         isl_pw_multi_aff_copy(iterator_map));
3577
3578         stmt->type = ppcg_kernel_domain;
3579         stmt->u.d.stmt = find_stmt(gen->prog, id);
3580         if (!stmt->u.d.stmt)
3581                 goto error;
3582
3583         data.gen = gen;
3584         data.accesses = stmt->u.d.stmt->accesses;
3585         data.iterator_map = iterator_map;
3586         data.sched2shared = sched2shared;
3587         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
3588                                             build, &transform_index, &data,
3589                                             &transform_expr, &data);
3590
3591         isl_id_free(id);
3592         isl_pw_multi_aff_free(iterator_map);
3593         isl_pw_multi_aff_free(sched2shared);
3594         isl_ast_expr_free(arg);
3595         isl_ast_expr_free(expr);
3596
3597         id = isl_id_alloc(gen->ctx, NULL, stmt);
3598         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3599         return isl_ast_node_set_annotation(node, id);
3600 error:
3601         isl_id_free(id);
3602         isl_pw_multi_aff_free(iterator_map);
3603         ppcg_kernel_stmt_free(stmt);
3604         isl_pw_multi_aff_free(sched2shared);
3605         return isl_ast_node_free(node);
3606 }
3607
3608 /* This function is called when code has been generated for the shared
3609  * tile loops.  The "schedule" refers only to the original statements.
3610  *
3611  * We extend the schedule with that part of gen->local_sched that hasn't
3612  * been taken into account yet.  This introduces parameters referring
3613  * to thread ids in the schedule, so we add them (with the appropriate
3614  * bounds to the context as well).
3615  * Finally, we set the appropriate unrolling options
3616  * if gen->first_unroll is set.
3617  */
3618 static __isl_give isl_ast_node *create_domain_leaf(
3619         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
3620         void *user)
3621 {
3622         struct gpu_gen *gen = (struct gpu_gen *) user;
3623         isl_space *space;
3624         isl_union_map *sched;
3625         isl_ast_node *tree;
3626         isl_set *set;
3627         isl_id_list *iterators;
3628         int n;
3629
3630         schedule = extend_schedule(schedule,
3631                         isl_union_map_copy(gen->local_sched),
3632                         gen->shared_len, gen->thread_tiled_len);
3633
3634         space = isl_ast_build_get_schedule_space(build);
3635         set = isl_set_universe(space);
3636         set = add_bounded_parameters(set, gen->kernel->n_block,
3637                                         gen->kernel->block_dim, "t");
3638         build = isl_ast_build_restrict(build, set);
3639
3640         n = gen->thread_tiled_len - gen->shared_len;
3641
3642         if (gen->first_unroll >= 0) {
3643                 space = isl_space_set_alloc(gen->ctx, 0, n);
3644                 build = set_unroll(build, space, gen->first_unroll);
3645         }
3646         iterators = generate_names(gen->ctx, n, "c");
3647         build = isl_ast_build_set_iterators(build, iterators);
3648         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
3649         tree = isl_ast_build_ast_from_schedule(build, schedule);
3650         isl_ast_build_free(build);
3651
3652         return tree;
3653 }
3654
3655 /* This function is called for each statement node in the AST of the code
3656  * for copying to or from shared/private memory.
3657  * Attach a pointer to a ppcg_kernel_stmt representing the copy
3658  * statement to the node.
3659  * The statement name is "read" or "write", depending on whether we are
3660  * reading from global memory or writing to global memory.
3661  * The name of the T space is {shared,private}_<array>.
3662  *
3663  * The schedule is of the form
3664  *
3665  *      type[A -> T] -> L
3666  *
3667  * where A refers to a piece of an array and T to the corresponding
3668  * shifted tile.  We split this schedule into mappings L -> A and L -> T
3669  * and store the corresponding expressions in stmt->index and stmt->local_index,
3670  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
3671  */
3672 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
3673         __isl_keep isl_ast_build *build, void *user)
3674 {
3675         struct gpu_gen *gen = (struct gpu_gen *) user;
3676         struct ppcg_kernel_stmt *stmt;
3677         isl_id *id;
3678         isl_ast_expr *expr;
3679         isl_space *space;
3680         isl_map *access, *local_access, *map;
3681         isl_pw_multi_aff *pma;
3682         const char *type;
3683         int array_index;
3684
3685         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3686         if (!stmt)
3687                 return isl_ast_node_free(node);
3688
3689         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
3690         type = isl_map_get_tuple_name(access, isl_dim_in);
3691         stmt->u.c.read = !strcmp(type, "read");
3692         access = isl_map_reverse(access);
3693         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
3694         local_access = isl_map_copy(access);
3695
3696         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
3697         id = isl_map_get_tuple_id(access, isl_dim_out);
3698         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3699         access = isl_map_apply_range(access, map);
3700         pma = isl_pw_multi_aff_from_map(access);
3701         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3702         stmt->u.c.index = expr;
3703
3704         map = isl_map_range_map(isl_map_universe(space));
3705         id = isl_map_get_tuple_id(local_access, isl_dim_out);
3706         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3707         local_access = isl_map_apply_range(local_access, map);
3708         pma = isl_pw_multi_aff_from_map(local_access);
3709         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3710         stmt->u.c.local_index = expr;
3711
3712         stmt->u.c.array = gen->copy_group->array;
3713         array_index = stmt->u.c.array - gen->prog->array;
3714         stmt->u.c.local_array = &gen->kernel->array[array_index];
3715         stmt->type = ppcg_kernel_copy;
3716
3717         id = isl_id_alloc(gen->ctx, NULL, stmt);
3718         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3719         return isl_ast_node_set_annotation(node, id);
3720 }
3721
3722 /* Given a schedule of the form
3723  *
3724  *      [S -> A] -> L
3725  *
3726  * (with S the first shared_len dimensions of the computed schedule,
3727  * A the array and L the schedule correponding to the generated loops),
3728  * indicating where to copy the array elements that need to be copied,
3729  * construct code for performing the copying.
3730  *
3731  * "group" is the array reference group that is being copied
3732  * "type" is either "read" or "write"
3733  * private is set if copying needs to be performed to/from registers
3734  *
3735  * We first construct a mapping to a shifted tile of the array,
3736  *
3737  *      [S -> A] -> T(S,A)                                      (1)
3738  *
3739  * If private is set, then we also use this mapping as a schedule
3740  * (which is already thread-specific and will be completely unrolled).
3741  * Otherwise, we wrap/tile the range over the threads.
3742  * The result is
3743  *
3744  *      [S -> A] -> T'(S,A)
3745  *
3746  * Combined with the given schedule, we have
3747  *
3748  *      [S -> A] -> [L -> T'(S,A)]                              (2)
3749  *
3750  * From the shifted tile mapping, we construct a mapping
3751  *
3752  *      [S -> A] -> [A -> T(S,A)]
3753  *
3754  * and apply it to the schedule (2), obtaining
3755  *
3756  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
3757  *
3758  * Note that we can project out S because it is uniquely defined by L.
3759  */
3760 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
3761         __isl_take isl_map *sched,
3762         const char *type, struct gpu_array_ref_group *group,
3763         __isl_take isl_ast_build *build, int private)
3764 {
3765         isl_space *space;
3766         isl_ast_node *tree;
3767         isl_map *schedule, *shift, *map;
3768         isl_set *set;
3769         isl_id_list *iterators;
3770         int n;
3771
3772         shift = shift_access(group);
3773
3774         schedule = isl_map_copy(shift);
3775         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
3776         if (!private)
3777                 schedule = tile_access_schedule(gen, schedule);
3778
3779         n = isl_map_dim(schedule, isl_dim_out);
3780         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
3781         set = add_bounded_parameters(set, gen->kernel->n_block,
3782                                         gen->kernel->block_dim, "t");
3783
3784         schedule = isl_map_range_product(sched, schedule);
3785
3786         space = isl_space_domain(isl_map_get_space(shift));
3787         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
3788         map = isl_map_range_product(map, shift);
3789
3790         schedule = isl_map_apply_domain(schedule, map);
3791
3792         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
3793
3794         build = isl_ast_build_restrict(build, set);
3795
3796         gen->copy_group = group;
3797
3798         if (private) {
3799                 space = isl_space_range(isl_map_get_space(schedule));
3800                 space = isl_space_range(isl_space_unwrap(space));
3801                 build = set_unroll(build, space, 0);
3802         }
3803         iterators = generate_names(gen->ctx, n, "c");
3804         build = isl_ast_build_set_iterators(build, iterators);
3805         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
3806         tree = isl_ast_build_ast_from_schedule(build,
3807                                             isl_union_map_from_map(schedule));
3808         isl_ast_build_free(build);
3809
3810         return tree;
3811 }
3812
3813 /* Return code for reading into or writing from shared memory
3814  * the given array reference group.
3815  *
3816  * If we are performing a read from global memory to shared memory and
3817  * if the array involved is not a scalar, then we copy
3818  * the entire tile to shared memory.  This may result in some extra
3819  * elements getting copied, but it should lead to simpler code
3820  * (which means that fewer registers may be needed) and less divergence.
3821  *
3822  * Otherwise, we only copy the elements that will be read or have been written
3823  * in the kernel.
3824  *
3825  *
3826  * The input "sched" is of the form.
3827  *
3828  *      type[S -> A] -> L
3829  *
3830  * with S the first shared_len dimensions of the computed schedule,
3831  * A the array and L the schedule correponding to the generated loops.
3832  *
3833  * We first drop "type",
3834  *
3835  *      [S -> A] -> L
3836  *
3837  * If the above conditions are satisfied, we project out A,
3838  * resulting in
3839  *
3840  *      S -> L
3841  *
3842  * and then introduce the group tile [S -> T], resulting in
3843  *
3844  *      [S -> T] -> L
3845  */
3846 static __isl_give isl_ast_node *copy_group_shared_accesses(
3847         struct gpu_gen *gen, struct gpu_array_ref_group *group,
3848         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
3849 {
3850         const char *type;
3851         int read;
3852         isl_union_map *access;
3853
3854         type = isl_map_get_tuple_name(sched, isl_dim_in);
3855         read = !strcmp(type, "read");
3856
3857         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
3858
3859         if (read && !gpu_array_is_scalar(group->array)) {
3860                 isl_space *space;
3861                 isl_map *map;
3862
3863                 space = isl_space_domain(isl_map_get_space(sched));
3864                 space = isl_space_unwrap(space);
3865                 map = isl_map_domain_map(isl_map_universe(space));
3866                 sched = isl_map_apply_domain(sched, map);
3867
3868                 map = group_tile(group);
3869                 map = isl_map_reverse(isl_map_domain_map(map));
3870                 sched = isl_map_apply_domain(sched, map);
3871         }
3872
3873         return copy_access(gen, sched, type, group, build, 0);
3874 }
3875
3876 /* Return code for reading into or writing from private memory
3877  * the given array reference group.
3878  *
3879  * Let S be the first shared_len dimensions of the computed schedule,
3880  * D the iteration domains, A the array and L the schedule correponding
3881  * to the generated loops.
3882  * "sched" is of the form
3883  *
3884  *      type[S -> A] -> L
3885  *
3886  * where type is either "read" or "write".
3887  * We apply the privatization D -> S(t), with t the thread ids,
3888  * to the access relation D -> A to obtain the privatized access relation
3889  *
3890  *      S(t) -> A
3891  *
3892  * We drop the type from "sched" and intersect with the privatized access
3893  * relation to obtain
3894  *
3895  *      [S(t) -> A] -> L
3896  */
3897 static __isl_give isl_ast_node *copy_group_private_accesses(
3898         struct gpu_gen *gen, struct gpu_array_ref_group *group,
3899         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
3900 {
3901         const char *type;
3902         int read;
3903         isl_union_map *priv;
3904         isl_union_map *access;
3905         isl_map *access_map;
3906
3907         type = isl_map_get_tuple_name(sched, isl_dim_in);
3908         read = !strcmp(type, "read");
3909
3910         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
3911         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3912                                         priv);
3913
3914         access = group_access_relation(group, read, !read);
3915         access = isl_union_map_apply_domain(access, priv);
3916         access_map = isl_map_from_union_map(access);
3917
3918         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
3919         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
3920
3921         return copy_access(gen, sched, type, group, build, 1);
3922 }
3923
3924 /* Return code for reading into or writing from shared or private memory.
3925  *
3926  * "schedule" is of the form
3927  *
3928  *      type[S -> A] -> L
3929  *
3930  * with S be the first shared_len dimensions of the computed schedule,
3931  * A the array and L the schedule correponding to the generated loops.
3932  * The array reference group is attached to "type".
3933  */
3934 static __isl_give isl_ast_node *create_access_leaf(
3935         struct gpu_gen *gen, __isl_take isl_map *schedule,
3936         __isl_take isl_ast_build *build)
3937 {
3938         struct gpu_array_ref_group *group;
3939         isl_id *id;
3940
3941         id = isl_map_get_tuple_id(schedule, isl_dim_in);
3942         group = isl_id_get_user(id);
3943         isl_id_free(id);
3944
3945         if (group->private_tile)
3946                 return copy_group_private_accesses(gen, group, schedule,
3947                                                         build);
3948         else
3949                 return copy_group_shared_accesses(gen, group, schedule,
3950                                                         build);
3951 }
3952
3953 /* Create a domain node representing a synchronization.
3954  */
3955 static __isl_give isl_ast_node *create_sync_leaf(
3956         struct gpu_gen *gen, __isl_take isl_map *schedule,
3957         __isl_take isl_ast_build *build)
3958 {
3959         struct ppcg_kernel_stmt *stmt;
3960         isl_id *id;
3961         isl_space *space;
3962         isl_ast_node *node;
3963         isl_ast_expr *expr;
3964
3965         isl_map_free(schedule);
3966
3967         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3968         if (!stmt)
3969                 return NULL;
3970
3971         stmt->type = ppcg_kernel_sync;
3972
3973         space = isl_ast_build_get_schedule_space(build);
3974         space = isl_space_from_domain(space);
3975         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
3976         expr = isl_ast_build_call_from_pw_multi_aff(build,
3977                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
3978         node = isl_ast_node_alloc_user(expr);
3979         isl_ast_build_free(build);
3980
3981         id = isl_id_alloc(gen->ctx, NULL, stmt);
3982         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3983         return isl_ast_node_set_annotation(node, id);
3984 }
3985
3986 /* This function is called during the code generation at the point
3987  * where the schedule domain element is completely determined by
3988  * the generated code.  The input schedule contains the original
3989  * statements as well as synchronization and copy "statements".
3990  * The latter are scheduled at different points than any of the original
3991  * statements, so they will only arrive here in isolation.
3992  *
3993  * If the current schedule only refers to a single statement,
3994  * we check if it is a copy or synchronization statement and
3995  * call the appropriate functions.
3996  * Otherwise, we assume we are dealing with the original statements
3997  * and we call create_domain_leaf.
3998  */
3999 static __isl_give isl_ast_node *create_kernel_leaf(
4000         __isl_take isl_ast_build *build, void *user)
4001 {
4002         struct gpu_gen *gen = (struct gpu_gen *) user;
4003         isl_map *map;
4004         isl_union_map *schedule;
4005         const char *name;
4006
4007         schedule = isl_ast_build_get_schedule(build);
4008
4009         if (isl_union_map_n_map(schedule) != 1)
4010                 return create_domain_leaf(schedule, build, user);
4011
4012         map = isl_map_from_union_map(schedule);
4013         name = isl_map_get_tuple_name(map, isl_dim_in);
4014         if (!strcmp(name, "read") || !strcmp(name, "write"))
4015                 return create_access_leaf(gen, map, build);
4016         if (!strcmp(name, "sync"))
4017                 return create_sync_leaf(gen, map, build);
4018
4019         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4020 }
4021
4022 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4023  * have value 0) and all even schedule dimensions as "unroll".
4024  *
4025  * That is, the options look as follows
4026  *
4027  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4028  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4029  *
4030  * The even positions are used to be able to schedule copying blocks
4031  * and synchronization before or after each level of the shared memory
4032  * tile loops and we want to make sure that code for these is generated
4033  * separately (within each level).
4034  */
4035 static __isl_give isl_ast_build *set_atomic_and_unroll(
4036         __isl_take isl_ast_build *build,
4037         __isl_take isl_space *space, int sched_len)
4038 {
4039         isl_ctx *ctx;
4040         isl_map *map;
4041         isl_constraint *c;
4042         isl_union_map *opt;
4043         isl_local_space *ls;
4044         int i, n;
4045
4046         ctx = isl_ast_build_get_ctx(build);
4047
4048         space = isl_space_params(space);
4049         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4050         space = isl_space_from_domain(space);
4051         space = isl_space_add_dims(space, isl_dim_out, 2);
4052         map = isl_map_universe(isl_space_copy(space));
4053         for (i = 0; i < sched_len; i += 2)
4054                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4055         ls = isl_local_space_from_space(isl_map_get_space(map));
4056         c = isl_equality_alloc(ls);
4057         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4058         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4059         c = isl_constraint_set_constant_si(c, 1);
4060         map = isl_map_add_constraint(map, c);
4061         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4062         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4063         opt = isl_union_map_from_map(map);
4064
4065         map = isl_map_universe(space);
4066         ls = isl_local_space_from_space(isl_map_get_space(map));
4067         c = isl_equality_alloc(ls);
4068         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4069         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4070         map = isl_map_add_constraint(map, c);
4071         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4072         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4073         opt = isl_union_map_add_map(opt, map);
4074
4075         build = isl_ast_build_set_options(build, opt);
4076
4077         return build;
4078 }
4079
4080 /* Return a map that maps a space of dimension gen->shared_len
4081  * to its last dimensions starting at gen->tile_first.
4082  * The range is of dimension
4083  *
4084  *      2 * (gen->shared_len - gen->tile_first) + 1
4085  *
4086  * The input dimensions are mapped to the odd dimensions in the output,
4087  * while the even dimensions (except 2*pos) are fixed to 0.
4088  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4089  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4090  * are mapped to the output.  The remaining input dimensions are projected
4091  * out and the corresponding output dimensions are fixed to 0.
4092  */
4093 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4094         __isl_take isl_space *space, int pos, int val)
4095 {
4096         int i, n;
4097         isl_map *proj;
4098
4099         space = isl_space_set_from_params(space);
4100         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4101         space = isl_space_map_from_set(space);
4102         proj = isl_map_identity(space);
4103         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4104         n = gen->shared_len - gen->tile_first;
4105         for (i = 0; i <= n; ++i) {
4106                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4107                 if (i == pos)
4108                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4109                 else
4110                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4111         }
4112
4113         if (pos < 0)
4114                 return proj;
4115
4116         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4117                                 gen->shared_len - (gen->tile_first + pos));
4118         for (i = pos; i < n; ++i)
4119                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4120
4121         return proj;
4122 }
4123
4124 /* Given the AST context schedule "schedule" and the mapping from
4125  * domains to the shared tile loops "shared_sched", add a schedule
4126  * for a synchronization operation at position "val" of loop level "pos".
4127  *
4128  * schedule is of the form
4129  *
4130  *      D -> L
4131  *
4132  * (with D the iteration domains and L the already generated loops),
4133  * while shared_sched is of the form
4134  *
4135  *      D -> S
4136  *
4137  * We combine them into
4138  *
4139  *      L -> S
4140  *
4141  * apply a mapping
4142  *
4143  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4144  *
4145  * and use the result as a schedule for "sync".
4146  */
4147 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4148         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4149         __isl_keep isl_union_map *shared_sched, int pos, int val)
4150 {
4151         isl_space *space;
4152         isl_map *proj, *map;
4153
4154         shared_sched = isl_union_map_copy(shared_sched);
4155         schedule = isl_union_map_copy(schedule);
4156
4157         space = isl_union_map_get_space(shared_sched);
4158         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4159         map = isl_map_from_union_map(schedule);
4160
4161         proj = insert_even(gen, space, pos, val);
4162         map = isl_map_apply_range(map, proj);
4163         map = isl_map_from_range(isl_map_wrap(map));
4164         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4165
4166         res = isl_union_map_add_map(res, map);
4167
4168         return res;
4169 }
4170
4171 /* Given the AST context schedule "schedule" and the mapping from
4172  * domains to the shared tile loops "shared_sched", add a schedule
4173  * for copying an array reference group to/from shared/private memory.
4174  * "read" is set if data should be copied from global memory
4175  * to shared/private memory.
4176  * "k" represents the current group
4177  * "s" is the total number of groups
4178  *
4179  * We schedule an operation before or after the innermost loop
4180  * of "shared_sched" that affects the tile of the array reference group.
4181  *
4182  * schedule is of the form
4183  *
4184  *      D -> L
4185  *
4186  * (with D the iteration domains and L the already generated loops),
4187  * while shared_sched is of the form
4188  *
4189  *      D -> S
4190  *
4191  * We first compute the access relation for the reference group
4192  *
4193  *      D -> A
4194  *
4195  * and combine it with shared_sched into
4196  *
4197  *      D -> [S -> A]
4198  *
4199  * If this results in an empty relation, no copying needs to be performed
4200  * at this point.
4201  * Otherwise, we invert the relation and combine it with "schedule" into
4202  *
4203  *      [S -> A] -> L
4204  *
4205  * The actual additional piece of the schedule is obtained from combining
4206  *
4207  *      [S -> A] -> S
4208  *
4209  * with a mapping
4210  *
4211  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4212  *
4213  * The position of "val" corresponds to the innermost loop that affects
4214  * the tile and the value indicates where the copying is scheduled
4215  * with respect to the actual kernel code (at value 0).
4216  * Reads are schedule before the code, writes to global memory from
4217  * private memory are scheduled at values 1 to s, writes to global
4218  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4219  *
4220  * If we are scheduling a read from global memory to shared memory,
4221  * we insert a synchronization before the kernel code (at the innermost
4222  * level).
4223  * If we are scheduling a write to global memory, then we add
4224  * a synchronization after all writes (at value 2 *s + 2).
4225  * However, there is no need for a synchronization after the outermost loop.
4226  * A write to global memory from private memory at the innermost level
4227  * does not require a synchronization, because it is covered by
4228  * the synchronization after the kernel inserted by body_schedule.
4229  */
4230 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4231         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4232         __isl_keep isl_union_map *shared_sched,
4233         struct gpu_array_ref_group *group, int read, int k, int s)
4234 {
4235         int n;
4236         int pos, val;
4237         isl_space *space;
4238         isl_union_map *access;
4239         isl_map *map, *proj, *access_map;
4240         isl_id *id;
4241
4242         access = group_access_relation(group, read, !read);
4243         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4244                                                 access);
4245
4246         if (isl_union_map_is_empty(access)) {
4247                 isl_union_map_free(access);
4248                 return res;
4249         }
4250
4251         access = isl_union_map_reverse(access);
4252         access = isl_union_map_apply_range(access,
4253                                             isl_union_map_copy(schedule));
4254         access_map = isl_map_from_union_map(access);
4255
4256         space = isl_space_copy(group->array->space);
4257         space = isl_space_from_range(space);
4258         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4259         map = isl_map_domain_map(isl_map_universe(space));
4260
4261         space = isl_union_map_get_space(schedule);
4262         pos = group->last_shared + 1 - gen->tile_first;
4263         assert(pos >= 0);
4264         if (read)
4265                 val = -2 - k;
4266         else if (group->private_tile)
4267                 val = 1 + k;
4268         else
4269                 val = 1 + s + 1 + k;
4270         proj = insert_even(gen, space, pos, val);
4271         map = isl_map_apply_range(map, proj);
4272
4273         access_map = isl_map_range_product(access_map, map);
4274
4275         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4276         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4277
4278         res = isl_union_map_add_map(res, access_map);
4279
4280         n = gen->shared_len - gen->tile_first;
4281         if (read) {
4282                 if (!group->private_tile)
4283                         res = add_sync_schedule(gen, res, schedule,
4284                                                 shared_sched, n, -1);
4285         } else {
4286                 if (pos == 0)
4287                         return res;
4288                 if (pos == n && group->private_tile)
4289                         return res;
4290                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4291                                         pos, 2 * s + 2);
4292         }
4293
4294         return res;
4295 }
4296
4297 /* Return a schedule for the shared tile loops based on the current
4298  * AST context schedule.
4299  *
4300  * We create a "shared_sched" that maps the domains to the first
4301  * shared_len dimensions of the computed schedule, project out the
4302  * first tile_first dimensions (as these are already covered by
4303  * the host code) and insert "statement-level" dimensions at even
4304  * positions so that we can schedule copy blocks and synchronization
4305  * before/after each level.
4306  *
4307  * In particular, copy blocks are inserted inside the innermost
4308  * level that affect the tile.  For the copying to global memory,
4309  * those from private memory are scheduled before those from shared
4310  * memory such that synchronization can be inserted between the two
4311  * at the innermost level.
4312  * Synchronization is inserted at the innermost level before the
4313  * actual kernel code if there is any copying from global memory
4314  * to shared memory.  It is inserted unconditionally at the innermost
4315  * level after the actual kernel code and the copying to global memory
4316  * from private memory (if any).  Finally, it is inserted after
4317  * any copying to global memory, except at the outermost level
4318  * and at the innermost level if there is no copying from shared
4319  * memory.  The copying from private memory is covered by the unconditional
4320  * synchronization at the innermost level.
4321  */
4322 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4323         __isl_take isl_union_map *schedule)
4324 {
4325         isl_space *space;
4326         isl_union_map *res;
4327         isl_union_map *shared_sched;
4328         isl_union_map *sched;
4329         isl_map *proj, *map;
4330         int i, j, k, s;
4331
4332         shared_sched = isl_union_map_copy(gen->tiled_sched);
4333         proj = projection(isl_union_map_get_space(shared_sched),
4334                                 gen->tiled_len, gen->shared_len);
4335         shared_sched = isl_union_map_apply_range(shared_sched,
4336                                 isl_union_map_from_map(proj));
4337         space = isl_union_map_get_space(shared_sched);
4338         proj = insert_even(gen, space, -1, 0);
4339         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4340                                 isl_union_map_from_map(proj));
4341
4342         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4343
4344         s = 0;
4345         for (i = 0; i < gen->prog->n_array; ++i)
4346                 s += gen->prog->array[i].n_group;
4347
4348         k = 0;
4349         for (i = 0; i < gen->prog->n_array; ++i) {
4350                 struct gpu_array_info *array = &gen->prog->array[i];
4351
4352                 for (j = 0; j < array->n_group; ++j) {
4353                         struct gpu_array_ref_group *group;
4354
4355                         group = array->groups[j];
4356                         if (!group->private_tile && !group->shared_tile)
4357                                 continue;
4358                         res = add_group_schedule(gen, res, schedule,
4359                                                 shared_sched, group, 0, k, s);
4360                         res = add_group_schedule(gen, res, schedule,
4361                                                 shared_sched, group, 1, k, s);
4362                         ++k;
4363                 }
4364         }
4365
4366         res = add_sync_schedule(gen, res, schedule, shared_sched,
4367                             gen->shared_len - gen->tile_first, 1 + s);
4368
4369         isl_union_map_free(shared_sched);
4370         isl_union_map_free(schedule);
4371
4372         return res;
4373 }
4374
4375 /* Generate code for "kernel" in the given "context".
4376  *
4377  * We first generate code for the shared tile loops (T1T, T1P and T2)
4378  * in a context that includes the block ids.
4379  * Within each iteration of these loops an additional code generation
4380  * is performed (within create_kernel_leaf) for the rest of the schedule
4381  * in a context that includes the thread ids.
4382  */
4383 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
4384         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
4385         __isl_keep isl_multi_pw_aff *grid_size)
4386 {
4387         isl_space *space;
4388         isl_set *set;
4389         isl_id_list *iterators;
4390         isl_union_map *schedule;
4391         isl_ast_node *tree;
4392         int sched_len;
4393
4394         schedule = isl_ast_build_get_schedule(build);
4395
4396         build = isl_ast_build_copy(build);
4397         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
4398         space = isl_ast_build_get_schedule_space(build);
4399         set = isl_set_universe(isl_space_copy(space));
4400         set = add_bounded_parameters_dynamic(set, grid_size, "b");
4401         build = isl_ast_build_restrict(build, set);
4402
4403         schedule = body_schedule(gen, schedule);
4404
4405         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
4406
4407         build = set_atomic_and_unroll(build, space, sched_len);
4408         iterators = generate_names(gen->ctx, sched_len, "g");
4409         build = isl_ast_build_set_iterators(build, iterators);
4410         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
4411         tree = isl_ast_build_ast_from_schedule(build, schedule);
4412         isl_ast_build_free(build);
4413
4414         return tree;
4415 }
4416
4417 /* Attach "id" to the given node.
4418  */
4419 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
4420         __isl_keep isl_ast_build *build, void *user)
4421 {
4422         isl_id *id = user;
4423
4424         node = isl_ast_node_set_annotation(node, id);
4425
4426         return node;
4427 }
4428
4429 /* Construct an AST node for performing a kernel launch and attach
4430  * the information about the kernel to that node.
4431  *
4432  * The kernel AST has been constructed in the context of the range
4433  * of "schedule".  In particular, the grid size has been computed
4434  * in the context.  We therefore still need to make sure that these
4435  * constraints are expressed in the code.  We do this by creating a schedule
4436  *
4437  *      kernel[] -> [S -> []]
4438  *
4439  * where S is the schedule domain, i.e., the range of "schedule".
4440  * The AST generation will then create a single call surrounded by
4441  * all the condition in "S" that have not been expressed yet.
4442  *
4443  * The kernel information is attached to this node in attach_id.
4444  */
4445 static __isl_give isl_ast_node *construct_launch(
4446         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
4447         __isl_take struct ppcg_kernel *kernel)
4448 {
4449         isl_id *id;
4450         isl_ctx *ctx;
4451         isl_union_set *domain;
4452         isl_set *set;
4453         isl_map *map;
4454         isl_ast_node *node;
4455
4456         ctx = isl_ast_build_get_ctx(build);
4457
4458         id = isl_id_alloc(ctx, NULL, kernel);
4459         id = isl_id_set_free_user(id, &ppcg_kernel_free);
4460
4461         domain = isl_union_map_range(schedule);
4462         set = isl_set_from_union_set(domain);
4463         map = isl_map_from_domain(set);
4464         map = isl_map_from_range(isl_map_wrap(map));
4465         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
4466         schedule = isl_union_map_from_map(map);
4467
4468         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
4469         node = isl_ast_build_ast_from_schedule(build, schedule);
4470         isl_ast_build_free(build);
4471
4472         return node;
4473 }
4474
4475 /* This function is called for each leaf in the AST of the host code.
4476  * We first specialize the schedule to the site of the leaf, compute
4477  * the size of shared memory and then construct the body of the host code
4478  * and the associated kernel.
4479  *
4480  * The necessary information for printing the kernel launch is
4481  * stored in a struct ppcg_kernel and attached to the leaf node
4482  * created to represent the launch.
4483  */
4484 static __isl_give isl_ast_node *create_host_leaf(
4485         __isl_take isl_ast_build *build, void *user)
4486 {
4487         struct gpu_gen *gen = (struct gpu_gen *) user;
4488         isl_id *id;
4489         isl_ast_node *node;
4490         struct ppcg_kernel *kernel;
4491         isl_set *host_domain;
4492         isl_union_map *schedule;
4493         isl_union_map *local_sched;
4494         isl_union_map *access;
4495         isl_union_set *domain;
4496         int i;
4497
4498         schedule = isl_ast_build_get_schedule(build);
4499
4500         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
4501         read_sizes(gen);
4502
4503         domain = isl_union_map_domain(isl_union_map_copy(schedule));
4504
4505         local_sched = isl_union_map_copy(gen->sched);
4506         local_sched = isl_union_map_intersect_domain(local_sched, domain);
4507         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
4508                                      isl_union_map_copy(gen->prog->write));
4509         access = isl_union_map_apply_domain(access,
4510                                             isl_union_map_copy(local_sched));
4511
4512         gen->tiled_sched = tile_schedule(gen, local_sched);
4513         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
4514         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
4515
4516         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
4517         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
4518         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
4519
4520         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
4521         if (!kernel)
4522                 goto error;
4523
4524         kernel->id = gen->kernel_id++;
4525         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
4526         kernel->grid_size = extract_grid_size(gen, kernel);
4527         extract_block_size(gen, kernel);
4528         kernel->arrays = isl_union_map_range(access);
4529         kernel->space = isl_ast_build_get_schedule_space(build);
4530
4531         gen->private_access = NULL;
4532         compute_shared_sched(gen);
4533         gen->privatization = compute_privatization(gen);
4534         group_references(gen);
4535         compute_private_access(gen);
4536         check_shared_memory_bound(gen);
4537         compute_group_tilings(gen);
4538         host_domain = isl_set_from_union_set(isl_union_map_range(
4539                                                 isl_union_map_copy(schedule)));
4540         localize_bounds(gen, kernel, host_domain);
4541
4542         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
4543
4544         kernel->tree = generate_kernel(gen, build, host_domain,
4545                                         kernel->grid_size);
4546         create_kernel_vars(gen, kernel);
4547
4548         free_local_array_info(gen);
4549         isl_map_free(gen->privatization);
4550         isl_union_map_free(gen->private_access);
4551         isl_union_map_free(gen->local_sched);
4552         isl_union_map_free(gen->tiled_sched);
4553         isl_union_map_free(gen->shared_sched);
4554         isl_union_map_free(gen->shared_proj);
4555         isl_set_free(host_domain);
4556         free(gen->tile_size);
4557
4558         node = construct_launch(build, schedule, kernel);
4559
4560         return node;
4561 error:
4562         isl_union_map_free(schedule);
4563         return NULL;
4564 }
4565
4566 /* Use isl to generate code for the outer gen->tile_first loops
4567  * of the global schedule in gen->sched, resulting in the host code.
4568  * Within each iteration of this partial schedule, i.e., for each kernel
4569  * launch, create_host_leaf takes care of generating the kernel code.
4570  */
4571 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
4572 {
4573         isl_ast_build *build;
4574         isl_ast_node *tree;
4575         isl_union_map *sched;
4576         isl_map *proj;
4577         isl_id_list *iterators;
4578
4579         sched = isl_union_map_copy(gen->sched);
4580         proj = projection(isl_union_map_get_space(sched),
4581                             gen->untiled_len, gen->tile_first);
4582         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4583
4584         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
4585         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
4586         iterators = generate_names(gen->ctx, gen->tile_first, "h");
4587         build = isl_ast_build_set_iterators(build, iterators);
4588         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
4589         tree = isl_ast_build_ast_from_schedule(build, sched);
4590         isl_ast_build_free(build);
4591
4592         return tree;
4593 }
4594
4595 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4596 {
4597         if (!str)
4598                 return NULL;
4599         return isl_union_map_read_from_str(ctx, str);
4600 }
4601
4602 /* Information about the outermost tilable bands in the forest of bands.
4603  *
4604  * tile_len and n_parallel are only sets on band_info structures
4605  * that correspond to outermost bands.  For other bands (in particular,
4606  * ancestors of the outermost bands), n_parallal is set to 0.
4607  *
4608  * prefix is the (padded) schedule leading up to the outermost tilable bands.
4609  *
4610  * tile_first is the number of schedule dimensions in prefix.
4611  *
4612  * suffix is the schedule of the outermost tilable bands and their descendants.
4613  */
4614 struct band_info {
4615         struct gpu_gen *gen;
4616         int tile_first;
4617         int tile_len;
4618         int n_parallel;
4619         isl_union_map *prefix;
4620         isl_union_map *suffix;
4621 };
4622
4623 /* Set tile_len and n_parallel of the statement to that of
4624  * their outermost band, recorded in the band_info.
4625  */
4626 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4627 {
4628         struct band_info *info = user;
4629         struct gpu_stmt *stmt;
4630         isl_id *id;
4631
4632         id = isl_map_get_tuple_id(map, isl_dim_in);
4633         stmt = find_stmt(info->gen->prog, id);
4634         isl_id_free(id);
4635
4636         stmt->tile_len = info->tile_len;
4637         stmt->n_parallel = info->n_parallel;
4638
4639         isl_map_free(map);
4640
4641         return 0;
4642 }
4643
4644 static void list_select_outer_band(struct gpu_gen *gen,
4645         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4646
4647 /* Check if this band has any parallel loops.  If so, take it as
4648  * the outermost tilable band.  If not, continue looking for the
4649  * outermost tilable band in the children of the current band.
4650  */
4651 static void band_select_outer_band(struct gpu_gen *gen,
4652         __isl_take isl_band *band, int pos, struct band_info *info)
4653 {
4654         int n = isl_band_n_member(band);
4655         int n_parallel;
4656
4657         for (n_parallel = 0; n_parallel < n; ++n_parallel)
4658                 if (!isl_band_member_is_zero_distance(band, n_parallel))
4659                         break;
4660
4661         info->n_parallel = n_parallel;
4662         if (n_parallel) {
4663                 gen->any_parallelism = 1;
4664                 info->gen = gen;
4665                 info->tile_first = pos;
4666                 info->tile_len = n;
4667                 info->prefix = isl_band_get_prefix_schedule(band);
4668                 info->suffix = isl_union_map_flat_range_product(
4669                                 isl_band_get_partial_schedule(band),
4670                                 isl_band_get_suffix_schedule(band));
4671                 isl_union_map_foreach_map(info->prefix,
4672                                             &set_stmt_tile_len, info);
4673         } else if (isl_band_has_children(band)) {
4674                 isl_band_list *children;
4675                 children = isl_band_get_children(band);
4676                 list_select_outer_band(gen, children, pos + n, info);
4677         } else {
4678                 info->gen = gen;
4679                 info->tile_first = pos + n;
4680                 info->tile_len = 0;
4681                 info->prefix = isl_union_map_flat_range_product(
4682                                 isl_band_get_prefix_schedule(band),
4683                                 isl_band_get_partial_schedule(band));
4684                 info->suffix = isl_band_get_suffix_schedule(band);
4685                 isl_union_map_foreach_map(info->prefix,
4686                                             &set_stmt_tile_len, info);
4687         }
4688
4689         isl_band_free(band);
4690 }
4691
4692 /* Comparison function that returns a non-zero value for band_infos
4693  * with different tile_len fields or different n_parallel fields.
4694  */
4695 static int cmp_band(const void *p1, const void *p2)
4696 {
4697         const struct band_info *info1 = p1;
4698         const struct band_info *info2 = p2;
4699
4700         if (info1->tile_len != info2->tile_len)
4701                 return info1->tile_len - info2->tile_len;
4702
4703         return info1->n_parallel - info2->n_parallel;
4704 }
4705
4706 /* Extend "umap" with coordinates with fixed value "val"
4707  * to a total length of "dst_len", assuming the original dimension is "src_len".
4708  */
4709 static __isl_give isl_union_map *extend_range(
4710         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
4711 {
4712         isl_space *dim;
4713         isl_map *map;
4714         int i;
4715
4716         dim = isl_union_map_get_space(umap);
4717         map = isl_map_reverse(projection(dim, dst_len, src_len));
4718         for (i = src_len; i < dst_len; ++i)
4719                 map = isl_map_fix_si(map, isl_dim_out, i, val);
4720
4721         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4722
4723         return umap;
4724 }
4725
4726 /* Group bands with the same values for tile_len and n_parallel.
4727  * The prefix schedule is then extended with a fixed coordinate that
4728  * is different for each such group.
4729  * Note that the actual values for this coordinate are not important.
4730  * The bands have already been effectively separated at a higher level
4731  * or they are independent and may be executed in parallel.
4732  * The list of band_info has been sorted before this functions is called.
4733  */
4734 static void separate_bands(struct band_info *info, int n)
4735 {
4736         int i;
4737         int j = 0;
4738
4739         for (i = 0; i < n; ++i) {
4740                 int l = info[i].tile_first;
4741
4742                 if (i &&
4743                     (info[i].tile_len != info[i - 1].tile_len ||
4744                      info[i].n_parallel != info[i - 1].n_parallel))
4745                         j++;
4746
4747                 info[i].prefix = extend_range(info[i].prefix,
4748                                                 l, l + 1, j);
4749                 info[i].tile_first = l + 1;
4750         }
4751 }
4752
4753 /* Select the outermost bands in the elements of the list, align
4754  * their prefix schedules, separate bands with different values
4755  * for tile_len and/or n_parallel and then combine the resulting
4756  * prefix and suffix schedules into a single pair of prefix and
4757  * suffix schedules for the entire list.
4758  */
4759 static void list_select_outer_band(struct gpu_gen *gen,
4760         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4761 {
4762         isl_band *band;
4763         int i;
4764         int n = isl_band_list_n_band(list);
4765         isl_ctx *ctx = isl_band_list_get_ctx(list);
4766         struct band_info *info;
4767         int max_tile_first;
4768         isl_union_map *prefix;
4769         isl_union_map *suffix;
4770
4771         assert(n >= 1);
4772         info = isl_calloc_array(ctx, struct band_info, n);
4773         assert(info);
4774
4775         max_tile_first = 0;
4776         for (i = 0; i < n; ++i) {
4777                 band = isl_band_list_get_band(list, i);
4778                 band_select_outer_band(gen, band, pos, &info[i]);
4779                 if (info[i].tile_first > max_tile_first)
4780                         max_tile_first = info[i].tile_first;
4781         }
4782
4783         for (i = 0; i < n; ++i) {
4784                 if (info[i].tile_first == max_tile_first)
4785                         continue;
4786                 info[i].prefix = extend_range(info[i].prefix,
4787                                         info[i].tile_first, max_tile_first, 0);
4788                 info[i].tile_first = max_tile_first;
4789         }
4790
4791         qsort(info, n, sizeof(struct band_info), &cmp_band);
4792
4793         for (i = 0; i < n - 1; ++i)
4794                 if (info[i].tile_len != info[i + 1].tile_len ||
4795                     info[i].n_parallel != info[i + 1].n_parallel)
4796                         break;
4797
4798         if (i < n -1)
4799                 separate_bands(info, n);
4800
4801         prefix = info[0].prefix;
4802         suffix = info[0].suffix;
4803
4804         for (i = 1; i < n; ++i) {
4805                 prefix = isl_union_map_union(prefix, info[i].prefix);
4806                 suffix = isl_union_map_union(suffix, info[i].suffix);
4807         }
4808
4809         list_info->tile_first = info[0].tile_first;
4810         list_info->tile_len = -1;
4811         list_info->prefix = prefix;
4812         list_info->suffix = suffix;
4813
4814         isl_band_list_free(list);
4815         free(info);
4816 }
4817
4818 /* Select the outermost tilable band that (by construction)
4819  * has at least one parallel loop.
4820  * The starting position of the aligned band is stored in the pair
4821  * gen->tile_first.
4822  * The sizes and number of parallel loops may be different in different
4823  * parts of the band forest and are therefore stored in the gpu_stmts.
4824  *
4825  * Return the complete schedule, with the tilable bands aligned
4826  * at gen->tile_first and padded with zero, if needed.
4827  */
4828 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
4829         __isl_keep isl_schedule *schedule)
4830 {
4831         isl_band_list *list;
4832         struct band_info info;
4833
4834         gen->n_parallel = 0;
4835         gen->tile_len = -1;
4836
4837         list = isl_schedule_get_band_forest(schedule);
4838
4839         if (isl_band_list_n_band(list) == 0) {
4840                 isl_band_list_free(list);
4841                 return isl_schedule_get_map(schedule);
4842         }
4843
4844         list_select_outer_band(gen, list, 0, &info);
4845
4846         gen->tile_first = info.tile_first;
4847         info.suffix = align_range(info.suffix);
4848
4849         return isl_union_map_flat_range_product(info.prefix, info.suffix);
4850 }
4851
4852 /* Set gen->untiled_len to the number of scheduling dimensions
4853  * for the schedule of the first domain.
4854  * We assume here that this number is the same for all domains.
4855  */
4856 static int set_untiled_len(__isl_take isl_map *map, void *user)
4857 {
4858         unsigned *untiled_len = user;
4859
4860         *untiled_len = isl_map_dim(map, isl_dim_out);
4861
4862         isl_map_free(map);
4863         return -1;
4864 }
4865
4866 /* Compute an appropriate schedule based on the accesses in
4867  * gen->read and gen->write.
4868  *
4869  * We use the dependences in gen->prog->scop to compute
4870  * a schedule that has a parallel loop in each tilable band.
4871  * Finally, we select the outermost tilable band.
4872  */
4873 static void compute_schedule(struct gpu_gen *gen)
4874 {
4875         isl_union_set *domain;
4876         isl_union_map *dep_raw, *dep;
4877         isl_union_map *sched;
4878         isl_schedule *schedule;
4879
4880         dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
4881
4882         dep = isl_union_map_copy(gen->prog->scop->dep_false);
4883         dep = isl_union_map_union(dep, dep_raw);
4884         dep = isl_union_map_coalesce(dep);
4885
4886         domain = isl_union_set_copy(gen->prog->scop->domain);
4887         domain = isl_union_set_intersect_params(domain,
4888                                 isl_set_copy(gen->prog->scop->context));
4889         schedule = isl_union_set_compute_schedule(isl_union_set_copy(domain),
4890                                 isl_union_map_copy(dep), dep);
4891         if (gen->options->debug->dump_schedule)
4892                 isl_schedule_dump(schedule);
4893
4894         sched = select_outer_tilable_band(gen, schedule);
4895
4896         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4897         sched = isl_union_map_intersect_domain(sched, domain);
4898         gen->sched = sched;
4899
4900         isl_schedule_free(schedule);
4901 }
4902
4903 /* Compute the sets of array elements that need to be copied in and out.
4904  *
4905  * In particular, for each array that is written anywhere in gen->prog and
4906  * that is visible outside the corresponding scop, we copy out its entire
4907  * extent.
4908  *
4909  * Any array elements that is read without first being written needs
4910  * to be copied in. Furthermore, if there are any array elements that
4911  * are copied out, but that are not written inside gen->prog, then
4912  * they also need to be copied in to ensure that the value after execution
4913  * is the same as the value before execution.
4914  * While computing the set of array elements that
4915  * are copied out but not written, we intersect both sets with the context.
4916  * This helps in those cases where the arrays are declared with a fixed size,
4917  * while the accesses are parametric and the context assigns a fixed value
4918  * to the parameters.
4919  */
4920 static void compute_copy_in_and_out(struct gpu_gen *gen)
4921 {
4922         int i;
4923         isl_union_set *write;
4924         isl_union_set *copy_in, *copy_out;
4925         isl_union_set *not_written;
4926         isl_union_map *uninitialized;
4927
4928         write = isl_union_map_range(isl_union_map_copy(gen->prog->write));
4929         write = isl_union_set_intersect_params(write,
4930                                             isl_set_copy(gen->prog->context));
4931         copy_out = isl_union_set_empty(isl_union_set_get_space(write));
4932
4933         for (i = 0; i < gen->prog->n_array; ++i) {
4934                 isl_space *space;
4935                 isl_set *write_i;
4936                 int empty;
4937
4938                 if (gen->prog->array[i].local)
4939                         continue;
4940
4941                 space = isl_space_copy(gen->prog->array[i].space);
4942                 write_i = isl_union_set_extract_set(write, space);
4943                 empty = isl_set_fast_is_empty(write_i);
4944                 isl_set_free(write_i);
4945                 if (empty)
4946                         continue;
4947
4948                 write_i = isl_set_copy(gen->prog->array[i].extent);
4949                 copy_out = isl_union_set_add_set(copy_out, write_i);
4950         }
4951
4952         copy_out = isl_union_set_intersect_params(copy_out,
4953                                             isl_set_copy(gen->prog->context));
4954
4955         gen->prog->copy_out = isl_union_set_copy(copy_out);
4956
4957         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
4958         copy_in = isl_union_map_range(uninitialized);
4959
4960         not_written = isl_union_set_subtract(copy_out, write);
4961         copy_in = isl_union_set_union(copy_in, not_written);
4962         gen->prog->copy_in = copy_in;
4963 }
4964
4965 static struct gpu_stmt_access **expr_extract_access(struct pet_expr *expr,
4966         struct gpu_stmt_access **next_access)
4967 {
4968         struct gpu_stmt_access *access;
4969         isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
4970
4971         access = isl_alloc_type(ctx, struct gpu_stmt_access);
4972         assert(access);
4973         access->next = NULL;
4974         access->read = expr->acc.read;
4975         access->write = expr->acc.write;
4976         access->access = isl_map_copy(expr->acc.access);
4977         access->ref_id = isl_id_copy(expr->acc.ref_id);
4978
4979         *next_access = access;
4980         next_access = &(*next_access)->next;
4981         return next_access;
4982 }
4983
4984 static struct gpu_stmt_access **expr_extract_accesses(struct pet_expr *expr,
4985         struct gpu_stmt_access **next_access)
4986 {
4987         int i;
4988
4989         for (i = 0; i < expr->n_arg; ++i)
4990                 next_access = expr_extract_accesses(expr->args[i],
4991                                                         next_access);
4992
4993         if (expr->type == pet_expr_access)
4994                 next_access = expr_extract_access(expr, next_access);
4995
4996         return next_access;
4997 }
4998
4999 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt)
5000 {
5001         struct gpu_stmt_access **next_access = &stmt->accesses;
5002
5003         stmt->accesses = NULL;
5004         expr_extract_accesses(stmt->stmt->body, next_access);
5005 }
5006
5007 /* Return an array of gpu_stmt representing the statements in "scop".
5008  */
5009 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5010         __isl_keep isl_set *context)
5011 {
5012         int i;
5013         struct gpu_stmt *stmts;
5014
5015         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->n_stmt);
5016         if (!stmts)
5017                 return NULL;
5018
5019         for (i = 0; i < scop->n_stmt; ++i) {
5020                 struct gpu_stmt *s = &stmts[i];
5021
5022                 s->id = isl_set_get_tuple_id(scop->stmts[i]->domain);
5023                 s->stmt = scop->stmts[i];
5024                 pet_stmt_extract_accesses(s);
5025         }
5026
5027         return stmts;
5028 }
5029
5030 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5031  */
5032 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5033 {
5034         struct gpu_gen *gen = user;
5035
5036         return gen->print(p, gen->prog, gen->tree, gen->print_user);
5037 }
5038
5039 /* Generate CUDA code for "scop" and print it to "p".
5040  * After generating an AST for the transformed scop as explained below,
5041  * we call "gen->print" to print the AST in the desired output format
5042  * to "p".
5043  *
5044  * If it turns out that it does not make sense to generate GPU code,
5045  * then we generate CPU code instead.
5046  *
5047  * The GPU code is generated in a context where at least one
5048  * statement instance is executed.  The corresponding guard (if any) is printed
5049  * around the entire generated GPU code, except for the declaration
5050  * of the arrays that are visible outside of the scop and that therefore
5051  * cannot be declared inside the body of any possible guard.
5052  *
5053  * We first compute a schedule that respects the dependences
5054  * of the original program and select the outermost band
5055  * of tilable dimensions that has at least one parallel loop.
5056  * We then have three blocks of dimensions
5057  *
5058  *      H               B                       G
5059  *
5060  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5061  * in
5062  *
5063  *      H       T               P               G
5064  *
5065  * For each iteration of the T loop and for each array, we compute
5066  * the array elements accessed by that iteration, construct a rectangular
5067  * box around it and shift it to the origin.  The result is used
5068  * as shared memory for the array.
5069  *
5070  * We then split off at most 2 parallel loops from the T loops and
5071  * at most 3 parallel loops from the P loops
5072  *
5073  *      H       T1      T2      P1      P2      G
5074  *
5075  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5076  * according to "grid"/"block" sizes.
5077  *
5078  *      H       T1T T1P T2      P1T P1P P2      G
5079  *
5080  * Finally, the T1P and P1P iterators are equated to the block and
5081  * thread dimensions respectively and so are effectively removed.
5082  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5083  * are run on the GPU.
5084  *
5085  * Code is generated in three stages.  We first generate code for the
5086  * host (the H loops), with iterators h%d.  Then, for each leaf node
5087  * of the resulting AST, we generate code for the shared loops (up to
5088  * and including T2), with iterators g%d and after equating the H loops
5089  * to h%d parameters and the T1P loops to the block dimensions.
5090  * Finally, we generate code for the remaining loops in a similar fashion.
5091  */
5092 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5093         struct gpu_gen *gen, struct ppcg_scop *scop,
5094         struct ppcg_options *options)
5095 {
5096         struct gpu_prog *prog;
5097         isl_ctx *ctx;
5098         isl_set *context, *guard;
5099
5100         if (!scop)
5101                 return isl_printer_free(p);
5102
5103         ctx = isl_printer_get_ctx(p);
5104         prog = gpu_prog_alloc(ctx, scop);
5105         if (!prog)
5106                 return isl_printer_free(p);
5107
5108         context = isl_set_copy(prog->context);
5109         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5110         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5111
5112         gen->prog = prog;
5113         gen->any_parallelism = 0;
5114         compute_schedule(gen);
5115
5116         if (!gen->any_parallelism) {
5117                 isl_set_free(context);
5118                 isl_set_free(guard);
5119                 p = print_cpu(p, scop, options);
5120         } else {
5121                 compute_copy_in_and_out(gen);
5122                 gen->tree = generate_host_code(gen);
5123                 p = ppcg_print_exposed_declarations(p, prog->scop);
5124                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5125                 isl_ast_node_free(gen->tree);
5126         }
5127
5128         isl_union_map_free(gen->sched);
5129
5130         gpu_prog_free(prog);
5131
5132         return p;
5133 }
5134
5135 /* Wrapper around generate for use as a ppcg_transform callback.
5136  */
5137 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5138         struct ppcg_scop *scop, void *user)
5139 {
5140         struct gpu_gen *gen = user;
5141
5142         return generate(p, gen, scop, gen->options);
5143 }
5144
5145 /* Transform the code in the file called "input" by replacing
5146  * all scops by corresponding GPU code and write the results to "out".
5147  */
5148 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5149         struct ppcg_options *options,
5150         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5151                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5152                 void *user), void *user)
5153 {
5154         struct gpu_gen gen;
5155         int r;
5156
5157         gen.ctx = ctx;
5158         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5159         gen.options = options;
5160         gen.kernel_id = 0;
5161         gen.print = print;
5162         gen.print_user = user;
5163
5164         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5165
5166         isl_union_map_free(gen.sizes);
5167
5168         return r;
5169 }
5170
5171 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5172 {
5173         struct gpu_prog *prog;
5174
5175         if (!scop)
5176                 return NULL;
5177
5178         prog = isl_calloc_type(ctx, struct gpu_prog);
5179         assert(prog);
5180
5181         prog->ctx = ctx;
5182         prog->scop = scop;
5183         prog->context = isl_set_copy(scop->context);
5184         prog->n_stmts = scop->n_stmt;
5185         prog->stmts = extract_stmts(ctx, scop, prog->context);
5186         prog->read = isl_union_map_copy(scop->reads);
5187         prog->write = isl_union_map_copy(scop->writes);
5188
5189         if (!prog->stmts)
5190                 return gpu_prog_free(prog);
5191
5192         if (collect_array_info(prog) < 0)
5193                 return gpu_prog_free(prog);
5194
5195         return prog;
5196 }
5197
5198 void *gpu_prog_free(struct gpu_prog *prog)
5199 {
5200         if (!prog)
5201                 return NULL;
5202         free_array_info(prog);
5203         free_stmts(prog->stmts, prog->n_stmts);
5204         isl_union_set_free(prog->copy_in);
5205         isl_union_set_free(prog->copy_out);
5206         isl_union_map_free(prog->read);
5207         isl_union_map_free(prog->write);
5208         isl_set_free(prog->context);
5209         free(prog);
5210         return NULL;
5211 }