gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
   33 /* The fields stride, shift and shift_map only contain valid information
   34  * if shift != NULL.
   35  * If so, they express that the current index is such that if you add shift,
   36  * then the result is always a multiple of stride.
   37  * shift_map contains the mapping
   38  *
   39  *      i -> (i + shift)/stride
   40  *
   41  * Let D represent the initial shared_len dimensions of the computed schedule.
   42  * The spaces of "lb" and "shift" are of the form
   43  *
   44  *      D -> [b]
   45  *
   46  * "shift_map" is of the form
   47  *
   48  *      [D -> i] -> [D -> (i + shift(D))/stride]
   49  */
   50 struct gpu_array_bound {
   51         isl_val *size;  /* size for this index */
   52         isl_aff *lb;    /* lower bound for this index, of the form D -> [b] */
   53
   54         isl_val *stride;
   55         isl_aff *shift;
   56         isl_basic_map *shift_map;
   57 };
  58
   59 /* A tile of an array.
   60  *
   61  * n is the number of dimensions (indices) of the array.
   62  * bound is an array of size "n" representing the lower bound
   63  *      and size for each index.
   64  *
   65  * tiling maps a tile in the global array to the corresponding
   66  * shared/private memory tile and is of the form
   67  *
   68  *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
   69  *
   70  * where D represents the initial shared_len dimensions
   71  * of the computed schedule.
   72  */
   73 struct gpu_array_tile {
   74         int n;  /* number of array indices */
   75         struct gpu_array_bound *bound;  /* "n" entries, one per index */
   76         isl_multi_aff *tiling;
   77 };
  78
  79 struct gpu_array_info;
  80
   81 /* A group of array references in a kernel that should be handled together.
   82  * If private_tile is not NULL, then it is mapped to registers.
   83  * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
   84  * Otherwise, it is accessed from global memory.
   85  */
   86 struct gpu_array_ref_group {
   87         /* The references in this group access this array. */
   88         struct gpu_array_info *array;
   89         /* Position of this group in the list of reference groups of array. */
   90         int nr;
   91
   92         /* The following fields are used during the construction of the groups.
   93          * access is the combined access relation relative to the shared
   94          * memory tiling.  In particular, the domain of the map corresponds
   95          * to the first shared_len dimensions of the computed schedule.
   96          * write is set if any access in the group is a write.
   97          * exact_write is set if all writes are definite writes.
   98          * slice is set if there is at least one access in the group
   99          * that refers to more than one element.
  100          */
  101         isl_map *access;
  102         int write;
  103         int exact_write;
  104         int slice;
  105
  106         /* The shared memory tile, NULL if none. */
  107         struct gpu_array_tile *shared_tile;
  108
  109         /* The private memory tile, NULL if none. */
  110         struct gpu_array_tile *private_tile;
  111
  112         /* References in this group; point to elements of a linked list. */
  113         int n_ref;
  114         struct gpu_stmt_access **refs;
  115
  116         /* Last shared memory tile dimension that affects tile of this group. */
  117         int last_shared;
  118 };
 119
  120 struct gpu_gen {
  121         isl_ctx *ctx;
  122         struct ppcg_options *options;
  123
  124         /* Callback for printing of AST in appropriate format. */
  125         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
  126                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
  127                 struct gpu_types *types, void *user);
  128         void *print_user;       /* presumably the "user" argument of print - TODO confirm */
  129
  130         struct gpu_prog *prog;
  131         /* The generated AST. */
  132         isl_ast_node *tree;
  133
  134         /* The sequence of types for which a definition has been printed. */
  135         struct gpu_types types;
  136
  137         /* User specified tile, grid and block sizes for each kernel */
  138         isl_union_map *sizes;
  139
  140         /* Effectively used tile, grid and block sizes for each kernel */
  141         isl_union_map *used_sizes;
  142
  143         /* Identifier of current kernel. */
  144         int kernel_id;
  145         /* Pointer to the current kernel. */
  146         struct ppcg_kernel *kernel;
  147         /* Does the computed schedule exhibit any parallelism? */
  148         int any_parallelism;
  149
  150         /* First tile dimension. */
  151         int tile_first;
  152         /* Number of tile dimensions. */
  153         int tile_len;
  154         /* Number of initial parallel loops among tile dimensions. */
  155         int n_parallel;
  156
  157         /* Number of dimensions determining shared memory. */
  158         int shared_len;
  159
  160         /* Number of rows in the untiled schedule. */
  161         int untiled_len;
  162         /* Number of rows in the tiled schedule. */
  163         int tiled_len;
  164         /* Number of rows in schedule after tiling/wrapping over threads. */
  165         int thread_tiled_len;
  166
  167         /* Global untiled schedule. */
  168         isl_union_map *sched;
  169         /* Local (per kernel launch) tiled schedule. */
  170         isl_union_map *tiled_sched;
  171         /* Local schedule per shared memory tile loop iteration. */
  172         isl_union_map *local_sched;
  173
  174         /* Local tiled schedule projected onto the shared tile loops and
  175          * the loops that will be wrapped over the threads,
  176          * with all shared tile loops parametrized.
  177          */
  178         isl_union_map *shared_sched;
  179         /* Projects out the loops that will be wrapped over the threads
  180          * from shared_sched.
  181          */
  182         isl_union_map *shared_proj;
  183
  184         /* A map that takes the range of shared_sched as input,
  185          * wraps the appropriate loops over the threads and then projects
  186          * out these loops.
  187          */
  188         isl_map *privatization;
  189
  190         /* The array reference group corresponding to copy_sched. */
  191         struct gpu_array_ref_group *copy_group;
  192
  193         /* Is any array in the current kernel marked force_private? */
  194         int any_force_private;
  195
  196         /* First loop to unroll (or -1 if none) in the current part of the
  197          * schedule.
  198          */
  199         int first_unroll;
  200
  201         int n_grid;     /* number of entries of grid_dim in use */
  202         int n_block;    /* number of entries of block_dim in use */
  203         /* Note: in the input file, the sizes of the grid and the blocks
  204          * are specified in the order x, y, z, but internally, the sizes
  205          * are stored in reverse order, so that the last element always
  206          * refers to the x dimension.
  207          */
  208         int grid_dim[2];
  209         int block_dim[3];
  210         int *tile_size; /* allocated with tile_len entries by read_tile_sizes */
  211 };
 212
 213 /* Print the name of the local copy of a given group of array references.
 214  */
 215 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 216         struct gpu_array_ref_group *group)
 217 {
 218         int global = 0;
 219
 220         if (group->private_tile)
 221                 p = isl_printer_print_str(p, "private_");
 222         else if (group->shared_tile)
 223                 p = isl_printer_print_str(p, "shared_");
 224         else
 225                 global = 1;
 226         p = isl_printer_print_str(p, group->array->name);
 227         if (!global && group->array->n_group > 1) {
 228                 p = isl_printer_print_str(p, "_");
 229                 p = isl_printer_print_int(p, group->nr);
 230         }
 231
 232         return p;
 233 }
 234
 235 /* Collect all references to the given array and store pointers to them
 236  * in array->refs.
 237  *
 238  * If the array contains structures, then there is no need to collect
 239  * the references since we will not be computing any reference groups.
 240  */
 241 static void collect_references(struct gpu_prog *prog,
 242         struct gpu_array_info *array)
 243 {
 244         int i;
 245         int n;
 246
 247         if (array->has_compound_element)
 248                 return;
 249
 250         n = 0;
 251         for (i = 0; i < prog->n_stmts; ++i) {
 252                 struct gpu_stmt *stmt = &prog->stmts[i];
 253                 struct gpu_stmt_access *access;
 254
 255                 for (access = stmt->accesses; access; access = access->next) {
 256                         const char *name;
 257                         name = isl_map_get_tuple_name(access->access,
 258                                                       isl_dim_out);
 259                         if (name && !strcmp(array->name, name))
 260                                 n++;
 261                 }
 262         }
 263
 264         array->n_ref = n;
 265         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 266         assert(array->refs);
 267
 268         n = 0;
 269         for (i = 0; i < prog->n_stmts; ++i) {
 270                 struct gpu_stmt *stmt = &prog->stmts[i];
 271                 struct gpu_stmt_access *access;
 272
 273                 for (access = stmt->accesses; access; access = access->next) {
 274                         const char *name;
 275                         name = isl_map_get_tuple_name(access->access,
 276                                                       isl_dim_out);
 277                         if (!name || strcmp(array->name, name))
 278                                 continue;
 279
 280                         array->refs[n++] = access;
 281                 }
 282         }
 283 }
 284
 285 /* Create a gpu_array_tile for an array of dimension "n_index".
 286  */
 287 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 288 {
 289         int i;
 290         struct gpu_array_tile *tile;
 291
 292         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 293         assert(tile);
 294
 295         tile->n = n_index;
 296
 297         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 298         assert(tile->bound);
 299
 300         for (i = 0; i < n_index; ++i) {
 301                 tile->bound[i].size = NULL;
 302                 tile->bound[i].lb = NULL;
 303                 tile->bound[i].stride = NULL;
 304                 tile->bound[i].shift = NULL;
 305                 tile->bound[i].shift_map = NULL;
 306         }
 307
 308         return tile;
 309 }
 310
 311 static void *free_tile(struct gpu_array_tile *tile)
 312 {
 313         int j;
 314
 315         if (!tile)
 316                 return NULL;
 317
 318         for (j = 0; j < tile->n; ++j) {
 319                 isl_val_free(tile->bound[j].size);
 320                 isl_val_free(tile->bound[j].stride);
 321                 isl_aff_free(tile->bound[j].lb);
 322                 isl_aff_free(tile->bound[j].shift);
 323                 isl_basic_map_free(tile->bound[j].shift_map);
 324         }
 325         free(tile->bound);
 326         isl_multi_aff_free(tile->tiling);
 327         free(tile);
 328
 329         return NULL;
 330 }
 331
 332 static struct pet_array *find_array(struct ppcg_scop *scop,
 333         __isl_keep isl_set *accessed)
 334 {
 335         int i;
 336         isl_id *id;
 337
 338         id = isl_set_get_tuple_id(accessed);
 339
 340         for (i = 0; i < scop->pet->n_array; ++i) {
 341                 isl_id *id_i;
 342
 343                 id_i = isl_set_get_tuple_id(scop->pet->arrays[i]->extent);
 344                 isl_id_free(id_i);
 345                 if (id == id_i)
 346                         break;
 347         }
 348         isl_id_free(id);
 349
 350         return i < scop->pet->n_array ? scop->pet->arrays[i] : NULL;
 351 }
 352
 353 /* Compute and return the extent of "array", taking into account the set of
 354  * accessed elements.
 355  *
 356  * In particular, the extent in the outer dimension is taken
 357  * from "accessed", while then extent in the remaing dimensions
 358  * are taken from array->extent.
 359  *
 360  * The extent in the outer dimension cannot be taken from array->extent
 361  * because that may be unbounded.  Furthermore, even if it is bounded,
 362  * it may be larger than the piece of the array that is being accessed.
 363  */
 364 static __isl_give isl_set *compute_extent(struct pet_array *array,
 365         __isl_keep isl_set *accessed)
 366 {
 367         int n_index;
 368         isl_id *id;
 369         isl_set *outer;
 370         isl_set *extent;
 371
 372         extent = isl_set_copy(array->extent);
 373
 374         n_index = isl_set_dim(accessed, isl_dim_set);
 375         if (n_index == 0)
 376                 return extent;
 377
 378         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 379         outer = isl_set_copy(accessed);
 380         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 381         extent = isl_set_flat_product(outer, extent);
 382         id = isl_set_get_tuple_id(accessed);
 383         extent = isl_set_set_tuple_id(extent, id);
 384
 385         return extent;
 386 }
 387
 388 /* Is the array "array" being extracted a read-only scalar?
 389  *
 390  * That is, is "array" a scalar that is never possibly written to.
 391  * An array containing structures is never considered to be a scalar.
 392  */
 393 static int is_read_only_scalar(struct gpu_array_info *array,
 394         struct gpu_prog *prog)
 395 {
 396         isl_set *space;
 397         isl_union_map *write;
 398         int empty;
 399
 400         if (array->has_compound_element)
 401                 return 0;
 402         if (array->n_index != 0)
 403                 return 0;
 404
 405         write = isl_union_map_copy(prog->may_write);
 406         space = isl_set_universe(isl_space_copy(array->space));
 407         write = isl_union_map_intersect_range(write,
 408                                                 isl_union_set_from_set(space));
 409         empty = isl_union_map_is_empty(write);
 410         isl_union_map_free(write);
 411
 412         return empty;
 413 }
 414
 415 /* Compute bounds on the host arrays based on the accessed elements
 416  * and collect all references to the array.
 417  *
 418  * If the array is zero-dimensional and does not contain structures,
 419  * i.e., if the array is a scalar, we check whether it is read-only.
 420  */
 421 static int extract_array_info(__isl_take isl_set *array, void *user)
 422 {
 423         int i;
 424         struct gpu_prog *prog = (struct gpu_prog *)user;
 425         const char *name;
 426         int n_index;
 427         isl_pw_aff **bounds;
 428         struct pet_array *pa;
 429         struct gpu_array_info *info;
 430         isl_set *extent;
 431
 432         info = &prog->array[prog->n_array];
 433         prog->n_array++;
 434
 435         n_index = isl_set_dim(array, isl_dim_set);
 436         name = isl_set_get_tuple_name(array);
 437         bounds = isl_alloc_array(isl_set_get_ctx(array),
 438                                  isl_pw_aff *, n_index);
 439         if (!bounds)
 440                 goto error;
 441
 442         info->space = isl_set_get_space(array);
 443         info->name = strdup(name);
 444         info->n_index = n_index;
 445         info->bound = bounds;
 446         info->linearize = prog->scop->options->linearize_device_arrays;
 447
 448         pa = find_array(prog->scop, array);
 449         if (!pa)
 450                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 451                         "unable to find array in scop", goto error);
 452
 453         info->type = strdup(pa->element_type);
 454         info->size = pa->element_size;
 455         info->local = pa->declared && !pa->exposed;
 456         info->has_compound_element = pa->element_is_record;
 457         info->read_only_scalar = is_read_only_scalar(info, prog);
 458
 459         extent = compute_extent(pa, array);
 460         info->extent = extent;
 461         for (i = 0; i < n_index; ++i) {
 462                 isl_set *dom;
 463                 isl_local_space *ls;
 464                 isl_aff *one;
 465                 isl_pw_aff *bound;
 466
 467                 dom = isl_set_copy(extent);
 468                 dom = isl_set_project_out(dom, isl_dim_set, i + 1,
 469                                             n_index - (i + 1));
 470                 dom = isl_set_project_out(dom, isl_dim_set, 0, i);
 471                 if (!isl_set_dim_has_upper_bound(dom, isl_dim_set, 0)) {
 472                         fprintf(stderr, "unable to determine extent of '%s' "
 473                                 "in dimension %d\n", info->name, i);
 474                         dom = isl_set_free(dom);
 475                 }
 476                 bound = isl_set_dim_max(dom, 0);
 477                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 478                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 479                 one = isl_aff_zero_on_domain(ls);
 480                 one = isl_aff_add_constant_si(one, 1);
 481                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 482                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 483
 484                 bounds[i] = bound;
 485                 if (!isl_pw_aff_is_cst(bound))
 486                         info->linearize = 1;
 487         }
 488
 489         collect_references(prog, info);
 490
 491         isl_set_free(array);
 492         return 0;
 493 error:
 494         isl_set_free(array);
 495         return -1;
 496 }
 497
 498 /* Remove independence from the order constraints "order" on array "array".
 499  * Since the pairs of iterations in the filter relation of an independence
 500  * are guaranteed to be completely independent by the user, there is
 501  * no need to ensure that live ranges are ordered along thong pairs.
 502  * We make an exception for local variables, though, as the independence
 503  * guarantee does not apply to those.
 504  *
 505  * The order constraints are used in two places.
 506  * Those on scalars are used in check_scalar_live_ranges to check if
 507  * we need to force the scalar to be private.  Any non-local scalar
 508  * should not be forced scalar if it only appears in independent loops.
 509  * Those on non-scalars are added to the coincidence constraints
 510  * in compute_schedule because we do not support any array expansion.
 511  * Accesses to non-local arrays should not prevent a loop from being
 512  * considered coincident so we should indeed remove those constraints
 513  * from the order constraints.
 514  */
 515 static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
 516         struct gpu_array_info *array, __isl_take isl_union_map *order)
 517 {
 518         int i;
 519
 520         for (i = 0; i < prog->scop->pet->n_independence; ++i) {
 521                 struct pet_independence *pi = prog->scop->pet->independences[i];
 522                 if (isl_union_set_contains(pi->local, array->space))
 523                         continue;
 524
 525                 order = isl_union_map_subtract(order,
 526                                                 isl_union_map_copy(pi->filter));
 527         }
 528
 529         return order;
 530 }
 531
 532 /* For each array in "prog", store the (untagged) order dependences
 533  * derived from the array in array->dep_order.
 534  * In particular, consider all references that access the given array
 535  * and take the order dependences that have one of these references
 536  * as source.  (Since an order dependence relates two references to
 537  * the same array, the target of these order dependences will also
 538  * be one of these references.)
 539  * Additionally, store the union of these array->dep_order relations
 540  * for all non-scalar arrays in prog->array_order.
 541  */
 542 void collect_order_dependences(struct gpu_prog *prog)
 543 {
 544         int i;
 545         isl_space *space;
 546         isl_union_map *accesses;
 547
 548         space = isl_union_map_get_space(prog->read);
 549         prog->array_order = isl_union_map_empty(space);
 550
 551         accesses = isl_union_map_copy(prog->scop->tagged_reads);
 552         accesses = isl_union_map_union(accesses,
 553                             isl_union_map_copy(prog->scop->tagged_may_writes));
 554         accesses = isl_union_map_universe(accesses);
 555         accesses = isl_union_map_apply_range(accesses,
 556                                             isl_union_map_copy(prog->to_outer));
 557
 558         for (i = 0; i < prog->n_array; ++i) {
 559                 struct gpu_array_info *array = &prog->array[i];
 560                 isl_set *set;
 561                 isl_union_set *uset;
 562                 isl_union_map *order;
 563
 564                 set = isl_set_universe(isl_space_copy(array->space));
 565                 uset = isl_union_set_from_set(set);
 566                 uset = isl_union_map_domain(
 567                     isl_union_map_intersect_range(isl_union_map_copy(accesses),
 568                                                     uset));
 569                 order = isl_union_map_copy(prog->scop->tagged_dep_order);
 570                 order = isl_union_map_intersect_domain(order, uset);
 571                 order = isl_union_map_zip(order);
 572                 order = isl_union_set_unwrap(isl_union_map_domain(order));
 573                 order = remove_independences(prog, array, order);
 574                 array->dep_order = order;
 575
 576                 if (gpu_array_is_scalar(array))
 577                         continue;
 578
 579                 prog->array_order = isl_union_map_union(prog->array_order,
 580                                         isl_union_map_copy(array->dep_order));
 581         }
 582
 583         isl_union_map_free(accesses);
 584 }
 585
 586 /* Construct a gpu_array_info for each array possibly accessed by "prog" and
 587  * collect them in prog->array.
 588  *
 589  * If there are any member accesses involved, then they are first mapped
 590  * to the outer arrays of structs.
 591  *
 592  * If we are allowing live range reordering, then also set
 593  * the dep_order field.  Otherwise leave it NULL.
 594  */
 595 static int collect_array_info(struct gpu_prog *prog)
 596 {
 597         int r;
 598         isl_union_set *arrays;
 599
 600         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 601         arrays = isl_union_set_union(arrays,
 602                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 603
 604         arrays = isl_union_set_apply(arrays,
 605                                         isl_union_map_copy(prog->to_outer));
 606
 607         arrays = isl_union_set_coalesce(arrays);
 608
 609         prog->n_array = isl_union_set_n_set(arrays);
 610         prog->array = isl_calloc_array(prog->ctx,
 611                                      struct gpu_array_info, prog->n_array);
 612         assert(prog->array);
 613         prog->n_array = 0;
 614         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 615         isl_union_set_free(arrays);
 616
 617         if (prog->scop->options->live_range_reordering)
 618                 collect_order_dependences(prog);
 619
 620         return r;
 621 }
 622
 623 static void free_array_info(struct gpu_prog *prog)
 624 {
 625         int i, j;
 626
 627         for (i = 0; i < prog->n_array; ++i) {
 628                 int n_index = prog->array[i].n_index;
 629                 free(prog->array[i].type);
 630                 free(prog->array[i].name);
 631                 for (j = 0; j < n_index; ++j)
 632                         isl_pw_aff_free(prog->array[i].bound[j]);
 633                 isl_space_free(prog->array[i].space);
 634                 isl_set_free(prog->array[i].extent);
 635                 free(prog->array[i].bound);
 636                 free(prog->array[i].refs);
 637                 isl_union_map_free(prog->array[i].dep_order);
 638         }
 639         free(prog->array);
 640 }
 641
 642 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 643  * as an array or through a pointer reference, but as a single data element.
 644  * At the moment, scalars are represented as zero-dimensional arrays.
 645  * A zero-dimensional array containing structures is not considered
 646  * to be a scalar.
 647  */
 648 int gpu_array_is_scalar(struct gpu_array_info *array)
 649 {
 650         return !array->has_compound_element && array->n_index == 0;
 651 }
 652
 653 /* Is "array" a read-only scalar?
 654  */
 655 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 656 {
 657         return array->read_only_scalar;
 658 }
 659
 660 /* Return the set of parameter values for which the array has a positive
 661  * size in all dimensions.
 662  * If the sizes are only valid for some parameter values, then those
 663  * constraints are also taken into account.
 664  */
 665 __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array)
 666 {
 667         int i;
 668         isl_space *space;
 669         isl_set *guard;
 670
 671         space = isl_space_params(isl_space_copy(array->space));
 672         guard = isl_set_universe(space);
 673
 674         for (i = 0; i < array->n_index; ++i) {
 675                 isl_pw_aff *bound;
 676                 isl_set *guard_i, *zero;
 677
 678                 bound = isl_pw_aff_copy(array->bound[i]);
 679                 guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound));
 680                 zero = isl_pw_aff_zero_set(bound);
 681                 guard_i = isl_set_subtract(guard_i, zero);
 682                 guard = isl_set_intersect(guard, guard_i);
 683         }
 684
 685         return guard;
 686 }
 687
  688 /* Internal data structure for extract_size_of_type.
  689  * "type" specifies the name of the space that we want to extract.
  690  * "res" is used to store the subset of that space (NULL if none was found).
  691  */
  692 struct ppcg_extract_size_data {
  693         const char *type;       /* compared against the tuple name of each set */
  694         isl_set *res;   /* set by extract_size_of_type on a match */
  695 };
 696
 697 /* This function is called for each set in a union_set.
 698  * If the name of the set matches data->type, we store the
 699  * set in data->res.
 700  */
 701 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 702 {
 703         struct ppcg_extract_size_data *data = user;
 704         const char *name;
 705
 706         name = isl_set_get_tuple_name(size);
 707         if (name && !strcmp(name, data->type)) {
 708                 data->res = size;
 709                 return -1;
 710         }
 711
 712         isl_set_free(size);
 713         return 0;
 714 }
 715
 716 /* Given a union map { kernel[i] -> *[...] },
 717  * return the range in the space called "type" for the kernel with
 718  * sequence number "id".
 719  */
 720 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 721         const char *type, int id)
 722 {
 723         isl_space *space;
 724         isl_set *dom;
 725         isl_union_set *local_sizes;
 726         struct ppcg_extract_size_data data = { type, NULL };
 727
 728         if (!sizes)
 729                 return NULL;
 730
 731         space = isl_union_map_get_space(sizes);
 732         space = isl_space_set_from_params(space);
 733         space = isl_space_add_dims(space, isl_dim_set, 1);
 734         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 735         dom = isl_set_universe(space);
 736         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 737
 738         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 739                                         isl_union_map_copy(sizes));
 740         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 741         isl_union_set_free(local_sizes);
 742         return data.res;
 743 }
 744
 745 /* Given a singleton set, extract the first (at most *len) elements
 746  * of the single integer tuple into *sizes and update *len if needed.
 747  */
 748 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 749 {
 750         int i;
 751         int dim;
 752
 753         if (!set)
 754                 return;
 755
 756         dim = isl_set_dim(set, isl_dim_set);
 757         if (dim < *len)
 758                 *len = dim;
 759
 760         for (i = 0; i < *len; ++i) {
 761                 isl_val *v;
 762
 763                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 764                 assert(v);
 765
 766                 sizes[i] = isl_val_get_num_si(v);
 767                 isl_val_free(v);
 768         }
 769
 770         isl_set_free(set);
 771 }
 772
 773 /* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes,
 774  * if the option debug->dump_sizes is set.
 775  */
 776 static void set_used_sizes(struct gpu_gen *gen, const char *type, int id,
 777         int *sizes, int len)
 778 {
 779         int i;
 780         isl_space *space;
 781         isl_map *map;
 782
 783         if (!gen->options->debug->dump_sizes)
 784                 return;
 785
 786         space = isl_union_map_get_space(gen->used_sizes);
 787         space = isl_space_set_from_params(space);
 788         space = isl_space_add_dims(space, isl_dim_set, 1);
 789         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 790         space = isl_space_from_domain(space);
 791         space = isl_space_add_dims(space, isl_dim_out, len);
 792         space = isl_space_set_tuple_name(space, isl_dim_out, type);
 793
 794         map = isl_map_universe(space);
 795         map = isl_map_fix_si(map, isl_dim_in, 0, id);
 796         for (i = 0; i < len; ++i)
 797                 map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]);
 798
 799         gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map);
 800 }
 801
 802 /* Extract user specified "tile" sizes from the "sizes" command line option,
 803  * defaulting to option->tile_size in each dimension.
 804  * Add the effectively used sizes to gen->used_sizes.
 805  */
 806 static void read_tile_sizes(struct gpu_gen *gen)
 807 {
 808         int n;
 809         isl_set *size;
 810
 811         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 812         assert(gen->tile_size);
 813         for (n = 0; n < gen->tile_len; ++n)
 814                 gen->tile_size[n] = gen->options->tile_size;
 815
 816         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 817         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 818         set_used_sizes(gen, "tile", gen->kernel_id,
 819                         gen->tile_size, gen->tile_len);
 820
 821         if (gen->n_parallel > gen->tile_len)
 822                 gen->n_parallel = gen->tile_len;
 823 }
 824
 825 /* Extract user specified "block" sizes from the "sizes" command line option,
 826  * after filling in some potentially useful defaults.
 827  * Add the effectively used sizes to gen->used_sizes.
 828  */
 829 static void read_block_sizes(struct gpu_gen *gen)
 830 {
 831         int n;
 832         isl_set *size;
 833
 834         n = gen->n_parallel;
 835         gen->n_block = (n <= 3) ? n : 3;
 836         switch (gen->n_block) {
 837         case 1:
 838                 gen->block_dim[0] = 512;
 839                 break;
 840         case 2:
 841                 gen->block_dim[0] = 32;
 842                 gen->block_dim[1] = 16;
 843                 break;
 844         default:
 845                 gen->block_dim[0] = 32;
 846                 gen->block_dim[1] = 4;
 847                 gen->block_dim[2] = 4;
 848                 break;
 849         }
 850
 851         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 852         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 853         set_used_sizes(gen, "block", gen->kernel_id,
 854                         gen->block_dim, gen->n_block);
 855 }
 856
 857 /* Extract user specified "grid" sizes from the "sizes" command line option,
 858  * after filling in some potentially useful defaults.
 859  * Add the effectively used sizes to gen->used_sizes.
 860  */
 861 static void read_grid_sizes(struct gpu_gen *gen)
 862 {
 863         int n = gen->n_parallel;
 864         isl_set *size;
 865
 866         gen->n_grid = (n <= 2) ? n : 2;
 867         switch (gen->n_grid) {
 868         case 1:
 869                 gen->grid_dim[0] = 32768;
 870                 break;
 871         default:
 872                 gen->grid_dim[0] = 256;
 873                 gen->grid_dim[1] = 256;
 874                 break;
 875         }
 876
 877         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 878         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 879         set_used_sizes(gen, "grid", gen->kernel_id, gen->grid_dim, gen->n_grid);
 880 }
 881
 882 /* Extract user specified sizes from the "sizes" command line option
 883  * after filling in some potentially useful defaults.
 884  */
 885 static void read_sizes(struct gpu_gen *gen)
 886 {
 887         read_tile_sizes(gen);
 888         read_block_sizes(gen);
 889         read_grid_sizes(gen);
 890 }
 891
 892 static void *free_stmts(struct gpu_stmt *stmts, int n)
 893 {
 894         int i;
 895
 896         if (!stmts)
 897                 return NULL;
 898
 899         for (i = 0; i < n; ++i) {
 900                 struct gpu_stmt_access *access, *next;
 901
 902                 for (access = stmts[i].accesses; access; access = next) {
 903                         next = access->next;
 904                         isl_id_free(access->ref_id);
 905                         isl_map_free(access->access);
 906                         isl_map_free(access->tagged_access);
 907                         free(access);
 908                 }
 909
 910                 isl_id_free(stmts[i].id);
 911         }
 912         free(stmts);
 913
 914         return NULL;
 915 }
 916
 917 /* Construct a map from a domain of dimensionality "len"
 918  * to a domain of dimensionality "len" + "tile_len" that tiles
 919  * the "tile_len" coordinates starting at "first".
 920  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 921  * "dim" prescribes the parameters.
 922  */
 923 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 924         int first, int tile_len, int *tile_size)
 925 {
 926         int i;
 927         isl_basic_map *bmap;
 928         isl_constraint *c;
 929         isl_local_space *ls;
 930
 931         dim = isl_space_add_dims(dim, isl_dim_in, len);
 932         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 933         bmap = isl_basic_map_universe(isl_space_copy(dim));
 934         ls = isl_local_space_from_space(dim);
 935
 936         for (i = 0; i < len - tile_len; ++i) {
 937                 int j = i < first ? i : i + tile_len;
 938                 int k = i < first ? i : i + 2 * tile_len;
 939
 940                 c = isl_equality_alloc(isl_local_space_copy(ls));
 941                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 942                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 943                 bmap = isl_basic_map_add_constraint(bmap, c);
 944         }
 945
 946         for (i = 0; i < tile_len; ++i) {
 947                 c = isl_equality_alloc(isl_local_space_copy(ls));
 948                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 949                                                 first + i, -1);
 950                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 951                                                 first + i, tile_size[i]);
 952                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 953                                                 first + i + tile_len, 1);
 954                 bmap = isl_basic_map_add_constraint(bmap, c);
 955
 956                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 957                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 958                                                    first + i + tile_len, 1);
 959                 bmap = isl_basic_map_add_constraint(bmap, c);
 960
 961                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 962                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 963                                                    first + i + tile_len, -1);
 964                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 965                 bmap = isl_basic_map_add_constraint(bmap, c);
 966         }
 967
 968         isl_local_space_free(ls);
 969
 970         return isl_map_from_basic_map(bmap);
 971 }
 972
 973 /* Construct a map from a domain of dimensionality "len"
 974  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 975  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 976  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 977  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 978  * that are projected out at the end.
 979  * "dim" prescribes the parameters.
 980  */
 981 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 982         int first, int wrap_len, int *wrap_size)
 983 {
 984         int i;
 985         isl_basic_map *bmap;
 986         isl_constraint *c;
 987         isl_local_space *ls;
 988
 989         dim = isl_space_add_dims(dim, isl_dim_in, len);
 990         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 991         bmap = isl_basic_map_universe(isl_space_copy(dim));
 992         ls = isl_local_space_from_space(dim);
 993
 994         for (i = 0; i < len; ++i) {
 995                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 996
 997                 c = isl_equality_alloc(isl_local_space_copy(ls));
 998                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 999                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
1000                 bmap = isl_basic_map_add_constraint(bmap, c);
1001         }
1002
1003         for (i = 0; i < wrap_len; ++i) {
1004                 c = isl_equality_alloc(isl_local_space_copy(ls));
1005                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
1006                                                     first + i, -1);
1007                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
1008                                                     first + wrap_len + i, 1);
1009                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
1010                                     first + 2 * wrap_len + i, wrap_size[i]);
1011                 bmap = isl_basic_map_add_constraint(bmap, c);
1012
1013                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1014                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
1015                                                     first + wrap_len + i, 1);
1016                 bmap = isl_basic_map_add_constraint(bmap, c);
1017
1018                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1019                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
1020                                                     first + wrap_len + i, -1);
1021                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
1022                 bmap = isl_basic_map_add_constraint(bmap, c);
1023         }
1024
1025         isl_local_space_free(ls);
1026
1027         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
1028                                 first + 2 * wrap_len, wrap_len);
1029
1030         return isl_map_from_basic_map(bmap);
1031 }
1032
1033 /* Add "n" parameters named prefix%d.
1034  */
1035 static __isl_give isl_set *add_params( __isl_take isl_set *set,
1036         int n, const char *prefix)
1037 {
1038         int i;
1039         unsigned nparam;
1040         char name[20];
1041
1042         nparam = isl_set_dim(set, isl_dim_param);
1043         set = isl_set_add_dims(set, isl_dim_param, n);
1044
1045         for (i = 0; i < n; ++i) {
1046                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1047                 set = isl_set_set_dim_name(set, isl_dim_param,
1048                                             nparam + i, name);
1049         }
1050
1051         return set;
1052 }
1053
1054 /* Equate the "n" dimensions of "set" starting at "first" to
1055  * freshly created parameters named prefix%d.
1056  */
1057 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
1058         int first, int n, const char *prefix)
1059 {
1060         int i;
1061         unsigned nparam;
1062
1063         nparam = isl_set_dim(set, isl_dim_param);
1064
1065         set = add_params(set, n, prefix);
1066
1067         for (i = 0; i < n; ++i)
1068                 set = isl_set_equate(set, isl_dim_param, nparam + i,
1069                                         isl_dim_set, first + i);
1070
1071         return set;
1072 }
1073
1074 /* Given a parameter space "space", create a set of dimension "len"
1075  * of which the "n" dimensions starting at "first" are equated to
1076  * freshly created parameters named prefix%d.
1077  */
1078 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
1079         int len, int first, int n, const char *prefix)
1080 {
1081         isl_set *set;
1082
1083         space = isl_space_set_from_params(space);
1084         space = isl_space_add_dims(space, isl_dim_set, len);
1085         set = isl_set_universe(space);
1086
1087         return parametrize(set, first, n, prefix);
1088 }
1089
1090 /* Tile the B loops over the tile sizes and then tile/wrap
1091  * the T1 loops over the blocks.
1092  */
1093 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
1094         __isl_take isl_union_map *sched)
1095 {
1096         isl_space *dim;
1097         isl_map *tiling, *block_tiling;
1098
1099         dim = isl_union_map_get_space(sched);
1100         tiling = tile(isl_space_copy(dim), gen->untiled_len,
1101                       gen->tile_first, gen->tile_len, gen->tile_size);
1102
1103         if (gen->options->wrap)
1104                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1105                                 gen->tile_first, gen->n_grid, gen->grid_dim);
1106         else
1107                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1108                                 gen->tile_first, gen->n_grid, gen->grid_dim);
1109
1110         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1111
1112         tiling = isl_map_apply_range(tiling, block_tiling);
1113
1114         sched = isl_union_map_apply_range(sched,
1115                                              isl_union_map_from_map(tiling));
1116
1117         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1118
1119         return sched;
1120 }
1121
1122 /* Equate the "T1P" iterators in the tiled schedule "sched"
1123  * to the block dimensions.
1124  */
1125 static __isl_give isl_union_map *parametrize_tiled_schedule(
1126         struct gpu_gen *gen, __isl_take isl_union_map *sched)
1127 {
1128         isl_space *dim;
1129         isl_set *par;
1130
1131         dim = isl_union_map_get_space(sched);
1132         par = parametrization(dim, gen->tiled_len,
1133                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1134         sched = isl_union_map_intersect_range(sched,
1135                                                 isl_union_set_from_set(par));
1136
1137         return sched;
1138 }
1139
1140 /* Tile/wrap the P1 loops over the threads.
1141  */
1142 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
1143         __isl_take isl_union_map *sched)
1144 {
1145         isl_space *dim;
1146         isl_map *tiling;
1147         isl_set *par;
1148
1149         dim = isl_union_map_get_space(sched);
1150
1151         if (gen->options->wrap)
1152                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1153                                 gen->shared_len, gen->n_block, gen->block_dim);
1154         else
1155                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1156                                 gen->shared_len, gen->n_block, gen->block_dim);
1157         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1158
1159         sched = isl_union_map_apply_range(sched,
1160                                              isl_union_map_from_map(tiling));
1161
1162         par = parametrization(dim, gen->thread_tiled_len,
1163                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1164                 gen->n_block, "t");
1165         sched = isl_union_map_intersect_range(sched,
1166                                                 isl_union_set_from_set(par));
1167
1168         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1169
1170         return sched;
1171 }
1172
1173 /* If the user asked for it, scale the shared memory tile loops
1174  * (T1T and T2) of "sched" by gen->tile_size[i].
1175  * If we are not performing "wrapping", then additionally scale the T1P
1176  * loops by gen->grid_dim[i].
1177  */
1178 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
1179         __isl_take isl_union_map *sched)
1180 {
1181         int i;
1182         isl_space *dim;
1183         isl_basic_map *scale;
1184         isl_constraint *c;
1185         isl_local_space *ls;
1186
1187         if (!gen->options->scale_tile_loops)
1188                 return sched;
1189
1190         dim = isl_union_map_get_space(sched);
1191         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1192         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1193         scale = isl_basic_map_universe(isl_space_copy(dim));
1194         ls = isl_local_space_from_space(dim);
1195
1196         for (i = 0; i < gen->tiled_len; ++i) {
1197                 int f = 1;
1198
1199                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1200                         f = gen->tile_size[i - gen->tile_first];
1201                         if (!gen->options->wrap)
1202                                 f *= gen->grid_dim[i - gen->tile_first];
1203                 } else if (i >= gen->tile_first + gen->n_grid &&
1204                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1205                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1206                 }
1207
1208                 c = isl_equality_alloc(isl_local_space_copy(ls));
1209                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1210                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1211                 scale = isl_basic_map_add_constraint(scale, c);
1212         }
1213
1214         isl_local_space_free(ls);
1215
1216         sched = isl_union_map_apply_range(sched,
1217                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1218
1219         return sched;
1220 }
1221
1222 /* If we are not performing "wrapping" and if the user asked for it,
1223  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1224  */
1225 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1226         __isl_take isl_union_map *sched)
1227 {
1228         int i;
1229         isl_space *dim;
1230         isl_basic_map *scale;
1231         isl_constraint *c;
1232         isl_local_space *ls;
1233
1234         if (gen->options->wrap)
1235                 return sched;
1236         if (!gen->options->scale_tile_loops)
1237                 return sched;
1238
1239         dim = isl_union_map_get_space(sched);
1240         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1241         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1242         scale = isl_basic_map_universe(isl_space_copy(dim));
1243         ls = isl_local_space_from_space(dim);
1244
1245         for (i = 0; i < gen->thread_tiled_len; ++i) {
1246                 int f = 1;
1247
1248                 if (i >= gen->shared_len &&
1249                     i < gen->shared_len + gen->n_block)
1250                         f = gen->block_dim[i - gen->shared_len];
1251
1252                 c = isl_equality_alloc(isl_local_space_copy(ls));
1253                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1254                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1255                 scale = isl_basic_map_add_constraint(scale, c);
1256         }
1257
1258         isl_local_space_free(ls);
1259
1260         sched = isl_union_map_apply_range(sched,
1261                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1262
1263         return sched;
1264 }
1265
1266 /* If we are not performing "wrapping" and if the user asked for it,
1267  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1268  */
1269 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1270         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1271 {
1272         int i;
1273         isl_space *dim;
1274         isl_basic_map *scale;
1275         isl_constraint *c;
1276         isl_local_space *ls;
1277
1278         if (gen->options->wrap)
1279                 return sched;
1280         if (!gen->options->scale_tile_loops)
1281                 return sched;
1282
1283         dim = isl_union_map_get_space(sched);
1284         dim = isl_space_add_dims(dim, isl_dim_in, len);
1285         dim = isl_space_add_dims(dim, isl_dim_out, len);
1286         scale = isl_basic_map_universe(isl_space_copy(dim));
1287         ls = isl_local_space_from_space(dim);
1288
1289         for (i = 0; i < len; ++i) {
1290                 int f = 1;
1291
1292                 if (i >= first && i < first + n_tile)
1293                         f = gen->kernel->block_dim[i - first];
1294
1295                 c = isl_equality_alloc(isl_local_space_copy(ls));
1296                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1297                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1298                 scale = isl_basic_map_add_constraint(scale, c);
1299         }
1300
1301         isl_local_space_free(ls);
1302
1303         sched = isl_union_map_apply_range(sched,
1304                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1305
1306         return sched;
1307 }
1308
1309 /* Add "len" parameters p[i] called prefix%d,
1310  * with bounds to 0 <= p[i] < size[i].
1311  */
1312 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1313         int len, int *size, const char *prefix)
1314 {
1315         int i;
1316         unsigned nparam;
1317         isl_space *dim;
1318         isl_basic_set *bset;
1319         isl_constraint *c;
1320         isl_local_space *ls;
1321         char name[20];
1322
1323         nparam = isl_set_dim(set, isl_dim_param);
1324         set = isl_set_add_dims(set, isl_dim_param, len);
1325
1326         for (i = 0; i < len; ++i) {
1327                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1328                 set = isl_set_set_dim_name(set, isl_dim_param,
1329                                             nparam + i, name);
1330         }
1331
1332         dim = isl_set_get_space(set);
1333         bset = isl_basic_set_universe(isl_space_copy(dim));
1334         ls = isl_local_space_from_space(dim);
1335
1336         for (i = 0; i < len; ++i) {
1337                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1338                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1339                                                         nparam + i, 1);
1340                 bset = isl_basic_set_add_constraint(bset, c);
1341
1342                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1343                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1344                                                         nparam + i, -1);
1345                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1346                 bset = isl_basic_set_add_constraint(bset, c);
1347         }
1348
1349         isl_local_space_free(ls);
1350
1351         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1352 }
1353
1354 /* Add "len" parameters p[i] called prefix%d and intersect "set"
1355  * with
1356  *
1357  *      { : 0 <= p[i] < size[i] }
1358  *
1359  * or an overapproximation.
1360  */
1361 static __isl_give isl_set *add_bounded_parameters_dynamic(
1362         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1363         const char *prefix)
1364 {
1365         int i, len;
1366         unsigned nparam;
1367         isl_space *space;
1368         isl_local_space *ls;
1369         char name[20];
1370
1371         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1372         nparam = isl_set_dim(set, isl_dim_param);
1373         set = isl_set_add_dims(set, isl_dim_param, len);
1374
1375         for (i = 0; i < len; ++i) {
1376                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1377                 set = isl_set_set_dim_name(set, isl_dim_param,
1378                                             nparam + i, name);
1379         }
1380
1381         space = isl_space_params(isl_set_get_space(set));
1382         ls = isl_local_space_from_space(space);
1383         for (i = 0; i < len; ++i) {
1384                 isl_pw_aff *param, *size_i, *zero;
1385                 isl_set *bound;
1386
1387                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1388                                                 isl_dim_param, nparam + i);
1389
1390                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1391                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1392                 bound = isl_set_from_basic_set(isl_set_simple_hull(bound));
1393                 set = isl_set_intersect_params(set, bound);
1394
1395                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1396                 bound = isl_pw_aff_ge_set(param, zero);
1397                 set = isl_set_intersect_params(set, bound);
1398         }
1399         isl_local_space_free(ls);
1400
1401         return set;
1402 }
1403
1404 /* Construct a map from an access to group->array to the corresponding
1405  * shared/private memory tile.
1406  * The map is of the form
1407  *
1408  *      { [D[i] -> A[a]] -> T[t] }
1409  *
1410  * where D represents the initial shared_len dimensions
1411  * of the computed schedule.
1412  */
1413 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1414 {
1415         struct gpu_array_tile *tile;
1416         isl_multi_aff *tiling;
1417
1418         tile = group->private_tile;
1419         if (!tile)
1420                 tile = group->shared_tile;
1421
1422         tiling = isl_multi_aff_copy(tile->tiling);
1423
1424         return isl_map_from_multi_aff(tiling);
1425 }
1426
1427 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1428  */
1429 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1430         unsigned pos)
1431 {
1432         isl_val *v;
1433         int fixed;
1434
1435         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1436         if (!v)
1437                 return -1;
1438         fixed = isl_val_is_int(v);
1439         isl_val_free(v);
1440
1441         return fixed;
1442 }
1443
1444 /* Given a schedule that iterates over all elements in a piece of an array,
1445  * perform tiling/wrapping over the threads.
1446  *
1447  * In particular, we tile the final iterators so that the final thread
1448  * dimension runs over the final array dimension.
1449  * However, if those final iterators have only a single iteration,
1450  * we try to tile earlier iterators instead.
1451  */
1452 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1453         __isl_take isl_map *sched)
1454 {
1455         isl_space *dim;
1456         isl_union_map *usched;
1457         isl_map *tiling;
1458         isl_set *par;
1459         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1460         int n_tile;
1461         int first;
1462
1463         n_tile = gen->kernel->n_block;
1464         if (n_tile > nvar) {
1465                 int i;
1466                 sched = isl_map_insert_dims(sched,
1467                                                 isl_dim_out, 0, n_tile - nvar);
1468                 for (i = 0; i < n_tile - nvar; ++i)
1469                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1470                 nvar = n_tile;
1471         }
1472
1473         first = nvar - n_tile;
1474
1475         for (; first > 0; first --)
1476                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1477                         break;
1478
1479         dim = isl_map_get_space(sched);
1480         dim = isl_space_params(dim);
1481         if (gen->options->wrap)
1482                 tiling = wrap(isl_space_copy(dim), nvar, first,
1483                                 n_tile, gen->kernel->block_dim);
1484         else
1485                 tiling = tile(isl_space_copy(dim), nvar, first,
1486                                 n_tile, gen->kernel->block_dim);
1487         sched = isl_map_apply_range(sched, tiling);
1488
1489         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1490         sched = isl_map_intersect_range(sched, par);
1491
1492         usched = isl_union_map_from_map(sched);
1493         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1494                                          first, n_tile);
1495         sched = isl_map_from_union_map(usched);
1496
1497         return sched;
1498 }
1499
1500 /* Return the union of all read (read = 1) and/or write (write = 1)
1501  * access relations in the group.
1502  */
1503 static __isl_give isl_union_map *group_access_relation(
1504         struct gpu_array_ref_group *group, int read, int write)
1505 {
1506         int i;
1507         isl_union_map *access;
1508
1509         access = isl_union_map_empty(isl_map_get_space(group->access));
1510         for (i = 0; i < group->n_ref; ++i) {
1511                 isl_map *map_i;
1512
1513                 if (!((read && group->refs[i]->read) ||
1514                      (write && group->refs[i]->write)))
1515                         continue;
1516                 map_i = isl_map_copy(group->refs[i]->access);
1517                 access = isl_union_map_union(access,
1518                                             isl_union_map_from_map(map_i));
1519         }
1520
1521         return access;
1522 }
1523
1524 /* Return the union of all tagged access relations in the group.
1525  */
1526 static __isl_give isl_union_map *group_tagged_access_relation(
1527         struct gpu_array_ref_group *group)
1528 {
1529         int i;
1530         isl_union_map *access;
1531
1532         access = isl_union_map_empty(isl_map_get_space(group->access));
1533         for (i = 0; i < group->n_ref; ++i) {
1534                 isl_map *map_i;
1535
1536                 map_i = isl_map_copy(group->refs[i]->tagged_access);
1537                 access = isl_union_map_union(access,
1538                                             isl_union_map_from_map(map_i));
1539         }
1540
1541         return access;
1542 }
1543
1544 /* Return the extent of "array", recomputed from the bounds.
1545  * The recomputed extent may be simpler than the original extent.
1546  */
1547 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1548 {
1549         int i;
1550         isl_id *id;
1551         isl_space *space;
1552         isl_local_space *ls;
1553         isl_set *extent;
1554
1555         id = isl_set_get_tuple_id(array->extent);
1556         space = isl_set_get_space(array->extent);
1557         extent = isl_set_universe(isl_space_copy(space));
1558         ls = isl_local_space_from_space(space);
1559         for (i = 0; i < array->n_index; ++i) {
1560                 isl_pw_aff *bound;
1561                 isl_aff *aff;
1562                 isl_pw_aff *index;
1563                 isl_set *lt;
1564
1565                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1566
1567                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1568                                                 isl_dim_set, i);
1569                 index = isl_pw_aff_from_aff(aff);
1570                 bound = isl_pw_aff_copy(array->bound[i]);
1571                 bound = isl_pw_aff_from_range(bound);
1572                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1573                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1574                                                 isl_id_copy(id));
1575                 lt = isl_pw_aff_lt_set(index, bound);
1576                 extent = isl_set_intersect(extent, lt);
1577         }
1578         isl_local_space_free(ls);
1579         isl_id_free(id);
1580
1581         return extent;
1582 }
1583
1584 /* Return a map from the first shared_len dimensions of the computed
1585  * schedule to the array tile in
1586  * global memory that corresponds to the shared memory copy.
1587  *
1588  * In particular, return a map
1589  *
1590  *      { D[i] -> A[a] }
1591  *
1592  * with constraints
1593  *
1594  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1595  *
1596  * and
1597  *
1598  *      0 <= a <= array_size - 1                                        (2)
1599  *
1600  * Note that if some stride has been detected (i.e., when
1601  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1602  * to the shifted and scaled down version.
1603  *
1604  * Constraints (1) are obtained by mapping the size constraints on the
1605  * shared/private memory tile back to the access relation.
1606  * Constraints (2) are obtained from the (recomputed) extent.
1607  */
1608 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1609 {
1610         int i;
1611         int n_index = group->array->n_index;
1612         isl_map *tile;
1613         isl_space *space;
1614         isl_set *local;
1615         isl_set *extent;
1616
1617         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1618         space = isl_space_range(space);
1619         local = isl_set_universe(space);
1620         for (i = 0; i < n_index; ++i) {
1621                 isl_val *bound;
1622
1623                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1624                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1625                 bound = isl_val_sub_ui(bound, 1);
1626                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1627         }
1628         local = isl_set_preimage_multi_aff(local,
1629                                 isl_multi_aff_copy(group->shared_tile->tiling));
1630         tile = isl_set_unwrap(local);
1631         extent = array_extent(group->array);
1632         tile = isl_map_intersect_range(tile, extent);
1633
1634         return tile;
1635 }
1636
1637 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1638  * return the corresponding mapping from the AST schedule to
1639  * to the first shared_len dimensions of the schedule computed by PPCG.
1640  */
1641 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1642         __isl_take isl_pw_multi_aff *iterator_map)
1643 {
1644         isl_union_map *umap;
1645         isl_space *space;
1646         isl_map *map, *sched;;
1647
1648         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1649         space = isl_space_from_domain(space);
1650         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1651
1652         umap = isl_union_map_copy(gen->shared_sched);
1653         umap = isl_union_map_apply_range(umap,
1654                         isl_union_map_copy(gen->shared_proj));
1655         map = isl_union_map_extract_map(umap, space);
1656         isl_union_map_free(umap);
1657
1658         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1659         sched = isl_map_detect_equalities(sched);
1660
1661         return isl_pw_multi_aff_from_map(sched);
1662 }
1663
1664 /* Set unroll[j] if the input dimension j is involved in
1665  * the index expression represented by ma.
1666  */
1667 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1668         void *user)
1669 {
1670         int i, j;
1671         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1672         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1673         int *unroll = user;
1674
1675         for (i = 0; i < n_out; ++i) {
1676                 isl_aff *aff;
1677
1678                 aff = isl_multi_aff_get_aff(ma, i);
1679                 for (j = 0; j < n_in; ++j)
1680                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1681                                 unroll[j] = 1;
1682                 isl_aff_free(aff);
1683         }
1684
1685         isl_set_free(set);
1686         isl_multi_aff_free(ma);
1687         return 0;
1688 }
1689
1690 /* Given an array pos mapping input dimensions to the corresponding
1691  * output dimension, construct the corresponding map.
1692  */
1693 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1694         int *pos, int len)
1695 {
1696         int i;
1697         isl_constraint *c;
1698         isl_basic_map *bmap;
1699         isl_local_space *ls;
1700
1701         dim = isl_space_add_dims(dim, isl_dim_in, len);
1702         dim = isl_space_add_dims(dim, isl_dim_out, len);
1703         bmap = isl_basic_map_universe(isl_space_copy(dim));
1704         ls = isl_local_space_from_space(dim);
1705
1706         for (i = 0; i < len; ++i) {
1707                 c = isl_equality_alloc(isl_local_space_copy(ls));
1708                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1709                                                       -1);
1710                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1711                                                       1);
1712                 bmap = isl_basic_map_add_constraint(bmap, c);
1713         }
1714         isl_local_space_free(ls);
1715
1716         return isl_map_from_basic_map(bmap);
1717 }
1718
1719 /* Remove the private tiles from all array reference groups,
1720  * except for the groups of arrays that are marked force_private.
1721  */
1722 static void remove_private_tiles(struct gpu_gen *gen)
1723 {
1724         int i, j;
1725
1726         for (i = 0; i < gen->prog->n_array; ++i) {
1727                 struct gpu_array_info *array = &gen->prog->array[i];
1728
1729                 if (array->force_private)
1730                         continue;
1731
1732                 for (j = 0; j < array->n_group; ++j) {
1733                         struct gpu_array_ref_group *group = array->groups[j];
1734
1735                         group->private_tile = free_tile(group->private_tile);
1736                 }
1737         }
1738 }
1739
1740 /* Find all loops involved in any of the index expressions for any of
1741  * the private accesses, move them innermost and then mark them as
1742  * requiring unrolling by setting gen->first_unroll.
1743  * The loops involved should all be parallel because of the checks
1744  * we performed in check_private_group_access.  Moving them innermost
1745  * is therefore a valid transformation.
1746  *
1747  * If any of the arrays are marked force_private, however, then
1748  * those loops may not be parallel with respect to the marked arrays.
1749  * If any of the loops would have to be moved innermost for the
1750  * (non forced) private accesses and if there are any force_private
1751  * arrays, then we revert the decision to map the selected arrays
1752  * to private memory.  An alternative solution would be to expand
1753  * the force_private arrays.
1754  *
1755  * Loops up to gen->shared_len are generated before the mapping to
1756  * threads is applied.  They should therefore be ignored.
1757  *
1758  * We compute the hidden equalities of the schedule first
1759  * since we will need them in our calls to isl_pw_multi_aff_from_map
1760  * and because we want to make sure that the same equalities
1761  * are also available to the code generator.
1762  */
1763 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1764         __isl_take isl_union_map *sched)
1765 {
1766         int i, j;
1767         int unroll[gen->thread_tiled_len];
1768         int perm[gen->thread_tiled_len];
1769         isl_space *dim;
1770         isl_map *permute;
1771         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1772
1773         gen->first_unroll = -1;
1774
1775         sched = isl_union_map_detect_equalities(sched);
1776         for (i = 0; i < gen->thread_tiled_len; ++i)
1777                 unroll[i] = 0;
1778         for (i = 0; i < gen->prog->n_array; ++i) {
1779                 struct gpu_array_info *array = &gen->prog->array[i];
1780
1781                 for (j = 0; j < array->n_group; ++j) {
1782                         isl_union_map *access;
1783                         isl_map *acc;
1784                         isl_pw_multi_aff *pma;
1785
1786                         if (!array->groups[j]->private_tile)
1787                                 continue;
1788
1789                         access = group_access_relation(array->groups[j], 1, 1);
1790                         access = isl_union_map_apply_domain(access,
1791                                                 isl_union_map_copy(sched));
1792
1793                         acc = isl_map_from_union_map(access);
1794                         pma = isl_pw_multi_aff_from_map(acc);
1795                         isl_pw_multi_aff_foreach_piece(pma,
1796                                                         &check_unroll, unroll);
1797
1798                         isl_pw_multi_aff_free(pma);
1799                 }
1800         }
1801
1802         for (i = gen->shared_len; i < len; ++i)
1803                 if (unroll[i])
1804                         break;
1805
1806         if (i >= len)
1807                 return sched;
1808
1809         for (i = len; i < gen->thread_tiled_len; ++i)
1810                 if (unroll[i])
1811                         return sched;
1812
1813         if (gen->any_force_private) {
1814                 remove_private_tiles(gen);
1815                 return sched;
1816         }
1817
1818         j = 0;
1819         for (i = 0; i < gen->shared_len; ++i)
1820                 perm[i] = j++;
1821         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1822                 if (!unroll[i])
1823                         perm[i] = j++;
1824         gen->first_unroll = j - gen->shared_len;
1825         for (i = gen->shared_len; i < len; ++i)
1826                 if (unroll[i])
1827                         perm[i] = j++;
1828
1829         dim = isl_union_map_get_space(sched);
1830         permute = permutation(dim, perm, gen->thread_tiled_len);
1831         sched = isl_union_map_apply_range(sched,
1832                                           isl_union_map_from_map(permute));
1833
1834         return sched;
1835 }
1836
1837 /* Given a constraint
1838  *
1839  *              a(p,i) + j = g f(e)
1840  *
1841  * or -a(p,i) - j = g f(e) if sign < 0,
1842  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1843  * a(p,i) is assumed to be an expression in only the parameters
1844  * and the input dimensions.
1845  */
1846 static void extract_stride(__isl_keep isl_constraint *c,
1847         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1848 {
1849         int i;
1850         isl_val *v;
1851         isl_space *space;
1852         unsigned nparam;
1853         unsigned nvar;
1854         isl_aff *aff;
1855
1856         isl_val_free(bound->stride);
1857         bound->stride = isl_val_copy(stride);
1858
1859         space = isl_constraint_get_space(c);
1860         space = isl_space_domain(space);
1861
1862         nparam = isl_space_dim(space, isl_dim_param);
1863         nvar = isl_space_dim(space, isl_dim_set);
1864
1865         v = isl_constraint_get_constant_val(c);
1866         if (sign < 0)
1867                 v = isl_val_neg(v);
1868         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1869         aff = isl_aff_set_constant_val(aff, v);
1870
1871         for (i = 0; i < nparam; ++i) {
1872                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1873                         continue;
1874                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1875                 if (sign < 0)
1876                         v = isl_val_neg(v);
1877                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1878         }
1879
1880         for (i = 0; i < nvar; ++i) {
1881                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1882                         continue;
1883                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1884                 if (sign < 0)
1885                         v = isl_val_neg(v);
1886                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1887         }
1888
1889         bound->shift = aff;
1890 }
1891
1892 /* Given an equality constraint of a map with a single output dimension j,
1893  * check if the constraint is of the form
1894  *
1895  *              a(p,i) + j = g f(e)
1896  *
1897  * with a(p,i) an expression in the parameters and input dimensions
1898  * and f(e) an expression in the existentially quantified variables.
1899  * If so, and if g is larger than any such g from a previously considered
1900  * constraint, then call extract_stride to record the stride information
1901  * in bound.
1902  */
1903 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1904 {
1905         int i;
1906         isl_ctx *ctx;
1907         isl_val *v;
1908         unsigned n_div;
1909         struct gpu_array_bound *bound = user;
1910
1911         ctx = isl_constraint_get_ctx(c);
1912         n_div = isl_constraint_dim(c, isl_dim_div);
1913         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1914
1915         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1916                 int s = isl_val_sgn(v);
1917                 isl_val *stride = isl_val_zero(ctx);
1918
1919                 isl_val_free(v);
1920                 for (i = 0; i < n_div; ++i) {
1921                         v = isl_constraint_get_coefficient_val(c,
1922                                                                 isl_dim_div, i);
1923                         stride = isl_val_gcd(stride, v);
1924                 }
1925                 if (!isl_val_is_zero(stride) &&
1926                     isl_val_gt(stride, bound->stride))
1927                         extract_stride(c, bound, stride, s);
1928
1929                 isl_val_free(stride);
1930         } else
1931                 isl_val_free(v);
1932
1933         isl_constraint_free(c);
1934         return 0;
1935 }
1936
1937 /* Given contraints on an array index i, check if we can find
1938  * a shift a(p) and a stride g such that
1939  *
1940  *      a(p) + i = 0 mod g
1941  *
1942  * If so, record the information in bound and apply the mapping
1943  * i -> (i + a(p))/g to the array index in bounds and return
1944  * the new constraints.
1945  * If not, simply return the original constraints.
1946  *
1947  * If bounds is a subset of the space
1948  *
1949  *      D -> i
1950  *
1951  * then the bound recorded in bound->shift is of the form
1952  *
1953  *      D -> s(D)
1954  *
1955  * with s(D) equal to a(p) above.
1956  * The mapping recorded in bound->shift_map is of the form
1957  *
1958  *      [D -> i] -> [D -> (i + S(D))/g]
1959  *
1960  * This mapping is computed as follows.
1961  * We first introduce "i" in the domain through precomposition
1962  * with [D -> i] -> D obtaining
1963  *
1964  *      [D -> i] -> s(D)
1965  *
1966  * Adding [D -> i] -> i produces
1967  *
1968  *      [D -> i] -> i + s(D)
1969  *
1970  * and the domain product with [D -> i] -> D yields
1971  *
1972  *      [D -> i] -> [D -> i + s(D)]
1973  *
1974  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1975  */
1976 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1977         __isl_take isl_basic_map *bounds)
1978 {
1979         isl_space *space;
1980         isl_basic_map *hull;
1981         isl_basic_map *shift, *id, *bmap, *scale;
1982         isl_basic_set *bset;
1983         isl_aff *aff;
1984
1985         bound->stride = NULL;
1986
1987         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1988
1989         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1990
1991         isl_basic_map_free(hull);
1992
1993         if (!bound->stride)
1994                 return bounds;
1995
1996         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1997         space = isl_basic_map_get_space(bounds);
1998         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1999         shift = isl_basic_map_apply_range(bmap, shift);
2000         space = isl_basic_map_get_space(bounds);
2001         id = isl_basic_map_range_map(isl_basic_map_universe(space));
2002         shift = isl_basic_map_sum(id, shift);
2003         space = isl_basic_map_get_space(bounds);
2004         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
2005         shift = isl_basic_map_range_product(id, shift);
2006
2007         space = isl_space_domain(isl_basic_map_get_space(bounds));
2008         id = isl_basic_map_identity(isl_space_map_from_set(space));
2009         space = isl_space_range(isl_basic_map_get_space(bounds));
2010         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
2011         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
2012         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
2013         scale = isl_basic_map_from_aff(aff);
2014         scale = isl_basic_map_product(id, scale);
2015
2016         bound->shift_map = isl_basic_map_apply_range(shift, scale);
2017         bmap = isl_basic_map_copy(bound->shift_map);
2018         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
2019         bounds = isl_basic_set_unwrap(bset);
2020
2021         return bounds;
2022 }
2023
2024 /* Data used in compute_array_dim_size and compute_size_in_direction.
2025  *
2026  * pos is the position of the variable representing the array index,
2027  * i.e., the variable for which want to compute the size.  This variable
2028  * is also the last variable in the set.
2029  */
2030 struct gpu_size_info {
2031         isl_basic_set *bset;
2032         struct gpu_array_bound *bound;
2033         int pos;
2034 };
2035
2036 /* Given a constraint from the basic set describing the bounds on
2037  * an array index, check if it is a lower bound, say m i >= b(x), and,
2038  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
2039  * upper bound.  If so, and if this bound is smaller than any bound
2040  * derived from earlier constraints, set the size to this bound on
2041  * the expression and the lower bound to ceil(b(x)/m).
2042  */
2043 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
2044 {
2045         struct gpu_size_info *size = user;
2046         unsigned nparam;
2047         unsigned n_div;
2048         isl_val *v;
2049         isl_aff *aff;
2050         isl_aff *lb;
2051
2052         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2053         n_div = isl_constraint_dim(c, isl_dim_div);
2054
2055         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
2056             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
2057                 isl_constraint_free(c);
2058                 return 0;
2059         }
2060
2061         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2062         aff = isl_aff_ceil(aff);
2063
2064         lb = isl_aff_copy(aff);
2065
2066         aff = isl_aff_neg(aff);
2067         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2068
2069         v = isl_basic_set_max_val(size->bset, aff);
2070         isl_aff_free(aff);
2071
2072         if (isl_val_is_int(v)) {
2073                 v = isl_val_add_ui(v, 1);
2074                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
2075                         isl_val_free(size->bound->size);
2076                         size->bound->size = isl_val_copy(v);
2077                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
2078                         isl_aff_free(size->bound->lb);
2079                         size->bound->lb = isl_aff_copy(lb);
2080                 }
2081         }
2082         isl_val_free(v);
2083         isl_aff_free(lb);
2084
2085         isl_constraint_free(c);
2086
2087         return 0;
2088 }
2089
2090 /* Given a basic map "bounds" that maps parameters and input dimensions
2091  * to a single output dimension, look for an expression in the parameters
2092  * and input dimensions such that the range of the output dimension shifted
2093  * by this expression is a constant.
2094  *
2095  * In particular, we currently only consider lower bounds on the output
2096  * dimension as candidate expressions.
2097  */
2098 static int compute_array_dim_size(struct gpu_array_bound *bound,
2099         __isl_take isl_basic_map *bounds)
2100 {
2101         struct gpu_size_info size;
2102
2103         bounds = isl_basic_map_detect_equalities(bounds);
2104         bounds = check_stride(bound, bounds);
2105
2106         bound->size = NULL;
2107         bound->lb = NULL;
2108
2109         size.bound = bound;
2110         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2111         size.bset = isl_basic_map_wrap(bounds);
2112         size.bset = isl_basic_set_flatten(size.bset);
2113         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2114         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2115                                         &size);
2116         isl_basic_set_free(size.bset);
2117
2118         return bound->size ? 0 : -1;
2119 }
2120
2121 /* Check if we can find a memory tile for the given array
2122  * based on the given accesses, and if so, put the results in "tile".
2123  *
2124  * We project the accesses on each index in turn and look for a parametric
2125  * offset such that the size is constant.
2126  */
2127 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
2128 {
2129         int i;
2130
2131         for (i = 0; i < tile->n; ++i) {
2132                 isl_map *access_i;
2133                 isl_basic_map *hull;
2134
2135                 access_i = isl_map_copy(access);
2136                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2137                 access_i = isl_map_project_out(access_i, isl_dim_out,
2138                                             1, tile->n - (i + 1));
2139                 access_i = isl_map_compute_divs(access_i);
2140                 hull = isl_map_simple_hull(access_i);
2141                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
2142                         return 0;
2143         }
2144
2145         return 1;
2146 }
2147
2148 /* Construct a map with input the shared tile loops and the loops that
2149  * will be wrapped around the threads that relates these later loops
2150  * to the thread indices and then projects them out.
2151  */
2152 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
2153 {
2154         isl_map *priv;
2155         isl_map *tiling;
2156         isl_map *proj;
2157         isl_set *par;
2158         isl_space *dim;
2159
2160         dim = isl_union_map_get_space(gen->shared_sched);
2161
2162         if (gen->options->wrap)
2163                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2164                                 gen->shared_len, gen->n_block, gen->block_dim);
2165         else
2166                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2167                                 gen->shared_len, gen->n_block, gen->block_dim);
2168
2169         priv = tiling;
2170
2171         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2172                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2173                 gen->n_block, "t");
2174
2175         priv = isl_map_align_params(priv, isl_set_get_space(par));
2176         priv = isl_map_intersect_range(priv, par);
2177
2178         dim = isl_map_get_space(priv);
2179         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2180         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2181         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2182                           gen->shared_len);
2183
2184         priv = isl_map_apply_range(priv, proj);
2185
2186         return priv;
2187 }
2188
2189 /* Construct a map from domain_dim to domain_dim that increments
2190  * the dimension at position "pos" and leaves all other dimensions
2191  * constant.
2192  */
2193 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2194 {
2195         int i;
2196         int len = isl_space_dim(domain_dim, isl_dim_set);
2197         isl_space *dim;
2198         isl_basic_map *next;
2199         isl_local_space *ls;
2200
2201         dim = isl_space_map_from_set(domain_dim);
2202         next = isl_basic_map_universe(isl_space_copy(dim));
2203         ls = isl_local_space_from_space(dim);
2204
2205         for (i = 0; i < len; ++i) {
2206                 isl_constraint *c;
2207
2208                 c = isl_equality_alloc(isl_local_space_copy(ls));
2209                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2210                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2211                 if (i == pos)
2212                         c = isl_constraint_set_constant_si(c, 1);
2213                 next = isl_basic_map_add_constraint(next, c);
2214         }
2215
2216         isl_local_space_free(ls);
2217
2218         return isl_map_from_basic_map(next);
2219 }
2220
2221 /* Check if the given access is coalesced.
2222  * That is, check whether incrementing the dimension that will get
2223  * wrapped over the last thread index results in incrementing
2224  * the last array index.
2225  *
2226  * This function is only called for access relations without reuse and
2227  * kernels with at least one block dimension.
2228  */
2229 static int access_is_coalesced(struct gpu_gen *gen,
2230         __isl_keep isl_union_map *access)
2231 {
2232         isl_space *dim;
2233         isl_map *access_map;
2234         isl_map *next_thread_x;
2235         isl_map *next_element;
2236         isl_map *map;
2237         int coalesced;
2238
2239         access = isl_union_map_copy(access);
2240         access = isl_union_map_apply_domain(access,
2241                                 isl_union_map_copy(gen->tiled_sched));
2242         access_map = isl_map_from_union_map(access);
2243
2244         dim = isl_map_get_space(access_map);
2245         dim = isl_space_domain(dim);
2246         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2247
2248         dim = isl_map_get_space(access_map);
2249         dim = isl_space_range(dim);
2250         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
2251
2252         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2253         map = isl_map_apply_range(map, access_map);
2254
2255         coalesced = isl_map_is_subset(map, next_element);
2256
2257         isl_map_free(next_element);
2258         isl_map_free(map);
2259
2260         return coalesced;
2261 }
2262
2263 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2264  * dimensions of the computed schedule, check if it is bijective for
2265  * fixed values of the first gen->shared_len dimensions.
2266  * We perform this check by equating these dimensions to parameters.
2267  */
2268 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2269 {
2270         int res;
2271         isl_set *par;
2272         isl_space *space;
2273
2274         access = isl_map_copy(access);
2275         space = isl_space_params(isl_map_get_space(access));
2276         par = parametrization(space, gen->shared_len + gen->n_block,
2277                                 0, gen->shared_len, "s");
2278         access = isl_map_intersect_domain(access, par);
2279         res = isl_map_is_bijective(access);
2280         isl_map_free(access);
2281
2282         return res;
2283 }
2284
2285 /* Look for the last shared tile loop that affects the offset of "tile"
2286  * and return the result.
2287  * If there is no such loop, then return the index of the loop
2288  * before the first shared tile loop, in particular gen->tile_first - 1.
2289  */
2290 static int compute_tile_last_shared(struct gpu_gen *gen,
2291         struct gpu_array_tile *tile)
2292 {
2293         int i, j;
2294
2295         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2296                 for (i = 0; i < tile->n; ++i) {
2297                         isl_aff *lb;
2298                         isl_aff *shift;
2299
2300                         lb = tile->bound[i].lb;
2301                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2302                                 break;
2303
2304                         shift = tile->bound[i].shift;
2305                         if (!shift)
2306                                 continue;
2307                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2308                                 break;
2309                 }
2310                 if (i < tile->n)
2311                         break;
2312         }
2313
2314         return j;
2315 }
2316
2317 /* Look for the last shared tile loop that affects the offset of the
2318  * shared or private tile and store the result in group->last_shared.
2319  * If there is no such loop, then group->last_shared is set to a value
2320  * before the first shared tile loop, in particular gen->tile_first - 1.
2321  * If there is no tile defined on the array reference group,
2322  * then set group->last_shared to gen->shared_len - 1.
2323  */
2324 static void set_last_shared(struct gpu_gen *gen,
2325         struct gpu_array_ref_group *group)
2326 {
2327         struct gpu_array_tile *tile;
2328
2329         group->last_shared = gen->shared_len - 1;
2330
2331         tile = group->private_tile;
2332         if (!tile)
2333                 tile = group->shared_tile;
2334         if (!tile)
2335                 return;
2336
2337         group->last_shared = compute_tile_last_shared(gen, tile);
2338 }
2339
2340 /* Compute the size of the tile specified by "tile"
2341  * in number of elements and return the result.
2342  */
2343 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2344 {
2345         int i;
2346         isl_val *size;
2347
2348         size = isl_val_one(ctx);
2349
2350         for (i = 0; i < tile->n; ++i)
2351                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2352
2353         return size;
2354 }
2355
2356 /* If max_shared_memory is not set to infinity (-1), then make
2357  * sure that the total amount of shared memory required by the
2358  * array reference groups mapped to shared memory is no larger
2359  * than this maximum.
2360  *
2361  * We apply a greedy approach and discard (keep in global memory)
2362  * those groups that would result in a total memory size that
2363  * is larger than the maximum.
2364  *
2365  * This function should be called after any function that may
2366  * affect the decision on whether to place a reference group
2367  * in private, shared or global memory.
2368  */
2369 static void check_shared_memory_bound(struct gpu_gen *gen)
2370 {
2371         int i, j;
2372         isl_val *left, *size;
2373
2374         if (gen->options->max_shared_memory < 0)
2375                 return;
2376
2377         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2378
2379         for (i = 0; i < gen->prog->n_array; ++i) {
2380                 struct gpu_array_info *array = &gen->prog->array[i];
2381
2382                 for (j = 0; j < array->n_group; ++j) {
2383                         struct gpu_array_ref_group *group;
2384
2385                         group = array->groups[j];
2386                         if (group->private_tile)
2387                                 continue;
2388                         if (!group->shared_tile)
2389                                 continue;
2390
2391                         size = tile_size(gen->ctx, group->shared_tile);
2392                         size = isl_val_mul_ui(size, array->size);
2393
2394                         if (isl_val_le(size, left)) {
2395                                 left = isl_val_sub(left, size);
2396                                 continue;
2397                         }
2398                         isl_val_free(size);
2399
2400                         group->shared_tile = free_tile(group->shared_tile);
2401                 }
2402         }
2403
2404         isl_val_free(left);
2405 }
2406
2407 /* Given a description of an array tile "tile" and the "space"
2408  *
2409  *      { D -> A }
2410  *
2411  * where D represents the first shared_len schedule dimensions
2412  * and A represents the array, construct an isl_multi_aff
2413  *
2414  *      { [D[i] -> A[a]] -> A'[a'] }
2415  *
2416  * with A' a scaled down copy of A according to the shifts and strides
2417  * in "tile".  In particular,
2418  *
2419  *      a' = (a + shift(i))/stride
2420  *
2421  * "insert_array" represents
2422  *
2423  *      { [D -> A] -> D }
2424  *
2425  * and is used to insert A into the domain of functions that only
2426  * reference D.
2427  */
2428 static __isl_give isl_multi_aff *strided_tile(
2429         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2430         __isl_keep isl_multi_aff *insert_array)
2431 {
2432         int i;
2433         isl_ctx *ctx;
2434         isl_multi_aff *shift;
2435         isl_multi_val *stride;
2436         isl_space *space2;
2437         isl_local_space *ls;
2438         isl_multi_aff *tiling;
2439
2440         ctx = isl_space_get_ctx(space);
2441         space2 = isl_space_domain(isl_space_copy(space));
2442         ls = isl_local_space_from_space(space2);
2443         space2 = isl_space_range(isl_space_copy(space));
2444         stride = isl_multi_val_zero(space2);
2445         shift = isl_multi_aff_zero(isl_space_copy(space));
2446
2447         for (i = 0; i < tile->n; ++i) {
2448                 struct gpu_array_bound *bound = &tile->bound[i];
2449                 isl_val *stride_i;
2450                 isl_aff *shift_i;
2451
2452                 if (tile->bound[i].shift) {
2453                         stride_i = isl_val_copy(bound->stride);
2454                         shift_i = isl_aff_copy(bound->shift);
2455                 } else {
2456                         stride_i = isl_val_one(ctx);
2457                         shift_i = isl_aff_zero_on_domain(
2458                                         isl_local_space_copy(ls));
2459                 }
2460
2461                 stride = isl_multi_val_set_val(stride, i, stride_i);
2462                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2463         }
2464         isl_local_space_free(ls);
2465
2466         shift = isl_multi_aff_pullback_multi_aff(shift,
2467                                     isl_multi_aff_copy(insert_array));
2468
2469         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2470         tiling = isl_multi_aff_add(tiling, shift);
2471         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2472
2473         return tiling;
2474 }
2475
2476 /* Compute a tiling for the array reference group "group".
2477  *
2478  * The tiling is of the form
2479  *
2480  *      { [D[i] -> A[a]] -> T[t] }
2481  *
2482  * where D represents the first shared_len schedule dimensions,
2483  * A represents the global array and T represents the shared or
2484  * private memory tile.  The name of T is the name of the local
2485  * array.
2486  *
2487  * If there is any stride in the accesses, then the mapping is
2488  *
2489  *      t = (a + shift(i))/stride - lb(i)
2490  *
2491  * otherwise, it is simply
2492  *
2493  *      t = a - lb(i)
2494  */
2495 static void compute_group_tiling(struct gpu_array_ref_group *group)
2496 {
2497         int i;
2498         struct gpu_array_tile *tile;
2499         struct gpu_array_info *array = group->array;
2500         isl_space *space;
2501         isl_multi_aff *tiling, *lb, *insert_array;
2502         isl_printer *p;
2503         char *local_name;
2504
2505         tile = group->private_tile;
2506         if (!tile)
2507                 tile = group->shared_tile;
2508         if (!tile)
2509                 return;
2510
2511         space = isl_map_get_space(group->access);
2512         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2513
2514         for (i = 0; i < tile->n; ++i)
2515                 if (tile->bound[i].shift)
2516                         break;
2517
2518         if (i < tile->n)
2519                 tiling = strided_tile(tile, space, insert_array);
2520         else
2521                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2522
2523         lb = isl_multi_aff_zero(space);
2524         for (i = 0; i < tile->n; ++i) {
2525                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2526                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2527         }
2528         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2529
2530         tiling = isl_multi_aff_sub(tiling, lb);
2531
2532         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2533         p = print_array_name(p, group);
2534         local_name = isl_printer_get_str(p);
2535         isl_printer_free(p);
2536         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2537         free(local_name);
2538
2539         tile->tiling = tiling;
2540 }
2541
2542 /* Compute a tiling for all the array reference groups.
2543  */
2544 static void compute_group_tilings(struct gpu_gen *gen)
2545 {
2546         int i, j;
2547
2548         for (i = 0; i < gen->prog->n_array; ++i) {
2549                 struct gpu_array_info *array = &gen->prog->array[i];
2550
2551                 for (j = 0; j < array->n_group; ++j)
2552                         compute_group_tiling(array->groups[j]);
2553         }
2554 }
2555
2556 /* Fill up the groups array with singleton groups, i.e., one group
2557  * per reference, initializing the array, access, write, n_ref and refs fields.
2558  * In particular the access field is initialized to the scheduled
2559  * access relation of the array reference.
2560  *
2561  * Return the number of elements initialized, i.e., the number of
2562  * active references in the current kernel.
2563  */
2564 static int populate_array_references(struct gpu_array_info *array,
2565         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2566 {
2567         int i;
2568         int n;
2569         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2570
2571         n = 0;
2572         for (i = 0; i < array->n_ref; ++i) {
2573                 isl_union_map *umap;
2574                 isl_map *map;
2575                 struct gpu_array_ref_group *group;
2576                 struct gpu_stmt_access *access = array->refs[i];
2577
2578                 map = isl_map_copy(access->access);
2579                 umap = isl_union_map_from_map(map);
2580                 umap = isl_union_map_apply_domain(umap,
2581                                 isl_union_map_copy(sched));
2582
2583                 if (isl_union_map_is_empty(umap)) {
2584                         isl_union_map_free(umap);
2585                         continue;
2586                 }
2587
2588                 map = isl_map_from_union_map(umap);
2589                 map = isl_map_detect_equalities(map);
2590
2591                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2592                 assert(group);
2593                 group->array = array;
2594                 group->access = map;
2595                 group->write = access->write;
2596                 group->exact_write = access->exact_write;
2597                 group->slice = access->n_index < array->n_index;
2598                 group->refs = &array->refs[i];
2599                 group->n_ref = 1;
2600
2601                 groups[n++] = group;
2602         }
2603
2604         return n;
2605 }
2606
2607 /* If group->n_ref == 1, then group->refs was set by
2608  * populate_array_references to point directly into
2609  * group->array->refs and should not be freed.
2610  * If group->n_ref > 1, then group->refs was set by join_groups
2611  * to point to a newly allocated array.
2612  */
2613 static void free_array_ref_group(struct gpu_array_ref_group *group)
2614 {
2615         if (!group)
2616                 return;
2617         free_tile(group->shared_tile);
2618         free_tile(group->private_tile);
2619         isl_map_free(group->access);
2620         if (group->n_ref > 1)
2621                 free(group->refs);
2622         free(group);
2623 }
2624
2625 /* Given a map where the input dimensions represent the tile loops,
2626  * eliminate the innermost of those that have a fixed value
2627  * until we reach one that does not (obviously) have a fixed value.
2628  */
2629 static __isl_give isl_map *eliminate_fixed_inner_loops(
2630         __isl_take isl_map *access)
2631 {
2632         int i, n;
2633
2634         n = isl_map_dim(access, isl_dim_in);
2635
2636         for (i = n - 1; i >= 0; --i) {
2637                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2638                         break;
2639                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2640         }
2641         return access;
2642 }
2643
2644 /* Check if the access relations of group1 and group2 overlap within
2645  * the innermost loop.  In particular, ignore any inner dimension
2646  * with a fixed value.
2647  * The copying to and from shared memory will be performed within
2648  * the innermost actual loop so we are only allowed to consider
2649  * the dimensions up to that innermost loop while checking whether
2650  * two access relations overlap.
2651  */
2652 static int accesses_overlap(struct gpu_array_ref_group *group1,
2653         struct gpu_array_ref_group *group2)
2654 {
2655         int empty;
2656         isl_map *access1, *access2;
2657
2658         access1 = isl_map_copy(group1->access);
2659         access1 = eliminate_fixed_inner_loops(access1);
2660         access2 = isl_map_copy(group2->access);
2661         access2 = eliminate_fixed_inner_loops(access2);
2662         access1 = isl_map_intersect(access1, access2);
2663         empty = isl_map_is_empty(access1);
2664         isl_map_free(access1);
2665
2666         return !empty;
2667 }
2668
2669 /* Combine the given two groups into a single group, containing
2670  * the references of both groups.
2671  */
2672 static struct gpu_array_ref_group *join_groups(
2673         struct gpu_array_ref_group *group1,
2674         struct gpu_array_ref_group *group2)
2675 {
2676         int i;
2677         isl_ctx *ctx;
2678         struct gpu_array_ref_group *group;
2679
2680         ctx = isl_map_get_ctx(group1->access);
2681         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2682         assert(group);
2683         group->array = group1->array;
2684         group->access = isl_map_union(isl_map_copy(group1->access),
2685                                         isl_map_copy(group2->access));
2686         group->write = group1->write || group2->write;
2687         group->exact_write = group1->exact_write && group2->exact_write;
2688         group->slice = group1->slice || group2->slice;
2689         group->n_ref = group1->n_ref + group2->n_ref;
2690         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2691                                         group->n_ref);
2692         assert(group->refs);
2693         for (i = 0; i < group1->n_ref; ++i)
2694                 group->refs[i] = group1->refs[i];
2695         for (i = 0; i < group2->n_ref; ++i)
2696                 group->refs[group1->n_ref + i] = group2->refs[i];
2697
2698         return group;
2699 }
2700
/* Merge "group1" and "group2" into a newly created group,
 * free the two original groups and return the merged group.
 */
static struct gpu_array_ref_group *join_groups_and_free(
        struct gpu_array_ref_group *group1,
        struct gpu_array_ref_group *group2)
{
        struct gpu_array_ref_group *merged = join_groups(group1, group2);

        free_array_ref_group(group1);
        free_array_ref_group(group2);

        return merged;
}
2715
2716 /* Report that the array reference group with the given access relation
2717  * is not mapped to shared memory in the given kernel because
2718  * it does not exhibit any reuse and is considered to be coalesced.
2719  */
2720 static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel,
2721         __isl_keep isl_union_map *access)
2722 {
2723         isl_ctx *ctx;
2724         isl_printer *p;
2725
2726         ctx = isl_union_map_get_ctx(access);
2727         p = isl_printer_to_file(ctx, stdout);
2728         p = isl_printer_print_str(p, "Array reference group ");
2729         p = isl_printer_print_union_map(p, access);
2730         p = isl_printer_print_str(p,
2731             " not considered for mapping to shared memory in kernel");
2732         p = isl_printer_print_int(p, kernel->id);
2733         p = isl_printer_print_str(p,
2734             " because it exhibits no reuse and is considered to be coalesced");
2735         p = isl_printer_end_line(p);
2736         isl_printer_free(p);
2737 }
2738
2739 /* Compute the private and/or shared memory tiles for the array
2740  * reference group "group" of array "array".
2741  * Return 0 on success and -1 on error.
2742  *
2743  * If the array is a read-only scalar or if the user requested
2744  * not to use shared or private memory, then we do not need to do anything.
2745  *
2746  * If any reference in the reference group accesses more than one element,
2747  * then we would have to make sure that the layout in shared memory
2748  * is the same as that in global memory.  Since we do not handle this yet
2749  * (and it may not even be possible), we refuse to map to private or
2750  * shared memory in such cases.
2751  *
2752  * If the array group involves any may writes (that are not must writes),
2753  * then we would have to make sure that we load the data into shared/private
2754  * memory first in case the data is not written by the kernel
2755  * (but still written back out to global memory).
2756  * Since we don't have any such mechanism at the moment, we don't
2757  * compute shared/private tiles for groups involving may writes.
2758  *
2759  * We only try to compute a shared memory tile if there is any reuse
2760  * or if the access is not coalesced.
2761  *
2762  * For computing a private memory tile, we also require that there is
2763  * some reuse.  Moreover, we require that the access is private
2764  * to the thread.  That is, we check that any given array element
2765  * is only accessed by a single thread.
2766  * We compute an access relation that maps the shared tile loop iterators
2767  * and the shared point loop iterators that will be wrapped over the
2768  * threads to the array elements.
2769  * We actually check that those iterators that will be wrapped
2770  * partition the array space.  This check is stricter than necessary
2771  * since several iterations may be mapped onto the same thread
2772  * and then they could be allowed to access the same memory elements,
2773  * but our check does not allow this situation.
2774  *
2775  * We also check that the index expression only depends on parallel
2776  * loops.  That way, we can move those loops innermost and unroll them.
2777  * Again, we use a test that is stricter than necessary.
2778  * We actually check whether the index expression only depends
2779  * on the iterators that are wrapped over the threads.
2780  * These are necessarily parallel, but there may be more parallel loops.
2781  *
2782  * Combining the injectivity of the first test with the single-valuedness
2783  * of the second test, we simply test for bijectivity.
2784  *
2785  * If the array is marked force_private, then we bypass all checks
2786  * and assume we can (and should) use registers.
2787  *
2788  * If it turns out we can (or have to) use registers, we compute
2789  * the private memory tile size using can_tile, after introducing a dependence
2790  * on the thread indices.
2791  */
2792 static int compute_group_bounds_core(struct gpu_gen *gen,
2793         struct gpu_array_ref_group *group)
2794 {
2795         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2796         isl_union_map *access;
2797         int n_index = group->array->n_index;
2798         int no_reuse, coalesced;
2799         isl_map *acc;
2800         int force_private = group->array->force_private;
2801         int use_shared = gen->options->use_shared_memory && gen->n_block > 0;
2802         int use_private = force_private || gen->options->use_private_memory;
2803
2804         if (!use_shared && !use_private)
2805                 return 0;
2806         if (gpu_array_is_read_only_scalar(group->array))
2807                 return 0;
2808         if (!force_private && !group->exact_write)
2809                 return 0;
2810         if (group->slice)
2811                 return 0;
2812
2813         access = group_access_relation(group, 1, 1);
2814         no_reuse = isl_union_map_is_injective(access);
2815         if (use_shared && no_reuse)
2816                 coalesced = access_is_coalesced(gen, access);
2817
2818         if (gen->options->debug->verbose && use_shared && no_reuse && coalesced)
2819                 report_no_reuse_and_coalesced(gen->kernel, access);
2820
2821         if (use_shared && (!no_reuse || !coalesced)) {
2822                 group->shared_tile = create_tile(ctx, group->array->n_index);
2823                 if (!can_tile(group->access, group->shared_tile))
2824                         group->shared_tile = free_tile(group->shared_tile);
2825         }
2826
2827         if (!force_private && (!use_private || no_reuse)) {
2828                 isl_union_map_free(access);
2829                 return 0;
2830         }
2831
2832         access = isl_union_map_apply_domain(access,
2833                                         isl_union_map_copy(gen->shared_sched));
2834
2835         acc = isl_map_from_union_map(access);
2836
2837         if (!force_private && !access_is_bijective(gen, acc)) {
2838                 isl_map_free(acc);
2839                 return 0;
2840         }
2841
2842         group->private_tile = create_tile(gen->ctx, n_index);
2843         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2844         if (!can_tile(acc, group->private_tile))
2845                 group->private_tile = free_tile(group->private_tile);
2846
2847         isl_map_free(acc);
2848
2849         if (force_private && !group->private_tile)
2850                 isl_die(ctx, isl_error_internal,
2851                         "unable to map array reference group to registers",
2852                         return -1);
2853
2854         return 0;
2855 }
2856
/* Compute the private and/or shared memory tiles for the array
 * reference group "group" by calling compute_group_bounds_core and,
 * if that succeeds, call set_last_shared on the group.
 * Return 0 on success and -1 on error.
 */
static int compute_group_bounds(struct gpu_gen *gen,
        struct gpu_array_ref_group *group)
{
        int status = compute_group_bounds_core(gen, group);

        if (status < 0)
                return -1;

        set_last_shared(gen, group);
        return 0;
}
2870
2871 /* If two groups have overlapping access relations (as determined by
2872  * the "overlap" function) and if one of them involves a write,
2873  * then merge the two groups into one.
2874  * If "compute_bounds" is set, then call compute_group_bounds
2875  * on the merged groups.
2876  *
2877  * Return the updated number of groups.
2878  * Return -1 on error.
2879  */
2880 static int group_writes(struct gpu_gen *gen,
2881         int n, struct gpu_array_ref_group **groups,
2882         int (*overlap)(struct gpu_array_ref_group *group1,
2883                 struct gpu_array_ref_group *group2), int compute_bounds)
2884 {
2885         int i, j;
2886
2887         for (i = 0; i < n; ++i) {
2888                 for (j = n - 1; j > i; --j) {
2889                         if (!groups[i]->write && !groups[j]->write)
2890                                 continue;
2891
2892                         if (!overlap(groups[i], groups[j]))
2893                                 continue;
2894
2895                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2896                         if (compute_bounds &&
2897                             compute_group_bounds(gen, groups[i]) < 0)
2898                                 return -1;
2899                         if (j != n - 1)
2900                                 groups[j] = groups[n - 1];
2901                         groups[n - 1] = NULL;
2902                         n--;
2903                 }
2904         }
2905
2906         return n;
2907 }
2908
2909 /* If two groups have overlapping access relations (within the innermost
2910  * loop) and if one of them involves a write, then merge the two groups
2911  * into one.
2912  *
2913  * Return the updated number of groups.
2914  */
2915 static int group_overlapping_writes(struct gpu_gen *gen,
2916         int n, struct gpu_array_ref_group **groups)
2917 {
2918         return group_writes(gen, n, groups, &accesses_overlap, 0);
2919 }
2920
2921 /* Check if the access relations of group1 and group2 overlap within
2922  * the outermost min(group1->last_shared, group2->last_shared) loops.
2923  */
2924 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2925         struct gpu_array_ref_group *group2)
2926 {
2927         int last_shared;
2928         int dim;
2929         int empty;
2930         isl_map *map_i, *map_j, *map;
2931
2932         last_shared = group1->last_shared;
2933         if (group2->last_shared < last_shared)
2934                 last_shared = group2->last_shared;
2935         map_i = isl_map_copy(group1->access);
2936         dim = isl_map_dim(map_i, isl_dim_in);
2937         map_i = isl_map_eliminate(map_i, isl_dim_in,
2938                                 last_shared + 1, dim - (last_shared + 1));
2939         map_j = isl_map_copy(group2->access);
2940         map_j = isl_map_eliminate(map_j, isl_dim_in,
2941                                 last_shared + 1, dim - (last_shared + 1));
2942         map = isl_map_intersect(map_i, map_j);
2943         empty = isl_map_is_empty(map);
2944         isl_map_free(map);
2945
2946         return !empty;
2947 }
2948
2949 /* If two groups have overlapping access relations (within the outer
2950  * last_shared loops) and if one of them involves a write,
2951  * then merge the two groups into one.
2952  *
2953  * Return the updated number of groups.
2954  */
2955 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2956         struct gpu_array_ref_group **groups)
2957 {
2958         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2959 }
2960
2961 /* Is the size of the tile specified by "tile" smaller than the sum of
2962  * the sizes of the tiles specified by "tile1" and "tile2"?
2963  */
2964 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2965         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2966 {
2967         int smaller;
2968         isl_val *size, *size1, *size2;
2969
2970         size = tile_size(ctx, tile);
2971         size1 = tile_size(ctx, tile1);
2972         size2 = tile_size(ctx, tile2);
2973
2974         size = isl_val_sub(size, size1);
2975         size = isl_val_sub(size, size2);
2976         smaller = isl_val_is_neg(size);
2977
2978         isl_val_free(size);
2979
2980         return smaller;
2981 }
2982
2983 /* Given an initial grouping of array references and shared memory tiles
2984  * for each group that allows for a shared memory tile, merge two groups
2985  * if both have a shared memory tile, the merged group also has
2986  * a shared memory tile and the size of the tile for the merge group
2987  * is smaller than the sum of the tile sizes of the individual groups.
2988  *
2989  * If merging two groups decreases the "last_shared" dimension of
2990  * one or both of the two groups, then we need to check for overlapping
2991  * writes again.
2992  *
2993  * Return the number of groups after merging.
2994  * Return -1 on error.
2995  */
2996 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2997         struct gpu_array_info *array, int n,
2998         struct gpu_array_ref_group **groups)
2999 {
3000         int i, j;
3001         int recompute_overlap = 0;
3002         isl_ctx *ctx = isl_space_get_ctx(array->space);
3003
3004         for (i = 0; i < n; ++i) {
3005                 if (!groups[i]->shared_tile)
3006                         continue;
3007                 for (j = n - 1; j > i; --j) {
3008                         isl_map *map;
3009                         int empty;
3010                         struct gpu_array_ref_group *group;
3011
3012                         if (!groups[j]->shared_tile)
3013                                 continue;
3014
3015                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
3016                                             isl_map_copy(groups[j]->access));
3017                         empty = isl_map_is_empty(map);
3018                         isl_map_free(map);
3019
3020                         if (empty)
3021                                 continue;
3022
3023                         group = join_groups(groups[i], groups[j]);
3024                         if (compute_group_bounds(gen, group) < 0) {
3025                                 free_array_ref_group(group);
3026                                 return -1;
3027                         }
3028                         if (!group->shared_tile ||
3029                             !smaller_tile(ctx, group->shared_tile,
3030                                         groups[i]->shared_tile,
3031                                         groups[j]->shared_tile)) {
3032                                 free_array_ref_group(group);
3033                                 continue;
3034                         }
3035
3036                         if (group->last_shared < groups[i]->last_shared ||
3037                             group->last_shared < groups[j]->last_shared)
3038                                 recompute_overlap = 1;
3039                         free_array_ref_group(groups[i]);
3040                         free_array_ref_group(groups[j]);
3041                         groups[i] = group;
3042                         if (j != n - 1)
3043                                 groups[j] = groups[n - 1];
3044                         n--;
3045                 }
3046         }
3047
3048         if (recompute_overlap)
3049                 n = group_last_shared_overlapping_writes(gen, n, groups);
3050         return n;
3051 }
3052
3053 /* Set array->n_group and array->groups to n and groups.
3054  *
3055  * Additionally, set the "nr" field of each group
3056  * and the "group" field of each reference in each group.
3057  */
3058 static void set_array_groups(struct gpu_array_info *array,
3059         int n, struct gpu_array_ref_group **groups)
3060 {
3061         int i, j;
3062
3063         array->n_group = n;
3064         array->groups = groups;
3065
3066         for (i = 0; i < n; ++i) {
3067                 groups[i]->nr = i;
3068
3069                 for (j = 0; j < groups[i]->n_ref; ++j)
3070                         groups[i]->refs[j]->group = i;
3071         }
3072 }
3073
3074 /* Group array references that should be considered together when
3075  * deciding whether to access them from private, shared or global memory.
3076  * Return -1 on error.
3077  *
3078  * In particular, if two array references overlap and if one of them
3079  * is a write, then the two references are grouped together.
3080  * We first perform an initial grouping based only on the access relation.
3081  * After computing shared and private memory tiles, we check for
3082  * overlapping writes again, but this time taking into account
3083  * the "last_shared" property.
3084  *
3085  * Furthermore, if two groups admit a shared memory tile and if the
3086  * combination of the two also admits a shared memory tile, we merge
3087  * the two groups.
3088  *
3089  * If the array contains structures, then there is no need to compute
3090  * reference groups since we do not map such arrays to private or shared
3091  * memory.
3092  */
3093 static int group_array_references(struct gpu_gen *gen,
3094         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
3095 {
3096         int i;
3097         int n;
3098         isl_ctx *ctx = isl_union_map_get_ctx(sched);
3099         struct gpu_array_ref_group **groups;
3100
3101         if (array->has_compound_element)
3102                 return 0;
3103
3104         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
3105                                         array->n_ref);
3106         if (!groups)
3107                 return -1;
3108
3109         n = populate_array_references(array, sched, groups);
3110
3111         n = group_overlapping_writes(gen, n, groups);
3112
3113         for (i = 0; i < n; ++i)
3114                 if (compute_group_bounds(gen, groups[i]) < 0)
3115                         n = -1;
3116
3117         n = group_last_shared_overlapping_writes(gen, n, groups);
3118
3119         n = group_common_shared_memory_tile(gen, array, n, groups);
3120
3121         set_array_groups(array, n, groups);
3122
3123         if (n >= 0)
3124                 return 0;
3125
3126         for (i = 0; i < array->n_ref; ++i)
3127                 free_array_ref_group(groups[i]);
3128         return -1;
3129 }
3130
3131 /* Take tiled_sched, project it onto the shared tile loops and
3132  * the loops that will be wrapped over the threads and
3133  * store the result in gen->shared_sched.
3134  * Also compute a projection that projects out the loops that will be
3135  * wrapped over the threads and store this projection in gen->shared_proj.
3136  */
3137 static void compute_shared_sched(struct gpu_gen *gen)
3138 {
3139         isl_space *dim;
3140         isl_map *proj;
3141         isl_set *par;
3142         isl_union_map *sched;
3143
3144         sched = isl_union_map_copy(gen->tiled_sched);
3145
3146         dim = isl_union_map_get_space(sched);
3147         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3148         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3149
3150         dim = isl_union_map_get_space(sched);
3151         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3152
3153         gen->shared_sched = sched;
3154         gen->shared_proj = isl_union_map_from_map(proj);
3155 }
3156
3157 /* For each scalar in the input program, check if there are any
3158  * order dependences active inside the current kernel, within
3159  * the same iteration of the host schedule.
3160  * If so, mark the scalar as force_private so that it will be
3161  * mapped to a register.
3162  */
3163 static void check_scalar_live_ranges(struct gpu_gen *gen)
3164 {
3165         int i;
3166         isl_map *proj;
3167         isl_union_map *sched;
3168         isl_union_set *domain;
3169         isl_union_map *same_host_iteration;
3170
3171         gen->any_force_private = 0;
3172
3173         if (!gen->options->live_range_reordering)
3174                 return;
3175
3176         sched = gen->shared_sched;
3177         sched = isl_union_map_universe(isl_union_map_copy(sched));
3178         domain = isl_union_map_domain(sched);
3179
3180         sched = isl_union_map_copy(gen->sched);
3181         proj = projection(isl_union_map_get_space(sched),
3182                             gen->untiled_len, gen->tile_first);
3183         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3184         same_host_iteration = isl_union_map_apply_range(sched,
3185                             isl_union_map_reverse(isl_union_map_copy(sched)));
3186
3187         for (i = 0; i < gen->prog->n_array; ++i) {
3188                 struct gpu_array_info *array = &gen->prog->array[i];
3189                 isl_union_map *order;
3190
3191                 array->force_private = 0;
3192                 if (array->n_index != 0)
3193                         continue;
3194                 order = isl_union_map_copy(array->dep_order);
3195                 order = isl_union_map_intersect_domain(order,
3196                                                     isl_union_set_copy(domain));
3197                 order = isl_union_map_intersect_range(order,
3198                                                     isl_union_set_copy(domain));
3199                 order = isl_union_map_intersect(order,
3200                                     isl_union_map_copy(same_host_iteration));
3201                 if (!isl_union_map_is_empty(order)) {
3202                         array->force_private = 1;
3203                         gen->any_force_private = 1;
3204                 }
3205                 isl_union_map_free(order);
3206         }
3207
3208         isl_union_map_free(same_host_iteration);
3209         isl_union_set_free(domain);
3210 }
3211
3212 /* Group references of all arrays in the program.
3213  */
3214 static int group_references(struct gpu_gen *gen)
3215 {
3216         int i;
3217         int r = 0;
3218         isl_union_map *sched;
3219
3220         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3221                                           isl_union_map_copy(gen->shared_proj));
3222
3223         for (i = 0; i < gen->prog->n_array; ++i) {
3224                 r = group_array_references(gen, &gen->prog->array[i], sched);
3225                 if (r < 0)
3226                         break;
3227         }
3228
3229         isl_union_map_free(sched);
3230
3231         return r;
3232 }
3233
3234 /* Free all array information that is local to the current kernel.
3235  */
3236 static void free_local_array_info(struct gpu_gen *gen)
3237 {
3238         int i, j;
3239
3240         for (i = 0; i < gen->prog->n_array; ++i) {
3241                 struct gpu_array_info *array = &gen->prog->array[i];
3242
3243                 for (j = 0; j < array->n_group; ++j)
3244                         free_array_ref_group(array->groups[j]);
3245                 free(array->groups);
3246         }
3247 }
3248
3249 /* Compute the size of a bounding box around the origin and "set",
3250  * where "set" is assumed to contain only non-negative elements.
3251  * In particular, compute the maximal value of "set" in each direction
3252  * and add one.
3253  */
3254 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
3255         __isl_keep isl_set *context)
3256 {
3257         int i, n;
3258         isl_multi_pw_aff *mpa;
3259
3260         n = isl_set_dim(set, isl_dim_set);
3261         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
3262         for (i = 0; i < n; ++i) {
3263                 isl_space *space;
3264                 isl_aff *one;
3265                 isl_pw_aff *bound;
3266
3267                 bound = isl_set_dim_max(isl_set_copy(set), i);
3268                 bound = isl_pw_aff_coalesce(bound);
3269                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3270
3271                 space = isl_pw_aff_get_domain_space(bound);
3272                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3273                 one = isl_aff_add_constant_si(one, 1);
3274                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3275                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
3276         }
3277         isl_set_free(set);
3278
3279         return mpa;
3280 }
3281
3282 /* Compute the effective grid size as a list of the sizes in each dimension.
3283  *
3284  * The grid size specified by the user or set by default
3285  * in read_grid_sizes() and applied in tile_schedule(),
3286  * may be too large for the given code in the sense that
3287  * it may contain blocks that don't need to execute anything.
3288  * We therefore don't return this grid size, but instead the
3289  * smallest grid size that ensures that all blocks that actually
3290  * execute code are included in the grid.
3291  *
3292  * We first extract a description of the grid, i.e., the possible values
3293  * of the block ids, from gen->tiled_sched.
3294  * The block ids are parameters in gen->tiled_sched.
3295  * We simply need to change them into set dimensions.
3296  *
3297  * Then, for each block dimension, we compute the maximal value of the block id
3298  * and add one.
3299  */
3300 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
3301         struct ppcg_kernel *kernel)
3302 {
3303         int i;
3304         isl_set *grid;
3305
3306         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3307         grid = isl_set_from_params(grid);
3308         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3309         for (i = 0; i < gen->n_grid; ++i) {
3310                 int pos;
3311                 char name[20];
3312
3313                 snprintf(name, sizeof(name), "b%d", i);
3314                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3315                 assert(pos >= 0);
3316                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3317                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3318         }
3319
3320         return extract_size(grid, kernel->context);
3321 }
3322
3323 /* Compute the size of a fixed bounding box around the origin and "set",
3324  * where "set" is assumed to contain only non-negative elements,
3325  * and store the results in "size".
3326  * In particular, compute the maximal value of "set" in each direction
3327  * and add one.
3328  */
3329 static void extract_fixed_size(__isl_take isl_set *set, int *size)
3330 {
3331         int i, n;
3332         isl_local_space *ls;
3333         isl_aff *obj;
3334
3335         n = isl_set_dim(set, isl_dim_set);
3336         ls = isl_local_space_from_space(isl_set_get_space(set));
3337         obj = isl_aff_zero_on_domain(ls);
3338         for (i = 0; i < n; ++i) {
3339                 isl_val *max;
3340
3341                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
3342                 max = isl_set_max_val(set, obj);
3343                 size[i] = isl_val_get_num_si(max) + 1;
3344                 isl_val_free(max);
3345                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
3346         }
3347         isl_aff_free(obj);
3348         isl_set_free(set);
3349 }
3350
3351 /* Compute the effective block size as a list of the sizes in each dimension
3352  * and store the sizes in kernel->block_dim.
3353  *
3354  * The block size specified by the user or set by default
3355  * in read_block_sizes() and applied in thread_tile_schedule(),
3356  * may be too large for the given code in the sense that
3357  * it may contain threads that don't need to execute anything.
3358  * We therefore don't store this block size in kernel->block_dim,
3359  * but instead the smallest block size that ensures that all threads
3360  * that actually execute code are included in the block.
3361  *
3362  * The current implementation eliminates all parameters, ensuring
3363  * that the size is a fixed constant in each dimension.
3364  * In principle we could also compute parametric sizes.
3365  * We would have to make sure to project out all b%d and t%d parameters,
3366  * however.
3367  */
3368 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3369 {
3370         int i;
3371         int nparam;
3372         isl_set *block;
3373         isl_multi_pw_aff *mpa;
3374
3375         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3376         block = isl_set_from_params(block);
3377         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3378         kernel->n_block = gen->n_block;
3379         for (i = 0; i < gen->n_block; ++i) {
3380                 int pos;
3381                 char name[20];
3382
3383                 snprintf(name, sizeof(name), "t%d", i);
3384                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3385                 assert(pos >= 0);
3386                 block = isl_set_equate(block, isl_dim_param, pos,
3387                                         isl_dim_set, i);
3388         }
3389         nparam = isl_set_dim(block, isl_dim_param);
3390         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3391
3392         extract_fixed_size(block, kernel->block_dim);
3393 }
3394
3395 void ppcg_kernel_free(void *user)
3396 {
3397         struct ppcg_kernel *kernel = user;
3398         int i;
3399
3400         if (!kernel)
3401                 return;
3402
3403         isl_multi_pw_aff_free(kernel->grid_size);
3404         isl_set_free(kernel->context);
3405         isl_union_set_free(kernel->arrays);
3406         isl_space_free(kernel->space);
3407         isl_ast_node_free(kernel->tree);
3408
3409         for (i = 0; i < kernel->n_array; ++i)
3410                 isl_pw_aff_list_free(kernel->array[i].bound);
3411         free(kernel->array);
3412
3413         for (i = 0; i < kernel->n_var; ++i) {
3414                 free(kernel->var[i].name);
3415                 isl_vec_free(kernel->var[i].size);
3416         }
3417         free(kernel->var);
3418
3419         free(kernel);
3420 }
3421
3422 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3423         struct ppcg_kernel_var *var)
3424 {
3425         int j;
3426         struct gpu_array_tile *tile;
3427         isl_printer *p;
3428         char *name;
3429
3430         var->array = group->array;
3431
3432         tile = group->private_tile;
3433         var->type = ppcg_access_private;
3434         if (!tile) {
3435                 tile = group->shared_tile;
3436                 var->type = ppcg_access_shared;
3437         }
3438
3439         p = isl_printer_to_str(ctx);
3440         p = print_array_name(p, group);
3441         var->name = isl_printer_get_str(p);
3442         isl_printer_free(p);
3443
3444         var->size = isl_vec_alloc(ctx, group->array->n_index);
3445
3446         for (j = 0; j < group->array->n_index; ++j)
3447                 var->size = isl_vec_set_element_val(var->size, j,
3448                                             isl_val_copy(tile->bound[j].size));
3449 }
3450
3451 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3452 {
3453         int i, j, n;
3454
3455         n = 0;
3456         for (i = 0; i < gen->prog->n_array; ++i) {
3457                 struct gpu_array_info *array = &gen->prog->array[i];
3458
3459                 for (j = 0; j < array->n_group; ++j) {
3460                         struct gpu_array_ref_group *group = array->groups[j];
3461                         if (group->private_tile || group->shared_tile)
3462                                 ++n;
3463                 }
3464         }
3465
3466         kernel->n_var = n;
3467         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3468         assert(kernel->var);
3469
3470         n = 0;
3471         for (i = 0; i < gen->prog->n_array; ++i) {
3472                 struct gpu_array_info *array = &gen->prog->array[i];
3473
3474                 for (j = 0; j < array->n_group; ++j) {
3475                         struct gpu_array_ref_group *group = array->groups[j];
3476                         if (!group->private_tile && !group->shared_tile)
3477                                 continue;
3478                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3479                         ++n;
3480                 }
3481         }
3482 }
3483
3484 /* The sizes of the arrays on the host that have been computed by
3485  * extract_array_info may depend on the parameters.  Use the extra
3486  * constraints on the parameters that are valid at "host_domain"
3487  * to simplify these expressions and store the results in kernel->array.
3488  *
3489  * We only need these localized bounds for arrays that are accessed
3490  * by the current kernel.  If we have found at least one reference group
3491  * then the array is accessed by the kernel.  If the array has compound
3492  * elements then we skipped the construction of array reference groups.
3493  */
3494 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3495         __isl_keep isl_set *host_domain)
3496 {
3497         int i, j;
3498         isl_set *context;
3499
3500         kernel->array = isl_calloc_array(gen->ctx,
3501                             struct gpu_local_array_info, gen->prog->n_array);
3502         assert(kernel->array);
3503         kernel->n_array = gen->prog->n_array;
3504
3505         context = isl_set_copy(host_domain);
3506         context = isl_set_params(context);
3507
3508         for (i = 0; i < gen->prog->n_array; ++i) {
3509                 struct gpu_array_info *array = &gen->prog->array[i];
3510                 isl_pw_aff_list *local;
3511
3512                 if (array->n_group == 0 && !array->has_compound_element)
3513                         continue;
3514
3515                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3516
3517                 for (j = 0; j < array->n_index; ++j) {
3518                         isl_pw_aff *pwaff;
3519
3520                         pwaff = isl_pw_aff_copy(array->bound[j]);
3521                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3522                         local = isl_pw_aff_list_add(local, pwaff);
3523                 }
3524
3525                 kernel->array[i].n_index = array->n_index;
3526                 kernel->array[i].bound = local;
3527         }
3528         isl_set_free(context);
3529 }
3530
3531 /* Find the element in gen->stmt that has the given "id".
3532  * Return NULL if no such gpu_stmt can be found.
3533  */
3534 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3535 {
3536         int i;
3537
3538         for (i = 0; i < prog->n_stmts; ++i) {
3539                 if (id == prog->stmts[i].id)
3540                         break;
3541         }
3542
3543         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3544 }
3545
3546 /* Set gen->tile_len and gen->n_parallel to those of the statement
3547  * affected by the first map (part of the schedule)
3548  * on which this function is called.
3549  * Because of the way the schedule is constructed, the other statements
3550  * in the list, if any, should have the same values for these properties.
3551  */
3552 static int extract_tile_len(__isl_take isl_map *map, void *user)
3553 {
3554         struct gpu_gen *gen = (struct gpu_gen *) user;
3555         isl_id *id;
3556         struct gpu_stmt *stmt;
3557
3558         id = isl_map_get_tuple_id(map, isl_dim_in);
3559         stmt = find_stmt(gen->prog, id);
3560         isl_id_free(id);
3561
3562         isl_map_free(map);
3563
3564         if (!stmt)
3565                 isl_die(gen->ctx, isl_error_unknown,
3566                         "statement not found", return -1);
3567
3568         gen->tile_len = stmt->tile_len;
3569         gen->n_parallel = stmt->n_parallel;
3570
3571         return -1;
3572 }
3573
3574 void ppcg_kernel_stmt_free(void *user)
3575 {
3576         int i;
3577         struct ppcg_kernel_stmt *stmt = user;
3578
3579         if (!stmt)
3580                 return;
3581
3582         switch (stmt->type) {
3583         case ppcg_kernel_copy:
3584                 isl_ast_expr_free(stmt->u.c.index);
3585                 isl_ast_expr_free(stmt->u.c.local_index);
3586                 break;
3587         case ppcg_kernel_domain:
3588                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3589                 break;
3590         case ppcg_kernel_sync:
3591                 break;
3592         }
3593
3594         free(stmt);
3595 }
3596
3597 /* Set the options of "context" to
3598  *
3599  *      { space -> [x] : x >= first }
3600  */
3601 static __isl_give isl_ast_build *set_unroll(
3602         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3603         int first)
3604 {
3605         isl_ctx *ctx;
3606         isl_map *unroll;
3607         isl_union_map *opt;
3608
3609         ctx = isl_ast_build_get_ctx(build);
3610
3611         space = isl_space_from_domain(space);
3612         space = isl_space_add_dims(space, isl_dim_out, 1);
3613         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3614         unroll = isl_map_universe(space);
3615         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3616         opt = isl_union_map_from_map(unroll);
3617
3618         build = isl_ast_build_set_options(build, opt);
3619
3620         return build;
3621 }
3622
3623 /* Return a list of isl_ids of the form "prefix%d".
3624  */
3625 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3626         int n, const char *prefix)
3627 {
3628         int i;
3629         char name[10];
3630         isl_id_list *names;
3631
3632         names = isl_id_list_alloc(ctx, n);
3633         for (i = 0; i < n; ++i) {
3634                 isl_id *id;
3635
3636                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3637                 id = isl_id_alloc(ctx, name, NULL);
3638                 names = isl_id_list_add(names, id);
3639         }
3640
3641         return names;
3642 }
3643
3644 /* Extend the schedule "schedule" with the part of "extension"
3645  * starting at "first" up to "len".
3646  */
3647 static __isl_give isl_union_map *extend_schedule(
3648         __isl_take isl_union_map *schedule,
3649         __isl_take isl_union_map *extension, int first, int len)
3650 {
3651         isl_space *space;
3652         isl_map *proj;
3653         isl_union_map *umap;
3654         isl_set *set;
3655
3656         space = isl_union_map_get_space(schedule);
3657         space = isl_space_set_from_params(space);
3658         space = isl_space_add_dims(space, isl_dim_set, len);
3659         proj = isl_set_identity(isl_set_universe(space));
3660         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3661         extension = isl_union_map_apply_range(extension,
3662                                                 isl_union_map_from_map(proj));
3663
3664         schedule = isl_union_map_range_product(schedule, extension);
3665
3666         return schedule;
3667 }
3668
3669 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3670  * to "ref_id".
3671  */
3672 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3673         __isl_keep isl_id *ref_id)
3674 {
3675         struct gpu_stmt_access *access;
3676
3677         for (access = accesses; access; access = access->next)
3678                 if (access->ref_id == ref_id)
3679                         return access;
3680
3681         return NULL;
3682 }
3683
3684 /* Return the index of the array called "name" in the list of arrays.
3685  */
3686 static int find_array_index(struct gpu_gen *gen, const char *name)
3687 {
3688         int i;
3689
3690         for (i = 0; i < gen->prog->n_array; ++i)
3691                 if (!strcmp(name, gen->prog->array[i].name))
3692                         return i;
3693
3694         return -1;
3695 }
3696
3697 /* Internal data structure for the index and AST expression transformation
3698  * callbacks for pet_stmt_build_ast_exprs.
3699  *
3700  * "accesses" is the list of gpu_stmt_access in the statement.
3701  * "iterator_map" expresses the statement iterators in terms of
3702  * the AST loop iterators.
3703  * "sched2shared" expresses the first shared_len dimensions of
3704  * the computed schedule in terms of the AST loop iterators.
3705  *
3706  * The following fields are set in transform_index and used in transform_expr.
3707  * "array" is the array that is being accessed.
3708  * "global" is set if the global array is accessed (rather than
3709  * shared/private memory).
3710  * "local_array" refers to information on the array specialized
3711  * to the current kernel.
3712  */
3713 struct ppcg_transform_data {
3714         struct gpu_gen *gen;
3715         struct gpu_stmt_access *accesses;
3716         isl_pw_multi_aff *iterator_map;
3717         isl_pw_multi_aff *sched2shared;
3718
3719         struct gpu_array_info *array;
3720         int global;
3721         struct gpu_local_array_info *local_array;
3722 };
3723
3724 /* Return the name of the outer array (of structs) accessed by "access".
3725  */
3726 static const char *get_outer_array_name(__isl_keep isl_map *access)
3727 {
3728         isl_space *space;
3729         const char *name;
3730
3731         space = isl_space_range(isl_map_get_space(access));
3732         while (space && isl_space_is_wrapping(space))
3733                 space = isl_space_domain(isl_space_unwrap(space));
3734         name = isl_space_get_tuple_name(space, isl_dim_set);
3735         isl_space_free(space);
3736
3737         return name;
3738 }
3739
3740 /* Index transformation callback for pet_stmt_build_ast_exprs.
3741  *
3742  * "index" expresses the array indices in terms of statement iterators
3743  *
3744  * We first reformulate "index" in terms of the AST loop iterators.
3745  * Then we check if we are accessing the global array or
3746  * a shared/private copy.  In the former case, we simply return
3747  * the updated index.  If "index" is an affine expression rather
3748  * than an array access, then we also return the updated index here.
3749  *
3750  * If no reference groups have been computed for the array,
3751  * then we can only be accessing the global array.
3752  *
3753  * Otherwise, we apply the tiling to the index.
3754  * This tiling is of the form
3755  *
3756  *      [D -> A] -> T
3757  *
3758  * The index is of the form
3759  *
3760  *      L -> A
3761  *
3762  * We update the tiling to refer to the AST loop iteratos
3763  *
3764  *      [L -> A] -> T
3765  *
3766  * and modify index to keep track of those iterators
3767  *
3768  *      L -> [L -> A]
3769  *
3770  * Combining these two yields a tiled index expression in terms
3771  * of the AST loop iterators
3772  *
3773  *      L -> T
3774  */
3775 static __isl_give isl_multi_pw_aff *transform_index(
3776         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3777         void *user)
3778 {
3779         struct ppcg_transform_data *data = user;
3780         struct gpu_stmt_access *access;
3781         struct gpu_array_ref_group *group;
3782         struct gpu_array_tile *tile;
3783         isl_pw_multi_aff *iterator_map;
3784         int i;
3785         const char *name;
3786         isl_space *space;
3787         isl_multi_pw_aff *tiling;
3788         isl_pw_multi_aff *pma;
3789         isl_multi_pw_aff *mpa;
3790
3791         data->array = NULL;
3792
3793         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3794         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3795
3796         access = find_access(data->accesses, ref_id);
3797         if (!access)
3798                 return index;
3799         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3800                 return index;
3801
3802         name = get_outer_array_name(access->access);
3803         i = find_array_index(data->gen, name);
3804         if (i < 0)
3805                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3806                         "cannot find array",
3807                         return isl_multi_pw_aff_free(index));
3808         data->array = &data->gen->prog->array[i];
3809         data->local_array = &data->gen->kernel->array[i];
3810
3811         if (access->group < 0) {
3812                 data->global = 1;
3813                 return index;
3814         }
3815
3816         group = data->array->groups[access->group];
3817         tile = group->private_tile;
3818         if (!tile)
3819                 tile = group->shared_tile;
3820         data->global = !tile;
3821         if (!tile)
3822                 return index;
3823
3824         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3825         space = isl_space_map_from_set(space);
3826         pma = isl_pw_multi_aff_identity(space);
3827         pma = isl_pw_multi_aff_product(
3828                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3829         tiling = isl_multi_pw_aff_from_multi_aff(
3830                                     isl_multi_aff_copy(tile->tiling));
3831         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3832
3833         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3834         space = isl_space_map_from_set(space);
3835         mpa = isl_multi_pw_aff_identity(space);
3836         index = isl_multi_pw_aff_range_product(mpa, index);
3837         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3838
3839         return index;
3840 }
3841
3842 /* Dereference "expr" by adding an index [0].
3843  * The original "expr" is assumed not to have any indices.
3844  *
3845  * If "expr" is a member access, then the dereferencing needs
3846  * to be applied to the structure argument of this member access.
3847  */
3848 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3849 {
3850         isl_ctx *ctx;
3851         isl_ast_expr *res;
3852         isl_ast_expr_list *list;
3853
3854         if (isl_ast_expr_get_op_type(expr) == isl_ast_op_member) {
3855                 isl_ast_expr *arg;
3856
3857                 arg = isl_ast_expr_get_op_arg(expr, 0);
3858                 arg = dereference(arg);
3859                 expr = isl_ast_expr_set_op_arg(expr, 0, arg);
3860
3861                 return expr;
3862         }
3863
3864         ctx = isl_ast_expr_get_ctx(expr);
3865         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3866         list = isl_ast_expr_list_from_ast_expr(res);
3867         res = isl_ast_expr_get_op_arg(expr, 0);
3868         res = isl_ast_expr_access(res, list);
3869         isl_ast_expr_free(expr);
3870
3871         return res;
3872 }
3873
3874 /* Linearize the index expression "expr" based on the array bounds
3875  * of "array".
3876  *
3877  * That is, transform expression
3878  *
3879  *      A[i_0][i_1]...[i_n]
3880  *
3881  * to
3882  *
3883  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3884  *
3885  * where b_0, b_1, ..., b_n are the bounds on the array.
3886  *
3887  * If the base of "expr" is a member access, then the linearization needs
3888  * to be applied to the structure argument of this member access.
3889  *
3890  * In the base case, if "expr" has no arguments (other than the name of
3891  * the array), then we are passing an entire array to a function.
3892  * In this case, there is nothing to linearize.
3893  * Note that at this point an expression with no arguments can
3894  * only be an entire array because the scalar case and
3895  * the case of single struct are handled by the caller.
3896  *
3897  * If the number of specified index expressions in "expr"
3898  * is smaller than the dimension of the accessed array,
3899  * then the missing i_j also do not appear in the linearized expression.
3900  * Furthermore, since such an expression does not refer to a single
3901  * element while the default linearized expression would refer to
3902  * a single element, we return the expression
3903  *
3904  *      A + (..((i_0 * b_1 + i_1) ... ) * b_n]
3905  *
3906  * instead.  Note that because of the special case handling above,
3907  * we can assume here that here that there is at least one index expression.
3908  */
3909 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3910         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3911 {
3912         int i, n;
3913         isl_ctx *ctx;
3914         isl_set *context;
3915         isl_ast_expr *arg0;
3916         isl_ast_expr *res;
3917         isl_ast_expr_list *list;
3918         isl_ast_build *build;
3919
3920         arg0 = isl_ast_expr_get_op_arg(expr, 0);
3921         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
3922             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
3923                 isl_ast_expr *arg;
3924
3925                 arg = isl_ast_expr_get_op_arg(arg0, 0);
3926                 arg = gpu_local_array_info_linearize_index(array, arg);
3927                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
3928                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
3929
3930                 return expr;
3931         }
3932         isl_ast_expr_free(arg0);
3933
3934         if (isl_ast_expr_get_op_n_arg(expr) == 1)
3935                 return expr;
3936
3937         ctx = isl_ast_expr_get_ctx(expr);
3938         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3939         build = isl_ast_build_from_context(context);
3940
3941         n = isl_ast_expr_get_op_n_arg(expr);
3942         res = isl_ast_expr_get_op_arg(expr, 1);
3943         for (i = 1; i < array->n_index; ++i) {
3944                 isl_pw_aff *bound_i;
3945                 isl_ast_expr *expr_i;
3946
3947                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i);
3948                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3949                 res = isl_ast_expr_mul(res, expr_i);
3950
3951                 if (i + 1 >= n)
3952                         continue;
3953                 expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
3954                 res = isl_ast_expr_add(res, expr_i);
3955         }
3956
3957         isl_ast_build_free(build);
3958
3959         if (1 + array->n_index > n) {
3960                 res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res);
3961         } else {
3962                 list = isl_ast_expr_list_from_ast_expr(res);
3963                 res = isl_ast_expr_get_op_arg(expr, 0);
3964                 res = isl_ast_expr_access(res, list);
3965         }
3966
3967         isl_ast_expr_free(expr);
3968
3969         return res;
3970 }
3971
3972 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3973  *
3974  * If the AST expression refers to a global scalar that is not
3975  * a read-only scalar, then its address was passed to the kernel and
3976  * we need to dereference it.
3977  *
3978  * If the AST expression refers to an access to a global array,
3979  * then we linearize the access exploiting the bounds in data->local_array.
3980  */
3981 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3982         __isl_keep isl_id *id, void *user)
3983 {
3984         struct ppcg_transform_data *data = user;
3985
3986         if (!data->array)
3987                 return expr;
3988         if (gpu_array_is_read_only_scalar(data->array))
3989                 return expr;
3990         if (!data->global)
3991                 return expr;
3992         if (data->array->n_index == 0)
3993                 return dereference(expr);
3994         if (!data->array->linearize)
3995                 return expr;
3996
3997         return gpu_local_array_info_linearize_index(data->local_array, expr);
3998 }
3999
4000 /* This function is called for each instance of a user statement
4001  * in the kernel.
4002  *
4003  * We attach a struct ppcg_kernel_stmt to the "node", containing
4004  * a computed AST expression for each access.
4005  * These AST expressions are computed from iterator_map,
4006  * which expresses the domain
4007  * elements in terms of the generated loops, and sched2shared,
4008  * which expresses the first shared_len dimensions of the schedule
4009  * computed by PPCG in terms of the generated loops.
4010  */
4011 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
4012         __isl_keep isl_ast_build *build, void *user)
4013 {
4014         struct ppcg_transform_data data;
4015         struct gpu_gen *gen = (struct gpu_gen *) user;
4016         struct ppcg_kernel_stmt *stmt;
4017         isl_id *id;
4018         isl_pw_multi_aff *sched2shared;
4019         isl_map *map;
4020         isl_pw_multi_aff *iterator_map;
4021         isl_ast_expr *expr, *arg;
4022         isl_union_map *schedule;
4023         int i, n;
4024
4025         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4026         if (!stmt)
4027                 return isl_ast_node_free(node);
4028
4029         expr = isl_ast_node_user_get_expr(node);
4030         arg = isl_ast_expr_get_op_arg(expr, 0);
4031         id = isl_ast_expr_get_id(arg);
4032
4033         schedule = isl_ast_build_get_schedule(build);
4034         map = isl_map_reverse(isl_map_from_union_map(schedule));
4035         iterator_map = isl_pw_multi_aff_from_map(map);
4036         sched2shared = compute_sched_to_shared(gen,
4037                                         isl_pw_multi_aff_copy(iterator_map));
4038
4039         stmt->type = ppcg_kernel_domain;
4040         stmt->u.d.stmt = find_stmt(gen->prog, id);
4041         if (!stmt->u.d.stmt)
4042                 goto error;
4043
4044         data.gen = gen;
4045         data.accesses = stmt->u.d.stmt->accesses;
4046         data.iterator_map = iterator_map;
4047         data.sched2shared = sched2shared;
4048         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
4049                                             build, &transform_index, &data,
4050                                             &transform_expr, &data);
4051
4052         isl_id_free(id);
4053         isl_pw_multi_aff_free(iterator_map);
4054         isl_pw_multi_aff_free(sched2shared);
4055         isl_ast_expr_free(arg);
4056         isl_ast_expr_free(expr);
4057
4058         id = isl_id_alloc(gen->ctx, NULL, stmt);
4059         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4060         return isl_ast_node_set_annotation(node, id);
4061 error:
4062         isl_id_free(id);
4063         isl_pw_multi_aff_free(iterator_map);
4064         ppcg_kernel_stmt_free(stmt);
4065         isl_pw_multi_aff_free(sched2shared);
4066         return isl_ast_node_free(node);
4067 }
4068
4069 /* This function is called when code has been generated for the shared
4070  * tile loops.  The "schedule" refers only to the original statements.
4071  *
4072  * We extend the schedule with that part of gen->local_sched that hasn't
4073  * been taken into account yet.  This introduces parameters referring
4074  * to thread ids in the schedule, so we add them (with the appropriate
4075  * bounds) to the context as well.
4076  * Finally, we set the appropriate unrolling options
4077  * if gen->first_unroll is set.
4078  */
4079 static __isl_give isl_ast_node *create_domain_leaf(
4080         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
4081         void *user)
4082 {
4083         struct gpu_gen *gen = (struct gpu_gen *) user;
4084         isl_space *space;
4085         isl_union_map *sched;
4086         isl_ast_node *tree;
4087         isl_set *set;
4088         isl_id_list *iterators;
4089         int n;
4090
4091         schedule = extend_schedule(schedule,
4092                         isl_union_map_copy(gen->local_sched),
4093                         gen->shared_len, gen->thread_tiled_len);
4094
4095         space = isl_ast_build_get_schedule_space(build);
4096         set = isl_set_universe(space);
4097         set = add_bounded_parameters(set, gen->kernel->n_block,
4098                                         gen->kernel->block_dim, "t");
4099         build = isl_ast_build_restrict(build, set);
4100
4101         n = gen->thread_tiled_len - gen->shared_len;
4102
4103         if (gen->first_unroll >= 0) {
4104                 space = isl_space_set_alloc(gen->ctx, 0, n);
4105                 build = set_unroll(build, space, gen->first_unroll);
4106         }
4107         iterators = generate_names(gen->ctx, n, "c");
4108         build = isl_ast_build_set_iterators(build, iterators);
4109         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
4110         tree = isl_ast_build_ast_from_schedule(build, schedule);
4111         isl_ast_build_free(build);
4112
4113         return tree;
4114 }
4115
4116 /* This function is called for each statement node in the AST of the code
4117  * for copying to or from shared/private memory.
4118  * Attach a pointer to a ppcg_kernel_stmt representing the copy
4119  * statement to the node.
4120  * The statement name is "read" or "write", depending on whether we are
4121  * reading from global memory or writing to global memory.
4122  * The name of the T space is {shared,private}_<array>.
4123  *
4124  * The schedule is of the form
4125  *
4126  *      type[A -> T] -> L
4127  *
4128  * where A refers to a piece of an array and T to the corresponding
4129  * shifted tile.  We split this schedule into mappings L -> A and L -> T
4130  * and store the corresponding expressions in stmt->index and stmt->local_index,
4131  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
4132  */
4133 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
4134         __isl_keep isl_ast_build *build, void *user)
4135 {
4136         struct gpu_gen *gen = (struct gpu_gen *) user;
4137         struct ppcg_kernel_stmt *stmt;
4138         isl_id *id;
4139         isl_ast_expr *expr;
4140         isl_space *space;
4141         isl_map *access, *local_access, *map;
4142         isl_pw_multi_aff *pma;
4143         const char *type;
4144         int array_index;
4145
4146         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4147         if (!stmt)
4148                 return isl_ast_node_free(node);
4149
4150         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
4151         type = isl_map_get_tuple_name(access, isl_dim_in);
4152         stmt->u.c.read = !strcmp(type, "read");
4153         access = isl_map_reverse(access);
4154         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
4155         local_access = isl_map_copy(access);
4156
4157         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
4158         id = isl_map_get_tuple_id(access, isl_dim_out);
4159         map = isl_map_set_tuple_id(map, isl_dim_in, id);
4160         access = isl_map_apply_range(access, map);
4161         pma = isl_pw_multi_aff_from_map(access);
4162         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
4163         stmt->u.c.index = expr;
4164
4165         map = isl_map_range_map(isl_map_universe(space));
4166         id = isl_map_get_tuple_id(local_access, isl_dim_out);
4167         map = isl_map_set_tuple_id(map, isl_dim_in, id);
4168         local_access = isl_map_apply_range(local_access, map);
4169         pma = isl_pw_multi_aff_from_map(local_access);
4170         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
4171         stmt->u.c.local_index = expr;
4172
4173         stmt->u.c.array = gen->copy_group->array;
4174         array_index = stmt->u.c.array - gen->prog->array;
4175         stmt->u.c.local_array = &gen->kernel->array[array_index];
4176         stmt->type = ppcg_kernel_copy;
4177
4178         id = isl_id_alloc(gen->ctx, NULL, stmt);
4179         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4180         return isl_ast_node_set_annotation(node, id);
4181 }
4182
4183 /* Given a schedule of the form
4184  *
4185  *      [S -> A] -> L
4186  *
4187  * (with S the first shared_len dimensions of the computed schedule,
4188  * A the array and L the schedule corresponding to the generated loops),
4189  * indicating where to copy the array elements that need to be copied,
4190  * construct code for performing the copying.
4191  *
4192  * "group" is the array reference group that is being copied
4193  * "type" is either "read" or "write"
4194  * private is set if copying needs to be performed to/from registers
4195  *
4196  * We first construct a mapping to a shifted tile of the array,
4197  *
4198  *      [S -> A] -> T(S,A)                                      (1)
4199  *
4200  * If private is set, then we also use this mapping as a schedule
4201  * (which is already thread-specific and will be completely unrolled).
4202  * Otherwise, we wrap/tile the range over the threads.
4203  * The result is
4204  *
4205  *      [S -> A] -> T'(S,A)
4206  *
4207  * Combined with the given schedule, we have
4208  *
4209  *      [S -> A] -> [L -> T'(S,A)]                              (2)
4210  *
4211  * From the shifted tile mapping, we construct a mapping
4212  *
4213  *      [S -> A] -> [A -> T(S,A)]
4214  *
4215  * and apply it to the schedule (2), obtaining
4216  *
4217  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
4218  *
4219  * Note that we can project out S because it is uniquely defined by L.
4220  */
4221 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
4222         __isl_take isl_map *sched,
4223         const char *type, struct gpu_array_ref_group *group,
4224         __isl_take isl_ast_build *build, int private)
4225 {
4226         isl_space *space;
4227         isl_ast_node *tree;
4228         isl_map *schedule, *shift, *map;
4229         isl_set *set;
4230         isl_id_list *iterators;
4231         int n;
4232
4233         shift = shift_access(group);
4234
4235         schedule = isl_map_copy(shift);
4236         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
4237         if (!private)
4238                 schedule = tile_access_schedule(gen, schedule);
4239
4240         n = isl_map_dim(schedule, isl_dim_out);
4241         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
4242         set = add_bounded_parameters(set, gen->kernel->n_block,
4243                                         gen->kernel->block_dim, "t");
4244
4245         schedule = isl_map_range_product(sched, schedule);
4246
4247         space = isl_space_domain(isl_map_get_space(shift));
4248         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
4249         map = isl_map_range_product(map, shift);
4250
4251         schedule = isl_map_apply_domain(schedule, map);
4252
4253         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
4254
4255         build = isl_ast_build_restrict(build, set);
4256
4257         gen->copy_group = group;
4258
4259         if (private) {
4260                 space = isl_space_range(isl_map_get_space(schedule));
4261                 space = isl_space_range(isl_space_unwrap(space));
4262                 build = set_unroll(build, space, 0);
4263         }
4264         iterators = generate_names(gen->ctx, n, "c");
4265         build = isl_ast_build_set_iterators(build, iterators);
4266         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
4267         tree = isl_ast_build_ast_from_schedule(build,
4268                                             isl_union_map_from_map(schedule));
4269         isl_ast_build_free(build);
4270
4271         return tree;
4272 }
4273
4274 /* Return code for reading into or writing from shared memory
4275  * the given array reference group.
4276  *
4277  * If we are performing a read from global memory to shared memory and
4278  * if the array involved is not a scalar, then we copy
4279  * the entire tile to shared memory.  This may result in some extra
4280  * elements getting copied, but it should lead to simpler code
4281  * (which means that fewer registers may be needed) and less divergence.
4282  *
4283  * Otherwise, we only copy the elements that will be read or have been written
4284  * in the kernel.
4285  *
4286  *
4287  * The input "sched" is of the form.
4288  *
4289  *      type[S -> A] -> L
4290  *
4291  * with S the first shared_len dimensions of the computed schedule,
4292  * A the array and L the schedule corresponding to the generated loops.
4293  *
4294  * We first drop "type",
4295  *
4296  *      [S -> A] -> L
4297  *
4298  * If the above conditions are satisfied, we project out A,
4299  * resulting in
4300  *
4301  *      S -> L
4302  *
4303  * and then introduce the group tile [S -> T], resulting in
4304  *
4305  *      [S -> T] -> L
4306  */
4307 static __isl_give isl_ast_node *copy_group_shared_accesses(
4308         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4309         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4310 {
4311         const char *type;
4312         int read;
4313         isl_union_map *access;
4314
4315         type = isl_map_get_tuple_name(sched, isl_dim_in);
4316         read = !strcmp(type, "read");
4317
4318         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4319
4320         if (read && !gpu_array_is_scalar(group->array)) {
4321                 isl_space *space;
4322                 isl_map *map;
4323
4324                 space = isl_space_domain(isl_map_get_space(sched));
4325                 space = isl_space_unwrap(space);
4326                 map = isl_map_domain_map(isl_map_universe(space));
4327                 sched = isl_map_apply_domain(sched, map);
4328
4329                 map = group_tile(group);
4330                 map = isl_map_reverse(isl_map_domain_map(map));
4331                 sched = isl_map_apply_domain(sched, map);
4332         }
4333
4334         return copy_access(gen, sched, type, group, build, 0);
4335 }
4336
4337 /* Return code for reading into or writing from private memory
4338  * the given array reference group.
4339  *
4340  * Let S be the first shared_len dimensions of the computed schedule,
4341  * D the iteration domains, A the array and L the schedule corresponding
4342  * to the generated loops.
4343  * "sched" is of the form
4344  *
4345  *      type[S -> A] -> L
4346  *
4347  * where type is either "read" or "write".
4348  * We apply the privatization D -> S(t), with t the thread ids,
4349  * to the access relation D -> A to obtain the privatized access relation
4350  *
4351  *      S(t) -> A
4352  *
4353  * We drop the type from "sched" and intersect with the privatized access
4354  * relation to obtain
4355  *
4356  *      [S(t) -> A] -> L
4357  */
4358 static __isl_give isl_ast_node *copy_group_private_accesses(
4359         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4360         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4361 {
4362         const char *type;
4363         int read;
4364         isl_union_map *priv;
4365         isl_union_map *access;
4366         isl_map *access_map;
4367
4368         type = isl_map_get_tuple_name(sched, isl_dim_in);
4369         read = !strcmp(type, "read");
4370
4371         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
4372         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
4373                                         priv);
4374
4375         access = group_access_relation(group, read, !read);
4376         access = isl_union_map_apply_domain(access, priv);
4377         access_map = isl_map_from_union_map(access);
4378
4379         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4380         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
4381
4382         return copy_access(gen, sched, type, group, build, 1);
4383 }
4384
4385 /* Return code for reading into or writing from shared or private memory.
4386  *
4387  * "schedule" is of the form
4388  *
4389  *      type[S -> A] -> L
4390  *
4391  * with S the first shared_len dimensions of the computed schedule,
4392  * A the array and L the schedule corresponding to the generated loops.
4393  * The array reference group is attached to "type".
4394  */
4395 static __isl_give isl_ast_node *create_access_leaf(
4396         struct gpu_gen *gen, __isl_take isl_map *schedule,
4397         __isl_take isl_ast_build *build)
4398 {
4399         struct gpu_array_ref_group *group;
4400         isl_id *id;
4401
4402         id = isl_map_get_tuple_id(schedule, isl_dim_in);
4403         group = isl_id_get_user(id);
4404         isl_id_free(id);
4405
4406         if (group->private_tile)
4407                 return copy_group_private_accesses(gen, group, schedule,
4408                                                         build);
4409         else
4410                 return copy_group_shared_accesses(gen, group, schedule,
4411                                                         build);
4412 }
4413
4414 /* Create a domain node representing a synchronization.
4415  */
4416 static __isl_give isl_ast_node *create_sync_leaf(
4417         struct gpu_gen *gen, __isl_take isl_map *schedule,
4418         __isl_take isl_ast_build *build)
4419 {
4420         struct ppcg_kernel_stmt *stmt;
4421         isl_id *id;
4422         isl_space *space;
4423         isl_ast_node *node;
4424         isl_ast_expr *expr;
4425
4426         isl_map_free(schedule);
4427
4428         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4429         if (!stmt)
4430                 return NULL;
4431
4432         stmt->type = ppcg_kernel_sync;
4433
4434         space = isl_ast_build_get_schedule_space(build);
4435         space = isl_space_from_domain(space);
4436         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
4437         expr = isl_ast_build_call_from_pw_multi_aff(build,
4438                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
4439         node = isl_ast_node_alloc_user(expr);
4440         isl_ast_build_free(build);
4441
4442         id = isl_id_alloc(gen->ctx, NULL, stmt);
4443         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4444         return isl_ast_node_set_annotation(node, id);
4445 }
4446
4447 /* This function is called during the code generation at the point
4448  * where the schedule domain element is completely determined by
4449  * the generated code.  The input schedule contains the original
4450  * statements as well as synchronization and copy "statements".
4451  * The latter are scheduled at different points than any of the original
4452  * statements, so they will only arrive here in isolation.
4453  *
4454  * If the current schedule only refers to a single statement,
4455  * we check if it is a copy or synchronization statement and
4456  * call the appropriate functions.
4457  * Otherwise, we assume we are dealing with the original statements
4458  * and we call create_domain_leaf.
4459  */
4460 static __isl_give isl_ast_node *create_kernel_leaf(
4461         __isl_take isl_ast_build *build, void *user)
4462 {
4463         struct gpu_gen *gen = (struct gpu_gen *) user;
4464         isl_map *map;
4465         isl_union_map *schedule;
4466         const char *name;
4467
4468         schedule = isl_ast_build_get_schedule(build);
4469
4470         if (isl_union_map_n_map(schedule) != 1)
4471                 return create_domain_leaf(schedule, build, user);
4472
4473         map = isl_map_from_union_map(schedule);
4474         name = isl_map_get_tuple_name(map, isl_dim_in);
4475         if (!strcmp(name, "read") || !strcmp(name, "write"))
4476                 return create_access_leaf(gen, map, build);
4477         if (!strcmp(name, "sync"))
4478                 return create_sync_leaf(gen, map, build);
4479
4480         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4481 }
4482
4483 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4484  * have value 0) and all even schedule dimensions as "unroll".
4485  *
4486  * That is, the options look as follows
4487  *
4488  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4489  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4490  *
4491  * The even positions are used to be able to schedule copying blocks
4492  * and synchronization before or after each level of the shared memory
4493  * tile loops and we want to make sure that code for these is generated
4494  * separately (within each level).
4495  */
4496 static __isl_give isl_ast_build *set_atomic_and_unroll(
4497         __isl_take isl_ast_build *build,
4498         __isl_take isl_space *space, int sched_len)
4499 {
4500         isl_ctx *ctx;
4501         isl_map *map;
4502         isl_constraint *c;
4503         isl_union_map *opt;
4504         isl_local_space *ls;
4505         int i, n;
4506
4507         ctx = isl_ast_build_get_ctx(build);
4508
4509         space = isl_space_params(space);
4510         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4511         space = isl_space_from_domain(space);
4512         space = isl_space_add_dims(space, isl_dim_out, 2);
4513         map = isl_map_universe(isl_space_copy(space));
4514         for (i = 0; i < sched_len; i += 2)
4515                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4516         ls = isl_local_space_from_space(isl_map_get_space(map));
4517         c = isl_equality_alloc(ls);
4518         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4519         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4520         c = isl_constraint_set_constant_si(c, 1);
4521         map = isl_map_add_constraint(map, c);
4522         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4523         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4524         opt = isl_union_map_from_map(map);
4525
4526         map = isl_map_universe(space);
4527         ls = isl_local_space_from_space(isl_map_get_space(map));
4528         c = isl_equality_alloc(ls);
4529         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4530         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4531         map = isl_map_add_constraint(map, c);
4532         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4533         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4534         opt = isl_union_map_add_map(opt, map);
4535
4536         build = isl_ast_build_set_options(build, opt);
4537
4538         return build;
4539 }
4540
4541 /* Return a map that maps a space of dimension gen->shared_len
4542  * to its last dimensions starting at gen->tile_first.
4543  * The range is of dimension
4544  *
4545  *      2 * (gen->shared_len - gen->tile_first) + 1
4546  *
4547  * The input dimensions are mapped to the odd dimensions in the output,
4548  * while the even dimensions (except 2*pos) are fixed to 0.
4549  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4550  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4551  * are mapped to the output.  The remaining input dimensions are projected
4552  * out and the corresponding output dimensions are fixed to 0.
4553  */
4554 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4555         __isl_take isl_space *space, int pos, int val)
4556 {
4557         int i, n;
4558         isl_map *proj;
4559
4560         space = isl_space_set_from_params(space);
4561         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4562         space = isl_space_map_from_set(space);
4563         proj = isl_map_identity(space);
4564         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4565         n = gen->shared_len - gen->tile_first;
4566         for (i = 0; i <= n; ++i) {
4567                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4568                 if (i == pos)
4569                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4570                 else
4571                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4572         }
4573
4574         if (pos < 0)
4575                 return proj;
4576
4577         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4578                                 gen->shared_len - (gen->tile_first + pos));
4579         for (i = pos; i < n; ++i)
4580                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4581
4582         return proj;
4583 }
4584
4585 /* Given the AST context schedule "schedule" and the mapping from
4586  * domains to the shared tile loops "shared_sched", add a schedule
4587  * for a synchronization operation at position "val" of loop level "pos".
4588  *
4589  * schedule is of the form
4590  *
4591  *      D -> L
4592  *
4593  * (with D the iteration domains and L the already generated loops),
4594  * while shared_sched is of the form
4595  *
4596  *      D -> S
4597  *
4598  * We combine them into
4599  *
4600  *      L -> S
4601  *
4602  * apply a mapping
4603  *
4604  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4605  *
4606  * and use the result as a schedule for "sync".
4607  */
4608 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4609         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4610         __isl_keep isl_union_map *shared_sched, int pos, int val)
4611 {
4612         isl_space *space;
4613         isl_map *proj, *map;
4614
4615         shared_sched = isl_union_map_copy(shared_sched);
4616         schedule = isl_union_map_copy(schedule);
4617
4618         space = isl_union_map_get_space(shared_sched);
4619         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4620         map = isl_map_from_union_map(schedule);
4621
4622         proj = insert_even(gen, space, pos, val);
4623         map = isl_map_apply_range(map, proj);
4624         map = isl_map_from_range(isl_map_wrap(map));
4625         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4626
4627         res = isl_union_map_add_map(res, map);
4628
4629         return res;
4630 }
4631
4632 /* Given a set of wrapped references "ref", return the corresponding
4633  * access relations based on the tagged access relations "tagged".
4634  *
4635  * The elements of "ref" are of the form
4636  *
4637  *      [D -> R]
4638  *
4639  * with D an iteration domain and R a reference.
4640  * The elements of "tagged" are of the form
4641  *
4642  *      [D -> R] -> A
4643  *
4644  * with A an array.
4645  *
4646  * Extend "tagged" to include the iteration domain in the range, i.e.,
4647  *
4648  *      [D -> R] -> [D -> A]
4649  *
4650  * apply the result to "ref" and then unwrap the resulting set
4651  * to obtain relations of the form
4652  *
4653  *      D -> A
4654  */
4655 static __isl_give isl_union_map *wrapped_reference_to_access(
4656         __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
4657 {
4658         isl_union_map *tag2access;
4659
4660         tag2access = isl_union_map_copy(tagged);
4661         tag2access = isl_union_map_universe(tag2access);
4662         tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
4663         tag2access = isl_union_map_domain_map(tag2access);
4664         tag2access = isl_union_map_range_product(tag2access, tagged);
4665
4666         ref = isl_union_set_coalesce(ref);
4667         ref = isl_union_set_apply(ref, tag2access);
4668
4669         return isl_union_set_unwrap(ref);
4670 }
4671
4672 /* Given an access relation "access" from "group", remove those reads
4673  * (if "read" is 1) or writes (if "read" is 0) that are only needed to
4674  * communicate data within the same iteration of the last_shared dimension
4675  * of the group.
4676  *
4677  * If the access is a read then it is necessarily an element of
4678  *
4679  *      live_in union (range flow)
4680  *
4681  * where live_in and flow may be overapproximations.
4682  * If the access is a write then it is necessarily an element of
4683  *
4684  *      live_out union (domain flow)
4685  *
4686  * In both cases, the access relation is also a subset of
4687  * the group access relation.
4688  *
4689  * Essentially, we compute the intersection of "access" with either
4690  *
4691  *      live_in union (range non-local-flow)
4692  *
4693  * or
4694  *
4695  *      live_out union (domain non-local-flow)
4696  *
4697  * We first construct a relation "local"
4698  *
4699  *      [[D -> R] -> [D' -> R']]
4700  *
4701  * of pairs of domain iterations accessing the reference group
4702  * and references in the group that are scheduled to the same iteration
4703  * of the last_shared dimension.
4704  *
4705  * If this relation does not intersect the dataflow dependences,
4706  * then there is nothing we can possibly remove and we simply
4707  * return the input.
4708  *
4709  * Otherwise, we remove the "local" dataflow dependences from
4710  * the set of all dataflow dependences.
4711  * Note that if the potential dataflow dependences are an overapproximation
4712  * of the actual dataflow dependences, then the result remains an
4713  * overapproximation of the non-local dataflow dependences.
4714  * Copying to/from global memory is only needed for the references
4715  * in the domain/range of the result or for accesses that are live out/in
4716  * for the entire scop.
4717  *
4718  * We therefore map the domain/range of the "external" relation
4719  * to the corresponding access relation and take the union with
4720  * the live out/in relation.
4721  */
4722 static __isl_give isl_union_map *remove_local_accesses(struct gpu_gen *gen,
4723         struct gpu_array_ref_group *group, __isl_take isl_union_map *access,
4724         int read)
4725 {
4726         int empty;
4727         isl_union_map *tagger;
4728         isl_union_set *domain;
4729         isl_space *space;
4730         isl_union_map *sched, *local, *tagged, *external;
4731         isl_union_set *tag_set;
4732         isl_map *proj;
4733
4734         if (isl_union_map_is_empty(access))
4735                 return access;
4736
4737         tagged = group_tagged_access_relation(group);
4738
4739         sched = isl_union_map_copy(gen->sched);
4740
4741         space = isl_union_map_get_space(sched);
4742         proj = projection(space, gen->untiled_len, group->last_shared + 1);
4743         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4744
4745         tagger = isl_union_map_copy(gen->prog->scop->tagger);
4746         domain = isl_union_map_domain(isl_union_map_copy(tagged));
4747         tagger = isl_union_map_intersect_range(tagger, domain);
4748         sched = isl_union_map_apply_domain(sched, tagger);
4749
4750         local = isl_union_map_apply_range(sched,
4751                             isl_union_map_reverse(isl_union_map_copy(sched)));
4752         local = isl_union_map_intersect(local,
4753                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow));
4754
4755         empty = isl_union_map_is_empty(local);
4756         if (empty < 0 || empty) {
4757                 isl_union_map_free(tagged);
4758                 isl_union_map_free(local);
4759                 if (empty < 0)
4760                         return isl_union_map_free(access);
4761                 return access;
4762         }
4763
4764         external = isl_union_map_copy(gen->prog->scop->tagged_dep_flow);
4765         external = isl_union_map_intersect_params(external,
4766                                 isl_set_copy(gen->prog->scop->context));
4767         external = isl_union_map_subtract(external, local);
4768
4769         if (read) {
4770                 tag_set = isl_union_map_range(external);
4771                 external = wrapped_reference_to_access(tag_set, tagged);
4772                 external = isl_union_map_union(external,
4773                                 isl_union_map_copy(gen->prog->scop->live_in));
4774         } else {
4775                 tag_set = isl_union_map_domain(external);
4776                 external = wrapped_reference_to_access(tag_set, tagged);
4777                 external = isl_union_map_union(external,
4778                                 isl_union_map_copy(gen->prog->scop->live_out));
4779         }
4780
4781         access = isl_union_map_intersect(access, external);
4782
4783         return access;
4784 }
4785
4786 /* Given the AST context schedule "schedule" and the mapping from
4787  * domains to the shared tile loops "shared_sched", add a schedule
4788  * for copying an array reference group to/from shared/private memory.
4789  * "read" is set if data should be copied from global memory
4790  * to shared/private memory.
4791  * "k" represents the current group
4792  * "s" is the total number of groups
4793  *
4794  * We schedule an operation before or after the innermost loop
4795  * of "shared_sched" that affects the tile of the array reference group.
4796  *
4797  * schedule is of the form
4798  *
4799  *      D -> L
4800  *
4801  * (with D the iteration domains and L the already generated loops),
4802  * while shared_sched is of the form
4803  *
4804  *      D -> S
4805  *
4806  * We first compute the access relation for the reference group
4807  *
4808  *      D -> A
4809  *
4810  * and remove from this access relation those reads or writes
4811  * that are only needed to communicate data within the same iteration
4812  * of the last_shared dimension of the group.
4813  * We then combine what is left with shared_sched into
4814  *
4815  *      D -> [S -> A]
4816  *
4817  * If this results in an empty relation, no copying needs to be performed
4818  * at this point.
4819  * Otherwise, we invert the relation and combine it with "schedule" into
4820  *
4821  *      [S -> A] -> L
4822  *
4823  * The actual additional piece of the schedule is obtained from combining
4824  *
4825  *      [S -> A] -> S
4826  *
4827  * with a mapping
4828  *
4829  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4830  *
4831  * The position of "val" corresponds to the innermost loop that affects
4832  * the tile and the value indicates where the copying is scheduled
4833  * with respect to the actual kernel code (at value 0).
4834  * Reads are schedule before the code, writes to global memory from
4835  * private memory are scheduled at values 1 to s, writes to global
4836  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4837  *
4838  * If we are scheduling a read from global memory to shared memory,
4839  * we insert a synchronization before the kernel code (at the innermost
4840  * level).
4841  * If we are scheduling a write to global memory, then we add
4842  * a synchronization after all writes (at value 2 *s + 2).
4843  * However, there is no need for a synchronization after the outermost loop.
4844  * A write to global memory from private memory at the innermost level
4845  * does not require a synchronization, because it is covered by
4846  * the synchronization after the kernel inserted by body_schedule.
4847  */
4848 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4849         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4850         __isl_keep isl_union_map *shared_sched,
4851         struct gpu_array_ref_group *group, int read, int k, int s)
4852 {
4853         int n;
4854         int pos, val;
4855         isl_space *space;
4856         isl_union_map *access;
4857         isl_map *map, *proj, *access_map;
4858         isl_id *id;
4859
4860         access = group_access_relation(group, read, !read);
4861         access = remove_local_accesses(gen, group, access, read);
4862         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4863                                                 access);
4864
4865         if (isl_union_map_is_empty(access)) {
4866                 isl_union_map_free(access);
4867                 return res;
4868         }
4869
4870         access = isl_union_map_reverse(access);
4871         access = isl_union_map_apply_range(access,
4872                                             isl_union_map_copy(schedule));
4873         access_map = isl_map_from_union_map(access);
4874
4875         space = isl_space_copy(group->array->space);
4876         space = isl_space_from_range(space);
4877         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4878         map = isl_map_domain_map(isl_map_universe(space));
4879
4880         space = isl_union_map_get_space(schedule);
4881         pos = group->last_shared + 1 - gen->tile_first;
4882         assert(pos >= 0);
4883         if (read)
4884                 val = -2 - k;
4885         else if (group->private_tile)
4886                 val = 1 + k;
4887         else
4888                 val = 1 + s + 1 + k;
4889         proj = insert_even(gen, space, pos, val);
4890         map = isl_map_apply_range(map, proj);
4891
4892         access_map = isl_map_range_product(access_map, map);
4893
4894         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4895         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4896
4897         res = isl_union_map_add_map(res, access_map);
4898
4899         n = gen->shared_len - gen->tile_first;
4900         if (read) {
4901                 if (!group->private_tile)
4902                         res = add_sync_schedule(gen, res, schedule,
4903                                                 shared_sched, n, -1);
4904         } else {
4905                 if (pos == 0)
4906                         return res;
4907                 if (pos == n && group->private_tile)
4908                         return res;
4909                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4910                                         pos, 2 * s + 2);
4911         }
4912
4913         return res;
4914 }
4915
4916 /* Return a schedule for the shared tile loops based on the current
4917  * AST context schedule.
4918  *
4919  * We create a "shared_sched" that maps the domains to the first
4920  * shared_len dimensions of the computed schedule, project out the
4921  * first tile_first dimensions (as these are already covered by
4922  * the host code) and insert "statement-level" dimensions at even
4923  * positions so that we can schedule copy blocks and synchronization
4924  * before/after each level.
4925  *
4926  * In particular, copy blocks are inserted inside the innermost
4927  * level that affect the tile.  For the copying to global memory,
4928  * those from private memory are scheduled before those from shared
4929  * memory such that synchronization can be inserted between the two
4930  * at the innermost level.
4931  * Synchronization is inserted at the innermost level before the
4932  * actual kernel code if there is any copying from global memory
4933  * to shared memory.  It is inserted unconditionally at the innermost
4934  * level after the actual kernel code and the copying to global memory
4935  * from private memory (if any).  Finally, it is inserted after
4936  * any copying to global memory, except at the outermost level
4937  * and at the innermost level if there is no copying from shared
4938  * memory.  The copying from private memory is covered by the unconditional
4939  * synchronization at the innermost level.
4940  */
4941 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4942         __isl_take isl_union_map *schedule)
4943 {
4944         isl_space *space;
4945         isl_union_map *res;
4946         isl_union_map *shared_sched;
4947         isl_union_map *sched;
4948         isl_map *proj, *map;
4949         int i, j, k, s;
4950
4951         shared_sched = isl_union_map_copy(gen->tiled_sched);
4952         proj = projection(isl_union_map_get_space(shared_sched),
4953                                 gen->tiled_len, gen->shared_len);
4954         shared_sched = isl_union_map_apply_range(shared_sched,
4955                                 isl_union_map_from_map(proj));
4956         space = isl_union_map_get_space(shared_sched);
4957         proj = insert_even(gen, space, -1, 0);
4958         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4959                                 isl_union_map_from_map(proj));
4960
4961         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4962
4963         s = 0;
4964         for (i = 0; i < gen->prog->n_array; ++i)
4965                 s += gen->prog->array[i].n_group;
4966
4967         k = 0;
4968         for (i = 0; i < gen->prog->n_array; ++i) {
4969                 struct gpu_array_info *array = &gen->prog->array[i];
4970
4971                 for (j = 0; j < array->n_group; ++j) {
4972                         struct gpu_array_ref_group *group;
4973
4974                         group = array->groups[j];
4975                         if (!group->private_tile && !group->shared_tile)
4976                                 continue;
4977                         res = add_group_schedule(gen, res, schedule,
4978                                                 shared_sched, group, 0, k, s);
4979                         res = add_group_schedule(gen, res, schedule,
4980                                                 shared_sched, group, 1, k, s);
4981                         ++k;
4982                 }
4983         }
4984
4985         res = add_sync_schedule(gen, res, schedule, shared_sched,
4986                             gen->shared_len - gen->tile_first, 1 + s);
4987
4988         isl_union_map_free(shared_sched);
4989         isl_union_map_free(schedule);
4990
4991         return res;
4992 }
4993
4994 /* Generate code for "kernel" in the given "context".
4995  *
4996  * We first generate code for the shared tile loops (T1T, T1P and T2)
4997  * in a context that includes the block ids.
4998  * Within each iteration of these loops an additional code generation
4999  * is performed (within create_kernel_leaf) for the rest of the schedule
5000  * in a context that includes the thread ids.
5001  */
5002 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
5003         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
5004         __isl_keep isl_multi_pw_aff *grid_size)
5005 {
5006         isl_space *space;
5007         isl_set *set;
5008         isl_id_list *iterators;
5009         isl_union_map *schedule;
5010         isl_ast_node *tree;
5011         int sched_len;
5012
5013         schedule = isl_ast_build_get_schedule(build);
5014
5015         build = isl_ast_build_copy(build);
5016         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
5017         space = isl_ast_build_get_schedule_space(build);
5018         set = isl_set_universe(isl_space_copy(space));
5019         set = add_bounded_parameters_dynamic(set, grid_size, "b");
5020         build = isl_ast_build_restrict(build, set);
5021
5022         schedule = body_schedule(gen, schedule);
5023
5024         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
5025
5026         build = set_atomic_and_unroll(build, space, sched_len);
5027         iterators = generate_names(gen->ctx, sched_len, "g");
5028         build = isl_ast_build_set_iterators(build, iterators);
5029         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
5030         tree = isl_ast_build_ast_from_schedule(build, schedule);
5031         isl_ast_build_free(build);
5032
5033         return tree;
5034 }
5035
5036 /* Attach "id" to the given node.
5037  */
5038 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
5039         __isl_keep isl_ast_build *build, void *user)
5040 {
5041         isl_id *id = user;
5042
5043         node = isl_ast_node_set_annotation(node, id);
5044
5045         return node;
5046 }
5047
5048 /* Construct an AST node for performing a kernel launch and attach
5049  * the information about the kernel to that node.
5050  *
5051  * The kernel AST has been constructed in the context of the range
5052  * of "schedule".  In particular, the grid size has been computed
5053  * in the context.  We therefore still need to make sure that these
5054  * constraints are expressed in the code.  We do this by creating a schedule
5055  *
5056  *      kernel[] -> [S -> []]
5057  *
5058  * where S is the schedule domain, i.e., the range of "schedule".
5059  * The AST generation will then create a single call surrounded by
5060  * all the condition in "S" that have not been expressed yet.
5061  *
5062  * The kernel information is attached to this node in attach_id.
5063  */
5064 static __isl_give isl_ast_node *construct_launch(
5065         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
5066         __isl_take struct ppcg_kernel *kernel)
5067 {
5068         isl_id *id;
5069         isl_ctx *ctx;
5070         isl_union_set *domain;
5071         isl_set *set;
5072         isl_map *map;
5073         isl_ast_node *node;
5074
5075         ctx = isl_ast_build_get_ctx(build);
5076
5077         id = isl_id_alloc(ctx, NULL, kernel);
5078         id = isl_id_set_free_user(id, &ppcg_kernel_free);
5079
5080         domain = isl_union_map_range(schedule);
5081         set = isl_set_from_union_set(domain);
5082         map = isl_map_from_domain(set);
5083         map = isl_map_from_range(isl_map_wrap(map));
5084         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
5085         schedule = isl_union_map_from_map(map);
5086
5087         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
5088         node = isl_ast_build_ast_from_schedule(build, schedule);
5089         isl_ast_build_free(build);
5090
5091         return node;
5092 }
5093
5094 /* This function is called for each leaf in the AST of the host code.
5095  * We first specialize the schedule to the site of the leaf, compute
5096  * the size of shared memory and then construct the body of the host code
5097  * and the associated kernel.
5098  *
5099  * The necessary information for printing the kernel launch is
5100  * stored in a struct ppcg_kernel and attached to the leaf node
5101  * created to represent the launch.
5102  */
5103 static __isl_give isl_ast_node *create_host_leaf(
5104         __isl_take isl_ast_build *build, void *user)
5105 {
5106         struct gpu_gen *gen = (struct gpu_gen *) user;
5107         isl_id *id;
5108         isl_ast_node *node;
5109         struct ppcg_kernel *kernel;
5110         isl_set *host_domain;
5111         isl_union_map *schedule;
5112         isl_union_map *local_sched;
5113         isl_union_map *access;
5114         isl_union_set *domain;
5115         int i;
5116
5117         schedule = isl_ast_build_get_schedule(build);
5118
5119         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
5120         read_sizes(gen);
5121
5122         domain = isl_union_map_domain(isl_union_map_copy(schedule));
5123
5124         local_sched = isl_union_map_copy(gen->sched);
5125         local_sched = isl_union_map_intersect_domain(local_sched, domain);
5126         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
5127                                      isl_union_map_copy(gen->prog->may_write));
5128         access = isl_union_map_apply_domain(access,
5129                                             isl_union_map_copy(local_sched));
5130
5131         gen->tiled_sched = tile_schedule(gen, local_sched);
5132         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
5133         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
5134
5135         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
5136         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
5137         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
5138
5139         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
5140         if (!kernel)
5141                 goto error;
5142
5143         kernel->id = gen->kernel_id++;
5144         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
5145         kernel->grid_size = extract_grid_size(gen, kernel);
5146         extract_block_size(gen, kernel);
5147         kernel->arrays = isl_union_map_range(access);
5148         kernel->arrays = isl_union_set_apply(kernel->arrays,
5149                                 isl_union_map_copy(gen->prog->to_outer));
5150         kernel->space = isl_ast_build_get_schedule_space(build);
5151
5152         compute_shared_sched(gen);
5153         gen->privatization = compute_privatization(gen);
5154         check_scalar_live_ranges(gen);
5155         if (group_references(gen) < 0)
5156                 schedule = isl_union_map_free(schedule);
5157         host_domain = isl_set_from_union_set(isl_union_map_range(
5158                                                 isl_union_map_copy(schedule)));
5159         localize_bounds(gen, kernel, host_domain);
5160
5161         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
5162         check_shared_memory_bound(gen);
5163         compute_group_tilings(gen);
5164
5165         kernel->tree = generate_kernel(gen, build, host_domain,
5166                                         kernel->grid_size);
5167         create_kernel_vars(gen, kernel);
5168
5169         free_local_array_info(gen);
5170         isl_map_free(gen->privatization);
5171         isl_union_map_free(gen->local_sched);
5172         isl_union_map_free(gen->tiled_sched);
5173         isl_union_map_free(gen->shared_sched);
5174         isl_union_map_free(gen->shared_proj);
5175         isl_set_free(host_domain);
5176         free(gen->tile_size);
5177
5178         node = construct_launch(build, schedule, kernel);
5179
5180         return node;
5181 error:
5182         isl_union_map_free(schedule);
5183         return NULL;
5184 }
5185
5186 /* Use isl to generate code for the outer gen->tile_first loops
5187  * of the global schedule in gen->sched, resulting in the host code.
5188  * Within each iteration of this partial schedule, i.e., for each kernel
5189  * launch, create_host_leaf takes care of generating the kernel code.
5190  */
5191 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
5192 {
5193         isl_ast_build *build;
5194         isl_ast_node *tree;
5195         isl_union_map *sched;
5196         isl_map *proj;
5197         isl_id_list *iterators;
5198
5199         sched = isl_union_map_copy(gen->sched);
5200         proj = projection(isl_union_map_get_space(sched),
5201                             gen->untiled_len, gen->tile_first);
5202         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
5203
5204         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
5205         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
5206         iterators = generate_names(gen->ctx, gen->tile_first, "h");
5207         build = isl_ast_build_set_iterators(build, iterators);
5208         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
5209         tree = isl_ast_build_ast_from_schedule(build, sched);
5210         isl_ast_build_free(build);
5211
5212         return tree;
5213 }
5214
5215 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
5216 {
5217         if (!str)
5218                 return NULL;
5219         return isl_union_map_read_from_str(ctx, str);
5220 }
5221
5222 /* Information about the outermost tilable bands in the forest of bands.
5223  *
5224  * tile_len and n_parallel are only sets on band_info structures
5225  * that correspond to outermost bands.  For other bands (in particular,
5226  * ancestors of the outermost bands), n_parallal is set to 0.
5227  *
5228  * prefix is the (padded) schedule leading up to the outermost tilable bands.
5229  *
5230  * tile_first is the number of schedule dimensions in prefix.
5231  *
5232  * suffix is the schedule of the outermost tilable bands and their descendants.
5233  */
5234 struct band_info {
5235         struct gpu_gen *gen;
5236         int tile_first;
5237         int tile_len;
5238         int n_parallel;
5239         isl_union_map *prefix;
5240         isl_union_map *suffix;
5241 };
5242
5243 /* Set tile_len and n_parallel of the statement to that of
5244  * their outermost band, recorded in the band_info.
5245  */
5246 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
5247 {
5248         struct band_info *info = user;
5249         struct gpu_stmt *stmt;
5250         isl_id *id;
5251
5252         id = isl_map_get_tuple_id(map, isl_dim_in);
5253         stmt = find_stmt(info->gen->prog, id);
5254         isl_id_free(id);
5255
5256         stmt->tile_len = info->tile_len;
5257         stmt->n_parallel = info->n_parallel;
5258
5259         isl_map_free(map);
5260
5261         return 0;
5262 }
5263
5264 static void list_select_outer_band(struct gpu_gen *gen,
5265         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
5266
5267 /* Check if this band has any parallel loops.  If so, take it as
5268  * the outermost tilable band.  If not, continue looking for the
5269  * outermost tilable band in the children of the current band.
5270  */
5271 static void band_select_outer_band(struct gpu_gen *gen,
5272         __isl_take isl_band *band, int pos, struct band_info *info)
5273 {
5274         int n = isl_band_n_member(band);
5275         int n_parallel;
5276
5277         for (n_parallel = 0; n_parallel < n; ++n_parallel)
5278                 if (!isl_band_member_is_coincident(band, n_parallel))
5279                         break;
5280
5281         info->n_parallel = n_parallel;
5282         if (n_parallel) {
5283                 gen->any_parallelism = 1;
5284                 info->gen = gen;
5285                 info->tile_first = pos;
5286                 info->tile_len = n;
5287                 info->prefix = isl_band_get_prefix_schedule(band);
5288                 info->suffix = isl_union_map_flat_range_product(
5289                                 isl_band_get_partial_schedule(band),
5290                                 isl_band_get_suffix_schedule(band));
5291                 isl_union_map_foreach_map(info->prefix,
5292                                             &set_stmt_tile_len, info);
5293         } else if (isl_band_has_children(band)) {
5294                 isl_band_list *children;
5295                 children = isl_band_get_children(band);
5296                 list_select_outer_band(gen, children, pos + n, info);
5297         } else {
5298                 info->gen = gen;
5299                 info->tile_first = pos + n;
5300                 info->tile_len = 0;
5301                 info->prefix = isl_union_map_flat_range_product(
5302                                 isl_band_get_prefix_schedule(band),
5303                                 isl_band_get_partial_schedule(band));
5304                 info->suffix = isl_band_get_suffix_schedule(band);
5305                 isl_union_map_foreach_map(info->prefix,
5306                                             &set_stmt_tile_len, info);
5307         }
5308
5309         isl_band_free(band);
5310 }
5311
5312 /* Comparison function that returns a non-zero value for band_infos
5313  * with different tile_len fields or different n_parallel fields.
5314  */
5315 static int cmp_band(const void *p1, const void *p2)
5316 {
5317         const struct band_info *info1 = p1;
5318         const struct band_info *info2 = p2;
5319
5320         if (info1->tile_len != info2->tile_len)
5321                 return info1->tile_len - info2->tile_len;
5322
5323         return info1->n_parallel - info2->n_parallel;
5324 }
5325
5326 /* Extend "umap" with coordinates with fixed value "val"
5327  * to a total length of "dst_len", assuming the original dimension is "src_len".
5328  */
5329 static __isl_give isl_union_map *extend_range(
5330         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
5331 {
5332         isl_space *dim;
5333         isl_map *map;
5334         int i;
5335
5336         dim = isl_union_map_get_space(umap);
5337         map = isl_map_reverse(projection(dim, dst_len, src_len));
5338         for (i = src_len; i < dst_len; ++i)
5339                 map = isl_map_fix_si(map, isl_dim_out, i, val);
5340
5341         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
5342
5343         return umap;
5344 }
5345
5346 /* Group bands with the same values for tile_len and n_parallel.
5347  * The prefix schedule is then extended with a fixed coordinate that
5348  * is different for each such group.
5349  * Note that the actual values for this coordinate are not important.
5350  * The bands have already been effectively separated at a higher level
5351  * or they are independent and may be executed in parallel.
5352  * The list of band_info has been sorted before this functions is called.
5353  */
5354 static void separate_bands(struct band_info *info, int n)
5355 {
5356         int i;
5357         int j = 0;
5358
5359         for (i = 0; i < n; ++i) {
5360                 int l = info[i].tile_first;
5361
5362                 if (i &&
5363                     (info[i].tile_len != info[i - 1].tile_len ||
5364                      info[i].n_parallel != info[i - 1].n_parallel))
5365                         j++;
5366
5367                 info[i].prefix = extend_range(info[i].prefix,
5368                                                 l, l + 1, j);
5369                 info[i].tile_first = l + 1;
5370         }
5371 }
5372
5373 /* Select the outermost bands in the elements of the list, align
5374  * their prefix schedules, separate bands with different values
5375  * for tile_len and/or n_parallel and then combine the resulting
5376  * prefix and suffix schedules into a single pair of prefix and
5377  * suffix schedules for the entire list.
5378  */
5379 static void list_select_outer_band(struct gpu_gen *gen,
5380         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
5381 {
5382         isl_band *band;
5383         int i;
5384         int n = isl_band_list_n_band(list);
5385         isl_ctx *ctx = isl_band_list_get_ctx(list);
5386         struct band_info *info;
5387         int max_tile_first;
5388         isl_union_map *prefix;
5389         isl_union_map *suffix;
5390
5391         assert(n >= 1);
5392         info = isl_calloc_array(ctx, struct band_info, n);
5393         assert(info);
5394
5395         max_tile_first = 0;
5396         for (i = 0; i < n; ++i) {
5397                 band = isl_band_list_get_band(list, i);
5398                 band_select_outer_band(gen, band, pos, &info[i]);
5399                 if (info[i].tile_first > max_tile_first)
5400                         max_tile_first = info[i].tile_first;
5401         }
5402
5403         for (i = 0; i < n; ++i) {
5404                 if (info[i].tile_first == max_tile_first)
5405                         continue;
5406                 info[i].prefix = extend_range(info[i].prefix,
5407                                         info[i].tile_first, max_tile_first, 0);
5408                 info[i].tile_first = max_tile_first;
5409         }
5410
5411         qsort(info, n, sizeof(struct band_info), &cmp_band);
5412
5413         for (i = 0; i < n - 1; ++i)
5414                 if (info[i].tile_len != info[i + 1].tile_len ||
5415                     info[i].n_parallel != info[i + 1].n_parallel)
5416                         break;
5417
5418         if (i < n -1)
5419                 separate_bands(info, n);
5420
5421         prefix = info[0].prefix;
5422         suffix = info[0].suffix;
5423
5424         for (i = 1; i < n; ++i) {
5425                 prefix = isl_union_map_union(prefix, info[i].prefix);
5426                 suffix = isl_union_map_union(suffix, info[i].suffix);
5427         }
5428
5429         list_info->tile_first = info[0].tile_first;
5430         list_info->tile_len = -1;
5431         list_info->prefix = prefix;
5432         list_info->suffix = suffix;
5433
5434         isl_band_list_free(list);
5435         free(info);
5436 }
5437
5438 /* Select the outermost tilable band that (by construction)
5439  * has at least one parallel loop.
5440  * The starting position of the aligned band is stored in the pair
5441  * gen->tile_first.
5442  * The sizes and number of parallel loops may be different in different
5443  * parts of the band forest and are therefore stored in the gpu_stmts.
5444  *
5445  * Return the complete schedule, with the tilable bands aligned
5446  * at gen->tile_first and padded with zero, if needed.
5447  */
5448 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
5449         __isl_keep isl_schedule *schedule)
5450 {
5451         isl_band_list *list;
5452         struct band_info info;
5453
5454         gen->n_parallel = 0;
5455         gen->tile_len = -1;
5456
5457         list = isl_schedule_get_band_forest(schedule);
5458
5459         if (isl_band_list_n_band(list) == 0) {
5460                 isl_band_list_free(list);
5461                 return isl_schedule_get_map(schedule);
5462         }
5463
5464         list_select_outer_band(gen, list, 0, &info);
5465
5466         gen->tile_first = info.tile_first;
5467         info.suffix = align_range(info.suffix);
5468
5469         return isl_union_map_flat_range_product(info.prefix, info.suffix);
5470 }
5471
5472 /* Set gen->untiled_len to the number of scheduling dimensions
5473  * for the schedule of the first domain.
5474  * We assume here that this number is the same for all domains.
5475  */
5476 static int set_untiled_len(__isl_take isl_map *map, void *user)
5477 {
5478         unsigned *untiled_len = user;
5479
5480         *untiled_len = isl_map_dim(map, isl_dim_out);
5481
5482         isl_map_free(map);
5483         return -1;
5484 }
5485
5486 /* Compute an appropriate schedule based on the accesses in
5487  * gen->read and gen->write.
5488  *
5489  * We use the dependences in gen->prog->scop to compute
5490  * a schedule that has a parallel loop in each tilable band.
5491  * Finally, we select the outermost tilable band.
5492  *
5493  * If live range reordering is allowed, then we need to make sure
5494  * that live ranges on arrays are not run in parallel since doing
5495  * so would require array expansion.  We therefore add the array
5496  * order dependences to the coincidence dependences.  Non-zero array
5497  * order dependences will then prevent a schedule dimension from being
5498  * considered parallel.
5499  * Live ranges derived from scalars are allowed to be run in parallel
5500  * since we force the scalars to be mapped to private memory in
5501  * check_scalar_live_ranges.
5502  * If live range reordering is allowed, then the false dependences
5503  * are not added to the validity constraints as that would prevent
5504  * reordering.  Instead, the external false dependences that enforce that reads
5505  * from potentially live-in data precede any later write and
5506  * that writes of potentially live-out data follow any other earlier write
5507  * are added to the validity and the coincidence constraints.
5508  * The false dependences are still added to the proximity constraints
5509  * for consistency with the case where live range reordering is not allowed.
5510  * The coincidence constraints then consist of flow dependences,
5511  * exernal false dependences and array order dependences.
5512  * The independences can be filtered out from the first two sets.
5513  * They have already been filtered out from the array order dependences
5514  * on a per array basis in collect_order_dependences.
5515  * There is no need for a per array handling of the other two sets
5516  * as there should be no flow or external false dependence on local
5517  * variables that can be filtered out.
5518  */
5519 static void compute_schedule(struct gpu_gen *gen)
5520 {
5521         isl_union_set *domain;
5522         isl_union_map *dep_raw, *dep;
5523         isl_union_map *validity, *proximity, *coincidence;
5524         isl_union_map *sched;
5525         isl_schedule_constraints *sc;
5526         isl_schedule *schedule;
5527
5528         domain = isl_union_set_copy(gen->prog->scop->domain);
5529         domain = isl_union_set_intersect_params(domain,
5530                                 isl_set_copy(gen->prog->scop->context));
5531         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
5532         if (gen->options->live_range_reordering) {
5533                 sc = isl_schedule_constraints_set_conditional_validity(sc,
5534                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow),
5535                         isl_union_map_copy(gen->prog->scop->tagged_dep_order));
5536                 proximity = isl_union_map_copy(gen->prog->scop->dep_flow);
5537                 validity = isl_union_map_copy(proximity);
5538                 validity = isl_union_map_union(validity,
5539                             isl_union_map_copy(gen->prog->scop->dep_external));
5540                 proximity = isl_union_map_union(proximity,
5541                             isl_union_map_copy(gen->prog->scop->dep_false));
5542                 coincidence = isl_union_map_copy(validity);
5543                 coincidence = isl_union_map_subtract(coincidence,
5544                         isl_union_map_copy(gen->prog->scop->independence));
5545                 coincidence = isl_union_map_union(coincidence,
5546                                 isl_union_map_copy(gen->prog->array_order));
5547         } else {
5548                 dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
5549                 dep = isl_union_map_copy(gen->prog->scop->dep_false);
5550                 dep = isl_union_map_union(dep, dep_raw);
5551                 dep = isl_union_map_coalesce(dep);
5552                 proximity = isl_union_map_copy(dep);
5553                 coincidence = isl_union_map_copy(dep);
5554                 validity = dep;
5555         }
5556         sc = isl_schedule_constraints_set_validity(sc, validity);
5557         sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
5558         sc = isl_schedule_constraints_set_proximity(sc, proximity);
5559
5560         if (gen->options->debug->dump_schedule_constraints)
5561                 isl_schedule_constraints_dump(sc);
5562         schedule = isl_schedule_constraints_compute_schedule(sc);
5563         if (gen->options->debug->dump_schedule)
5564                 isl_schedule_dump(schedule);
5565
5566         sched = select_outer_tilable_band(gen, schedule);
5567
5568         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
5569         sched = isl_union_map_intersect_domain(sched, domain);
5570         gen->sched = sched;
5571
5572         isl_schedule_free(schedule);
5573 }
5574
5575 /* Compute the sets of outer array elements that need to be copied in and out.
5576  *
5577  * In particular, for each array that is possibly written anywhere in
5578  * gen->prog and that is visible outside the corresponding scop,
5579  * we copy out its entire extent.
5580  *
5581  * Any array elements that is read without first being written needs
5582  * to be copied in. Furthermore, if there are any array elements that
5583  * are copied out, but that may not be written inside gen->prog, then
5584  * they also need to be copied in to ensure that the value after execution
5585  * is the same as the value before execution.
5586  * In case the array elements are structures, we need to take into
5587  * account that all members of the structures need to be written
5588  * by gen->prog before we can avoid copying the data structure in.
5589  *
5590  * While computing the set of array elements that are copied out but
5591  * not necessarily written, we intersect both sets with the context.
5592  * This helps in those cases where the arrays are declared with a fixed size,
5593  * while the accesses are parametric and the context assigns a fixed value
5594  * to the parameters.
5595  *
5596  * If an element from a local array is read without first being written,
5597  * then there is no point in copying it in since it cannot have been
5598  * written prior to the scop.  Warn about the uninitialized read instead.
5599  */
5600 static void compute_copy_in_and_out(struct gpu_gen *gen)
5601 {
5602         int i;
5603         isl_union_set *local;
5604         isl_union_set *may_write, *must_write;
5605         isl_union_set *copy_in, *copy_out;
5606         isl_union_set *not_written;
5607         isl_union_map *uninitialized;
5608         isl_union_map *local_uninitialized;
5609
5610         must_write = isl_union_map_range(
5611                                 isl_union_map_copy(gen->prog->must_write));
5612         must_write = isl_union_set_intersect_params(must_write,
5613                                             isl_set_copy(gen->prog->context));
5614         may_write = isl_union_map_range(
5615                                 isl_union_map_copy(gen->prog->may_write));
5616         may_write = isl_union_set_intersect_params(may_write,
5617                                             isl_set_copy(gen->prog->context));
5618         may_write = isl_union_set_universe(may_write);
5619         may_write = isl_union_set_apply(may_write,
5620                                     isl_union_map_copy(gen->prog->to_outer));
5621         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
5622         local = isl_union_set_copy(copy_out);
5623
5624         for (i = 0; i < gen->prog->n_array; ++i) {
5625                 isl_space *space;
5626                 isl_set *write_i;
5627                 int empty;
5628
5629                 space = isl_space_copy(gen->prog->array[i].space);
5630
5631                 if (gen->prog->array[i].local) {
5632                         isl_set *set;
5633
5634                         set = isl_set_universe(space);
5635                         local = isl_union_set_add_set(local, set);
5636                         continue;
5637                 }
5638
5639                 write_i = isl_union_set_extract_set(may_write, space);
5640                 empty = isl_set_plain_is_empty(write_i);
5641                 isl_set_free(write_i);
5642                 if (empty)
5643                         continue;
5644
5645                 write_i = isl_set_copy(gen->prog->array[i].extent);
5646                 copy_out = isl_union_set_add_set(copy_out, write_i);
5647         }
5648         isl_union_set_free(may_write);
5649
5650         copy_out = isl_union_set_intersect_params(copy_out,
5651                                             isl_set_copy(gen->prog->context));
5652
5653         gen->prog->copy_out = isl_union_set_copy(copy_out);
5654
5655         copy_out = isl_union_set_apply(copy_out,
5656                                     isl_union_map_copy(gen->prog->to_inner));
5657         not_written = isl_union_set_subtract(copy_out, must_write);
5658
5659         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
5660         local_uninitialized = isl_union_map_copy(uninitialized);
5661
5662         local = isl_union_set_apply(local,
5663                                     isl_union_map_copy(gen->prog->to_inner));
5664         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
5665                                                             local);
5666         if (!isl_union_map_is_empty(local_uninitialized)) {
5667                 fprintf(stderr,
5668                         "possibly uninitialized reads (not copied in):\n");
5669                 isl_union_map_dump(local_uninitialized);
5670         }
5671         uninitialized = isl_union_map_subtract(uninitialized,
5672                                                 local_uninitialized);
5673         copy_in = isl_union_map_range(uninitialized);
5674         copy_in = isl_union_set_union(copy_in, not_written);
5675         copy_in = isl_union_set_apply(copy_in,
5676                                     isl_union_map_copy(gen->prog->to_outer));
5677
5678         gen->prog->copy_in = copy_in;
5679 }
5680
5681 /* Internal data structure for extract_access.
5682  * "next_access" points to the end of a linked list that is extended
5683  * by extract_access.
5684  * "single_expression" is set if the access expressions belong to
5685  * an expression statement (i.e., a statement without internal control).
5686  * "any_to_outer" maps all intermediate arrays to their outer arrays.
5687  */
5688 struct ppcg_extract_access_data {
5689         struct gpu_stmt_access **next_access;
5690         int single_expression;
5691         isl_union_map *any_to_outer;
5692 };
5693
5694 /* Extract a gpu_stmt_access from "expr", append it to the list
5695  * that ends in *data->next_access and update the end of the list.
5696  * If the access expression performs a write, then it is considered
5697  * exact only if it appears in a single expression statement and
5698  * if its may access relation is equal to its must access relation.
5699  *
5700  * The combined set of may accesses may be union if member accesses
5701  * are involved, but the entire set is derived from a single reference and
5702  * therefore from a single index expression.  These accesses therefore
5703  * all map to the same outer array.
5704  */
5705 static int extract_access(__isl_keep pet_expr *expr, void *user)
5706 {
5707         struct ppcg_extract_access_data *data = user;
5708         isl_union_map *may, *tagged;
5709         struct gpu_stmt_access *access;
5710         isl_ctx *ctx;
5711         isl_multi_pw_aff *index;
5712
5713         may = pet_expr_access_get_may_read(expr);
5714         may = isl_union_map_union(may, pet_expr_access_get_may_write(expr));
5715         may = isl_union_map_apply_range(may,
5716                                         isl_union_map_copy(data->any_to_outer));
5717         ctx = isl_union_map_get_ctx(may);
5718         access = isl_alloc_type(ctx, struct gpu_stmt_access);
5719         assert(access);
5720         access->next = NULL;
5721         access->read = pet_expr_access_is_read(expr);
5722         access->write = pet_expr_access_is_write(expr);
5723         tagged = pet_expr_access_get_tagged_may_read(expr);
5724         tagged = isl_union_map_union(tagged,
5725                                 pet_expr_access_get_tagged_may_write(expr));
5726         tagged = isl_union_map_apply_range(tagged,
5727                                         isl_union_map_copy(data->any_to_outer));
5728         access->tagged_access = isl_map_from_union_map(tagged);
5729         if (!access->write) {
5730                 access->exact_write = 1;
5731         } else if (!data->single_expression) {
5732                 access->exact_write = 0;
5733         } else {
5734                 isl_union_map *must;
5735                 must = pet_expr_access_get_must_write(expr);
5736                 access->exact_write = isl_union_map_is_equal(must, may);
5737                 isl_union_map_free(must);
5738         }
5739         access->access = isl_map_from_union_map(may);
5740         index = pet_expr_access_get_index(expr);
5741         access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
5742         isl_multi_pw_aff_free(index);
5743         access->ref_id = pet_expr_access_get_ref_id(expr);
5744         access->group = -1;
5745
5746         *data->next_access = access;
5747         data->next_access = &(*data->next_access)->next;
5748
5749         return 0;
5750 }
5751
5752 /* Construct a linked list of gpu_stmt_access objects,
5753  * one for each access expression in the statement body.
5754  * "any_to_outer" maps all intermediate arrays to their outer arrays.
5755  */
5756 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt,
5757         __isl_keep isl_union_map *any_to_outer)
5758 {
5759         struct ppcg_extract_access_data data;
5760
5761         stmt->accesses = NULL;
5762         data.next_access = &stmt->accesses;
5763         data.single_expression =
5764                 pet_tree_get_type(stmt->stmt->body) == pet_tree_expr;
5765         data.any_to_outer = any_to_outer;
5766         pet_tree_foreach_access_expr(stmt->stmt->body, &extract_access, &data);
5767 }
5768
5769 /* Return an array of gpu_stmt representing the statements in "scop".
5770  */
5771 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5772         __isl_keep isl_set *context, __isl_keep isl_union_map *any_to_outer)
5773 {
5774         int i;
5775         struct gpu_stmt *stmts;
5776
5777         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt);
5778         if (!stmts)
5779                 return NULL;
5780
5781         for (i = 0; i < scop->pet->n_stmt; ++i) {
5782                 struct gpu_stmt *s = &stmts[i];
5783
5784                 s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain);
5785                 s->stmt = scop->pet->stmts[i];
5786                 pet_stmt_extract_accesses(s, any_to_outer);
5787         }
5788
5789         return stmts;
5790 }
5791
5792 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5793  */
5794 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5795 {
5796         struct gpu_gen *gen = user;
5797
5798         return gen->print(p, gen->prog, gen->tree, &gen->types,
5799                             gen->print_user);
5800 }
5801
5802 /* Generate CUDA code for "scop" and print it to "p".
5803  * After generating an AST for the transformed scop as explained below,
5804  * we call "gen->print" to print the AST in the desired output format
5805  * to "p".
5806  *
5807  * If it turns out that it does not make sense to generate GPU code,
5808  * then we generate CPU code instead.
5809  *
5810  * The GPU code is generated in a context where at least one
5811  * statement instance is executed.  The corresponding guard (if any) is printed
5812  * around the entire generated GPU code, except for the declaration
5813  * of the arrays that are visible outside of the scop and that therefore
5814  * cannot be declared inside the body of any possible guard.
5815  *
5816  * We first compute a schedule that respects the dependences
5817  * of the original program and select the outermost band
5818  * of tilable dimensions that has at least one parallel loop.
5819  * We then have three blocks of dimensions
5820  *
5821  *      H               B                       G
5822  *
5823  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5824  * in
5825  *
5826  *      H       T               P               G
5827  *
5828  * For each iteration of the T loop and for each array, we compute
5829  * the array elements accessed by that iteration, construct a rectangular
5830  * box around it and shift it to the origin.  The result is used
5831  * as shared memory for the array.
5832  *
5833  * We then split off at most 2 parallel loops from the T loops and
5834  * at most 3 parallel loops from the P loops
5835  *
5836  *      H       T1      T2      P1      P2      G
5837  *
5838  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5839  * according to "grid"/"block" sizes.
5840  *
5841  *      H       T1T T1P T2      P1T P1P P2      G
5842  *
5843  * Finally, the T1P and P1P iterators are equated to the block and
5844  * thread dimensions respectively and so are effectively removed.
5845  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5846  * are run on the GPU.
5847  *
5848  * Code is generated in three stages.  We first generate code for the
5849  * host (the H loops), with iterators h%d.  Then, for each leaf node
5850  * of the resulting AST, we generate code for the shared loops (up to
5851  * and including T2), with iterators g%d and after equating the H loops
5852  * to h%d parameters and the T1P loops to the block dimensions.
5853  * Finally, we generate code for the remaining loops in a similar fashion.
5854  */
5855 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5856         struct gpu_gen *gen, struct ppcg_scop *scop,
5857         struct ppcg_options *options)
5858 {
5859         struct gpu_prog *prog;
5860         isl_ctx *ctx;
5861         isl_set *context, *guard;
5862
5863         if (!scop)
5864                 return isl_printer_free(p);
5865
5866         ctx = isl_printer_get_ctx(p);
5867         prog = gpu_prog_alloc(ctx, scop);
5868         if (!prog)
5869                 return isl_printer_free(p);
5870
5871         context = isl_set_copy(prog->context);
5872         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5873         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5874
5875         gen->prog = prog;
5876         gen->any_parallelism = 0;
5877         compute_schedule(gen);
5878
5879         if (!gen->any_parallelism) {
5880                 isl_set_free(context);
5881                 isl_set_free(guard);
5882                 p = print_cpu(p, scop, options);
5883         } else {
5884                 compute_copy_in_and_out(gen);
5885                 gen->tree = generate_host_code(gen);
5886                 p = ppcg_print_exposed_declarations(p, prog->scop);
5887                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5888                 isl_ast_node_free(gen->tree);
5889         }
5890
5891         isl_union_map_free(gen->sched);
5892
5893         gpu_prog_free(prog);
5894
5895         return p;
5896 }
5897
5898 /* Wrapper around generate for use as a ppcg_transform callback.
5899  */
5900 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5901         struct ppcg_scop *scop, void *user)
5902 {
5903         struct gpu_gen *gen = user;
5904
5905         return generate(p, gen, scop, gen->options);
5906 }
5907
5908 /* Transform the code in the file called "input" by replacing
5909  * all scops by corresponding GPU code and write the results to "out".
5910  */
5911 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5912         struct ppcg_options *options,
5913         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5914                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5915                 struct gpu_types *types, void *user), void *user)
5916 {
5917         struct gpu_gen gen;
5918         int r;
5919         int i;
5920
5921         gen.ctx = ctx;
5922         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5923         gen.options = options;
5924         gen.kernel_id = 0;
5925         gen.print = print;
5926         gen.print_user = user;
5927         gen.types.n = 0;
5928         gen.types.name = NULL;
5929
5930         if (options->debug->dump_sizes) {
5931                 isl_space *space = isl_space_params_alloc(ctx, 0);
5932                 gen.used_sizes = isl_union_map_empty(space);
5933         }
5934
5935         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5936
5937         if (options->debug->dump_sizes) {
5938                 isl_union_map_dump(gen.used_sizes);
5939                 isl_union_map_free(gen.used_sizes);
5940         }
5941
5942         isl_union_map_free(gen.sizes);
5943         for (i = 0; i < gen.types.n; ++i)
5944                 free(gen.types.name[i]);
5945         free(gen.types.name);
5946
5947         return r;
5948 }
5949
5950 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5951 {
5952         struct gpu_prog *prog;
5953         isl_space *space;
5954         isl_map *id;
5955
5956         if (!scop)
5957                 return NULL;
5958
5959         prog = isl_calloc_type(ctx, struct gpu_prog);
5960         assert(prog);
5961
5962         prog->ctx = ctx;
5963         prog->scop = scop;
5964         prog->context = isl_set_copy(scop->context);
5965         prog->n_stmts = scop->pet->n_stmt;
5966         prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet);
5967         prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer);
5968         space = isl_union_map_get_space(prog->any_to_outer);
5969         space = isl_space_set_from_params(space);
5970         space = isl_space_add_dims(space, isl_dim_set, 1);
5971         space = isl_space_map_from_set(space);
5972         id = isl_map_identity(space);
5973         prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id);
5974         prog->stmts = extract_stmts(ctx, scop,
5975                                         prog->context, prog->any_to_outer);
5976         prog->read = isl_union_map_copy(scop->reads);
5977         prog->may_write = isl_union_map_copy(scop->may_writes);
5978         prog->must_write = isl_union_map_copy(scop->must_writes);
5979         prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet);
5980         prog->to_outer = isl_union_map_copy(prog->to_inner);
5981         prog->to_outer = isl_union_map_reverse(prog->to_outer);
5982
5983         if (!prog->stmts)
5984                 return gpu_prog_free(prog);
5985
5986         if (collect_array_info(prog) < 0)
5987                 return gpu_prog_free(prog);
5988
5989         return prog;
5990 }
5991
5992 void *gpu_prog_free(struct gpu_prog *prog)
5993 {
5994         if (!prog)
5995                 return NULL;
5996         free_array_info(prog);
5997         free_stmts(prog->stmts, prog->n_stmts);
5998         isl_union_map_free(prog->any_to_outer);
5999         isl_union_map_free(prog->to_outer);
6000         isl_union_map_free(prog->to_inner);
6001         isl_union_set_free(prog->copy_in);
6002         isl_union_set_free(prog->copy_out);
6003         isl_union_map_free(prog->read);
6004         isl_union_map_free(prog->may_write);
6005         isl_union_map_free(prog->must_write);
6006         isl_union_map_free(prog->array_order);
6007         isl_set_free(prog->context);
6008         free(prog);
6009         return NULL;
6010 }