gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/schedule.h>
  23 #include <isl/schedule_node.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
/* Bound information for a single index of an array tile
 * (a gpu_array_tile keeps one gpu_array_bound per array index,
 * representing the lower bound and the size for that index).
 *
 * The fields stride and shift only contain valid information
 * if shift != NULL.
 * If so, they express that the current index is such that if you add shift,
 * then the result is always a multiple of stride.
 * Let D represent the initial shared_len dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 *      D -> [b]
 */
struct gpu_array_bound {
        /* Size of the tile in this index. */
        isl_val *size;
        /* Lower bound of the tile in this index. */
        isl_aff *lb;

        /* Only meaningful if shift != NULL (see above). */
        isl_val *stride;
        isl_aff *shift;
};
  49
/* A tile of an array.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 *      and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * shared/private memory tile and is of the form
 *
 *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial shared_len dimensions
 * of the computed schedule.
 */
struct gpu_array_tile {
        /* Number of array indices, i.e., number of elements in "bound". */
        int n;
        /* Per-index bound information. */
        struct gpu_array_bound *bound;
        /* Mapping from global array elements to tile elements. */
        isl_multi_aff *tiling;
};
  69
  70 struct gpu_array_info;
  71
/* A group of array references in a kernel that should be handled together.
 * If private_tile is not NULL, then it is mapped to registers.
 * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 */
struct gpu_array_ref_group {
        /* The references in this group access this array. */
        struct gpu_array_info *array;
        /* Position of this group in the list of reference groups of array. */
        int nr;

        /* The following fields are used during the construction of the groups.
         * access is the combined access relation relative to the shared
         * memory tiling.  In particular, the domain of the map corresponds
         * to the first shared_len dimensions of the computed schedule.
         * write is set if any access in the group is a write.
         * exact_write is set if all writes are definite writes.
         * slice is set if there is at least one access in the group
         * that refers to more than one element.
         */
        isl_map *access;
        int write;
        int exact_write;
        int slice;

        /* The shared memory tile, NULL if none. */
        struct gpu_array_tile *shared_tile;

        /* The private memory tile, NULL if none. */
        struct gpu_array_tile *private_tile;

        /* References in this group; point to elements of a linked list.
         * n_ref is the number of elements in refs.
         */
        int n_ref;
        struct gpu_stmt_access **refs;

        /* Last shared memory tile dimension that affects tile of this group. */
        int last_shared;
};
 110
/* State kept while generating GPU code: the isl context and the options,
 * the input program, the generated AST, the user specified and
 * effectively used sizes, and the schedule and size information
 * of the kernel that is currently being handled.
 */
struct gpu_gen {
        isl_ctx *ctx;
        struct ppcg_options *options;

        /* Callback for printing of AST in appropriate format. */
        __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
                struct gpu_types *types, void *user);
        /* NOTE(review): presumably passed to "print" as its "user"
         * argument -- confirm against the caller.
         */
        void *print_user;

        struct gpu_prog *prog;
        /* The generated AST. */
        isl_ast_node *tree;

        /* The sequence of types for which a definition has been printed. */
        struct gpu_types types;

        /* User specified tile, grid and block sizes for each kernel */
        isl_union_map *sizes;

        /* Effectively used tile, grid and block sizes for each kernel */
        isl_union_map *used_sizes;

        /* Identifier of current kernel. */
        int kernel_id;
        /* Pointer to the current kernel. */
        struct ppcg_kernel *kernel;
        /* Does the computed schedule exhibit any parallelism? */
        int any_parallelism;

        /* First tile dimension. */
        int tile_first;
        /* Number of tile dimensions. */
        int tile_len;
        /* Number of initial parallel loops among tile dimensions. */
        int n_parallel;

        /* Number of dimensions determining shared memory. */
        int shared_len;

        /* Number of rows in the untiled schedule. */
        int untiled_len;
        /* Number of rows in the tiled schedule. */
        int tiled_len;
        /* Number of rows in schedule after tiling/wrapping over threads. */
        int thread_tiled_len;

        /* Global untiled schedule. */
        isl_union_map *sched;
        /* Local (per kernel launch) tiled schedule. */
        isl_union_map *tiled_sched;
        /* Local schedule per shared memory tile loop iteration. */
        isl_union_map *local_sched;

        /* Local tiled schedule projected onto the shared tile loops and
         * the loops that will be wrapped over the threads,
         * with all shared tile loops parametrized.
         */
        isl_union_map *shared_sched;
        /* Projects out the loops that will be wrapped over the threads
         * from shared_sched.
         */
        isl_union_map *shared_proj;

        /* A map that takes the range of shared_sched as input,
         * wraps the appropriate loops over the threads and then projects
         * out these loops.
         */
        isl_map *privatization;

        /* The array reference group corresponding to copy_sched. */
        struct gpu_array_ref_group *copy_group;

        /* Is any array in the current kernel marked force_private? */
        int any_force_private;

        /* First loop to unroll (or -1 if none) in the current part of the
         * schedule.
         */
        int first_unroll;

        /* Number of entries of grid_dim and block_dim that are in use;
         * set by read_grid_sizes and read_block_sizes to at most 2 and 3,
         * respectively.
         */
        int n_grid;
        int n_block;
        /* Note: in the input file, the sizes of the grid and the blocks
         * are specified in the order x, y, z, but internally, the sizes
         * are stored in reverse order, so that the last element always
         * refers to the x dimension.
         */
        int grid_dim[2];
        int block_dim[3];
        /* Tile size for each of the tile_len tile dimensions;
         * allocated by read_tile_sizes.
         */
        int *tile_size;
};
 203
 204 /* Print the name of the local copy of a given group of array references.
 205  */
 206 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 207         struct gpu_array_ref_group *group)
 208 {
 209         int global = 0;
 210
 211         if (group->private_tile)
 212                 p = isl_printer_print_str(p, "private_");
 213         else if (group->shared_tile)
 214                 p = isl_printer_print_str(p, "shared_");
 215         else
 216                 global = 1;
 217         p = isl_printer_print_str(p, group->array->name);
 218         if (!global && group->array->n_group > 1) {
 219                 p = isl_printer_print_str(p, "_");
 220                 p = isl_printer_print_int(p, group->nr);
 221         }
 222
 223         return p;
 224 }
 225
 226 /* Collect all references to the given array and store pointers to them
 227  * in array->refs.
 228  *
 229  * If the array contains structures, then there is no need to collect
 230  * the references since we will not be computing any reference groups.
 231  */
 232 static void collect_references(struct gpu_prog *prog,
 233         struct gpu_array_info *array)
 234 {
 235         int i;
 236         int n;
 237
 238         if (array->has_compound_element)
 239                 return;
 240
 241         n = 0;
 242         for (i = 0; i < prog->n_stmts; ++i) {
 243                 struct gpu_stmt *stmt = &prog->stmts[i];
 244                 struct gpu_stmt_access *access;
 245
 246                 for (access = stmt->accesses; access; access = access->next) {
 247                         const char *name;
 248                         name = isl_map_get_tuple_name(access->access,
 249                                                       isl_dim_out);
 250                         if (name && !strcmp(array->name, name))
 251                                 n++;
 252                 }
 253         }
 254
 255         array->n_ref = n;
 256         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 257         assert(array->refs);
 258
 259         n = 0;
 260         for (i = 0; i < prog->n_stmts; ++i) {
 261                 struct gpu_stmt *stmt = &prog->stmts[i];
 262                 struct gpu_stmt_access *access;
 263
 264                 for (access = stmt->accesses; access; access = access->next) {
 265                         const char *name;
 266                         name = isl_map_get_tuple_name(access->access,
 267                                                       isl_dim_out);
 268                         if (!name || strcmp(array->name, name))
 269                                 continue;
 270
 271                         array->refs[n++] = access;
 272                 }
 273         }
 274 }
 275
 276 /* Create a gpu_array_tile for an array of dimension "n_index".
 277  */
 278 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 279 {
 280         int i;
 281         struct gpu_array_tile *tile;
 282
 283         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 284         assert(tile);
 285
 286         tile->n = n_index;
 287
 288         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 289         assert(tile->bound);
 290
 291         for (i = 0; i < n_index; ++i) {
 292                 tile->bound[i].size = NULL;
 293                 tile->bound[i].lb = NULL;
 294                 tile->bound[i].stride = NULL;
 295                 tile->bound[i].shift = NULL;
 296         }
 297
 298         return tile;
 299 }
 300
 301 static void *free_tile(struct gpu_array_tile *tile)
 302 {
 303         int j;
 304
 305         if (!tile)
 306                 return NULL;
 307
 308         for (j = 0; j < tile->n; ++j) {
 309                 isl_val_free(tile->bound[j].size);
 310                 isl_val_free(tile->bound[j].stride);
 311                 isl_aff_free(tile->bound[j].lb);
 312                 isl_aff_free(tile->bound[j].shift);
 313         }
 314         free(tile->bound);
 315         isl_multi_aff_free(tile->tiling);
 316         free(tile);
 317
 318         return NULL;
 319 }
 320
 321 /* Compute and return the extent of "array", taking into account the set of
 322  * accessed elements.
 323  *
 324  * In particular, the extent in the outer dimension is taken
 325  * from "accessed", while the extents in the remaining dimensions
 326  * are taken from array->extent.
 327  *
 328  * The extent in the outer dimension cannot be taken from array->extent
 329  * because that may be unbounded.  Furthermore, even if it is bounded,
 330  * it may be larger than the piece of the array that is being accessed.
 331  */
 332 static __isl_give isl_set *compute_extent(struct pet_array *array,
 333         __isl_keep isl_set *accessed)
 334 {
 335         int n_index;
 336         isl_id *id;
 337         isl_set *outer;
 338         isl_set *extent;
 339
 340         extent = isl_set_copy(array->extent);
 341
 342         n_index = isl_set_dim(accessed, isl_dim_set);
 343         if (n_index == 0)
 344                 return extent;
 345
 346         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 347         outer = isl_set_copy(accessed);
 348         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 349         extent = isl_set_flat_product(outer, extent);
 350         id = isl_set_get_tuple_id(accessed);
 351         extent = isl_set_set_tuple_id(extent, id);
 352
 353         return extent;
 354 }
 355
 356 /* Is the array "array" being extracted a read-only scalar?
 357  *
 358  * That is, is "array" a scalar that is never possibly written to.
 359  * An array containing structures is never considered to be a scalar.
 360  */
 361 static int is_read_only_scalar(struct gpu_array_info *array,
 362         struct gpu_prog *prog)
 363 {
 364         isl_set *space;
 365         isl_union_map *write;
 366         int empty;
 367
 368         if (array->has_compound_element)
 369                 return 0;
 370         if (array->n_index != 0)
 371                 return 0;
 372
 373         write = isl_union_map_copy(prog->may_write);
 374         space = isl_set_universe(isl_space_copy(array->space));
 375         write = isl_union_map_intersect_range(write,
 376                                                 isl_union_set_from_set(space));
 377         empty = isl_union_map_is_empty(write);
 378         isl_union_map_free(write);
 379
 380         return empty;
 381 }
 382
 383 /* Compute bounds on the host array "pa" based on the corresponding
 384  * accessed elements in "arrays"
 385  * and collect all references to the array.
 386  * Store the results in "info".
 387  *
 388  * If the array is zero-dimensional and does not contain structures,
 389  * i.e., if the array is a scalar, we check whether it is read-only.
 390  * We also check whether the array is accessed at all.
 391  */
 392 static int extract_array_info(struct gpu_prog *prog,
 393         struct gpu_array_info *info, struct pet_array *pa,
 394         __isl_keep isl_union_set *arrays)
 395 {
 396         int i, empty;
 397         const char *name;
 398         int n_index;
 399         isl_pw_aff **bounds;
 400         isl_set *accessed, *extent;
 401
 402         n_index = isl_set_dim(pa->extent, isl_dim_set);
 403         name = isl_set_get_tuple_name(pa->extent);
 404         bounds = isl_alloc_array(prog->ctx, isl_pw_aff *, n_index);
 405         if (!bounds)
 406                 return -1;
 407
 408         info->space = isl_set_get_space(pa->extent);
 409         info->name = strdup(name);
 410         info->n_index = n_index;
 411         info->bound = bounds;
 412         info->linearize = prog->scop->options->linearize_device_arrays;
 413
 414         info->type = strdup(pa->element_type);
 415         info->size = pa->element_size;
 416         info->local = pa->declared && !pa->exposed;
 417         info->has_compound_element = pa->element_is_record;
 418         info->read_only_scalar = is_read_only_scalar(info, prog);
 419
 420         accessed = isl_union_set_extract_set(arrays,
 421                                             isl_space_copy(info->space));
 422         empty = isl_set_is_empty(accessed);
 423         extent = compute_extent(pa, accessed);
 424         isl_set_free(accessed);
 425         info->extent = extent;
 426         if (empty < 0)
 427                 return -1;
 428         info->accessed = !empty;
 429         for (i = 0; i < n_index; ++i) {
 430                 isl_set *dom;
 431                 isl_local_space *ls;
 432                 isl_aff *one;
 433                 isl_pw_aff *bound;
 434
 435                 dom = isl_set_copy(extent);
 436                 dom = isl_set_project_out(dom, isl_dim_set, i + 1,
 437                                             n_index - (i + 1));
 438                 dom = isl_set_project_out(dom, isl_dim_set, 0, i);
 439                 if (!isl_set_dim_has_upper_bound(dom, isl_dim_set, 0)) {
 440                         fprintf(stderr, "unable to determine extent of '%s' "
 441                                 "in dimension %d\n", info->name, i);
 442                         dom = isl_set_free(dom);
 443                 }
 444                 bound = isl_set_dim_max(dom, 0);
 445                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 446                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 447                 one = isl_aff_zero_on_domain(ls);
 448                 one = isl_aff_add_constant_si(one, 1);
 449                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 450                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 451
 452                 bounds[i] = bound;
 453                 if (!isl_pw_aff_is_cst(bound))
 454                         info->linearize = 1;
 455         }
 456
 457         collect_references(prog, info);
 458
 459         return 0;
 460 }
 461
 462 /* Remove independence from the order constraints "order" on array "array".
 463  * Since the pairs of iterations in the filter relation of an independence
 464  * are guaranteed to be completely independent by the user, there is
 465  * no need to ensure that live ranges are ordered along thong pairs.
 466  * We make an exception for local variables, though, as the independence
 467  * guarantee does not apply to those.
 468  *
 469  * The order constraints are used in two places.
 470  * Those on scalars are used in check_scalar_live_ranges to check if
 471  * we need to force the scalar to be private.  Any non-local scalar
 472  * should not be forced scalar if it only appears in independent loops.
 473  * Those on non-scalars are added to the coincidence constraints
 474  * in compute_schedule because we do not support any array expansion.
 475  * Accesses to non-local arrays should not prevent a loop from being
 476  * considered coincident so we should indeed remove those constraints
 477  * from the order constraints.
 478  */
 479 static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
 480         struct gpu_array_info *array, __isl_take isl_union_map *order)
 481 {
 482         int i;
 483
 484         for (i = 0; i < prog->scop->pet->n_independence; ++i) {
 485                 struct pet_independence *pi = prog->scop->pet->independences[i];
 486                 if (isl_union_set_contains(pi->local, array->space))
 487                         continue;
 488
 489                 order = isl_union_map_subtract(order,
 490                                                 isl_union_map_copy(pi->filter));
 491         }
 492
 493         return order;
 494 }
 495
 496 /* For each array in "prog", store the (untagged) order dependences
 497  * derived from the array in array->dep_order.
 498  * In particular, consider all references that access the given array
 499  * and take the order dependences that have one of these references
 500  * as source.  (Since an order dependence relates two references to
 501  * the same array, the target of these order dependences will also
 502  * be one of these references.)
 503  * Additionally, store the union of these array->dep_order relations
 504  * for all non-scalar arrays in prog->array_order.
 505  */
 506 void collect_order_dependences(struct gpu_prog *prog)
 507 {
 508         int i;
 509         isl_space *space;
 510         isl_union_map *accesses;
 511
 512         space = isl_union_map_get_space(prog->read);
 513         prog->array_order = isl_union_map_empty(space);
 514
 515         accesses = isl_union_map_copy(prog->scop->tagged_reads);
 516         accesses = isl_union_map_union(accesses,
 517                             isl_union_map_copy(prog->scop->tagged_may_writes));
 518         accesses = isl_union_map_universe(accesses);
 519         accesses = isl_union_map_apply_range(accesses,
 520                                             isl_union_map_copy(prog->to_outer));
 521
 522         for (i = 0; i < prog->n_array; ++i) {
 523                 struct gpu_array_info *array = &prog->array[i];
 524                 isl_set *set;
 525                 isl_union_set *uset;
 526                 isl_union_map *order;
 527
 528                 set = isl_set_universe(isl_space_copy(array->space));
 529                 uset = isl_union_set_from_set(set);
 530                 uset = isl_union_map_domain(
 531                     isl_union_map_intersect_range(isl_union_map_copy(accesses),
 532                                                     uset));
 533                 order = isl_union_map_copy(prog->scop->tagged_dep_order);
 534                 order = isl_union_map_intersect_domain(order, uset);
 535                 order = isl_union_map_zip(order);
 536                 order = isl_union_set_unwrap(isl_union_map_domain(order));
 537                 order = remove_independences(prog, array, order);
 538                 array->dep_order = order;
 539
 540                 if (gpu_array_is_scalar(array) && !array->has_compound_element)
 541                         continue;
 542
 543                 prog->array_order = isl_union_map_union(prog->array_order,
 544                                         isl_union_map_copy(array->dep_order));
 545         }
 546
 547         isl_union_map_free(accesses);
 548 }
 549
 550 /* Construct a gpu_array_info for each array referenced by prog->scop and
 551  * collect them in prog->array.
 552  *
 553  * The sizes are based on the extents and the set of possibly accessed
 554  * elements by "prog".
 555  * If there are any member accesses involved, then they are first mapped
 556  * to the outer arrays of structs.
 557  *
 558  * If we are allowing live range reordering, then also set
 559  * the dep_order field.  Otherwise leave it NULL.
 560  */
 561 static int collect_array_info(struct gpu_prog *prog)
 562 {
 563         int i;
 564         int r = 0;
 565         isl_union_set *arrays;
 566
 567         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 568         arrays = isl_union_set_union(arrays,
 569                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 570
 571         arrays = isl_union_set_apply(arrays,
 572                                         isl_union_map_copy(prog->to_outer));
 573
 574         arrays = isl_union_set_coalesce(arrays);
 575
 576         prog->n_array = prog->scop->pet->n_array;
 577         prog->array = isl_calloc_array(prog->ctx,
 578                                      struct gpu_array_info, prog->n_array);
 579         assert(prog->array);
 580         for (i = 0; i < prog->scop->pet->n_array; ++i)
 581                 if (extract_array_info(prog, &prog->array[i],
 582                                         prog->scop->pet->arrays[i], arrays) < 0)
 583                         r = -1;
 584
 585         isl_union_set_free(arrays);
 586
 587         if (prog->scop->options->live_range_reordering)
 588                 collect_order_dependences(prog);
 589
 590         return r;
 591 }
 592
 593 static void free_array_info(struct gpu_prog *prog)
 594 {
 595         int i, j;
 596
 597         for (i = 0; i < prog->n_array; ++i) {
 598                 int n_index = prog->array[i].n_index;
 599                 free(prog->array[i].type);
 600                 free(prog->array[i].name);
 601                 for (j = 0; j < n_index; ++j)
 602                         isl_pw_aff_free(prog->array[i].bound[j]);
 603                 isl_space_free(prog->array[i].space);
 604                 isl_set_free(prog->array[i].extent);
 605                 free(prog->array[i].bound);
 606                 free(prog->array[i].refs);
 607                 isl_union_map_free(prog->array[i].dep_order);
 608         }
 609         free(prog->array);
 610 }
 611
 612 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 613  * as an array or through a pointer reference, but as a single data element.
 614  * At the moment, scalars are represented as zero-dimensional arrays.
 615  * Note that the single data element may be an entire structure.
 616  */
 617 int gpu_array_is_scalar(struct gpu_array_info *array)
 618 {
 619         return array->n_index == 0;
 620 }
 621
 622 /* Is "array" a read-only scalar?
 623  */
 624 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 625 {
 626         return array->read_only_scalar;
 627 }
 628
 629 /* Return the set of parameter values for which the array has a positive
 630  * size in all dimensions.
 631  * If the sizes are only valid for some parameter values, then those
 632  * constraints are also taken into account.
 633  */
 634 __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array)
 635 {
 636         int i;
 637         isl_space *space;
 638         isl_set *guard;
 639
 640         space = isl_space_params(isl_space_copy(array->space));
 641         guard = isl_set_universe(space);
 642
 643         for (i = 0; i < array->n_index; ++i) {
 644                 isl_pw_aff *bound;
 645                 isl_set *guard_i, *zero;
 646
 647                 bound = isl_pw_aff_copy(array->bound[i]);
 648                 guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound));
 649                 zero = isl_pw_aff_zero_set(bound);
 650                 guard_i = isl_set_subtract(guard_i, zero);
 651                 guard = isl_set_intersect(guard, guard_i);
 652         }
 653
 654         return guard;
 655 }
 656
/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space;
 * it is only set once a set with a matching name has been found.
 */
struct ppcg_extract_size_data {
        const char *type;
        isl_set *res;
};
 665
 666 /* This function is called for each set in a union_set.
 667  * If the name of the set matches data->type, we store the
 668  * set in data->res.
 669  */
 670 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 671 {
 672         struct ppcg_extract_size_data *data = user;
 673         const char *name;
 674
 675         name = isl_set_get_tuple_name(size);
 676         if (name && !strcmp(name, data->type)) {
 677                 data->res = size;
 678                 return -1;
 679         }
 680
 681         isl_set_free(size);
 682         return 0;
 683 }
 684
 685 /* Given a union map { kernel[i] -> *[...] },
 686  * return the range in the space called "type" for the kernel with
 687  * sequence number "id".
 688  */
 689 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 690         const char *type, int id)
 691 {
 692         isl_space *space;
 693         isl_set *dom;
 694         isl_union_set *local_sizes;
 695         struct ppcg_extract_size_data data = { type, NULL };
 696
 697         if (!sizes)
 698                 return NULL;
 699
 700         space = isl_union_map_get_space(sizes);
 701         space = isl_space_set_from_params(space);
 702         space = isl_space_add_dims(space, isl_dim_set, 1);
 703         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 704         dom = isl_set_universe(space);
 705         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 706
 707         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 708                                         isl_union_map_copy(sizes));
 709         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 710         isl_union_set_free(local_sizes);
 711         return data.res;
 712 }
 713
 714 /* Given a singleton set, extract the first (at most *len) elements
 715  * of the single integer tuple into *sizes and update *len if needed.
 716  */
 717 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 718 {
 719         int i;
 720         int dim;
 721
 722         if (!set)
 723                 return;
 724
 725         dim = isl_set_dim(set, isl_dim_set);
 726         if (dim < *len)
 727                 *len = dim;
 728
 729         for (i = 0; i < *len; ++i) {
 730                 isl_val *v;
 731
 732                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 733                 assert(v);
 734
 735                 sizes[i] = isl_val_get_num_si(v);
 736                 isl_val_free(v);
 737         }
 738
 739         isl_set_free(set);
 740 }
 741
 742 /* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes,
 743  * if the option debug->dump_sizes is set.
 744  */
 745 static void set_used_sizes(struct gpu_gen *gen, const char *type, int id,
 746         int *sizes, int len)
 747 {
 748         int i;
 749         isl_space *space;
 750         isl_map *map;
 751
 752         if (!gen->options->debug->dump_sizes)
 753                 return;
 754
 755         space = isl_union_map_get_space(gen->used_sizes);
 756         space = isl_space_set_from_params(space);
 757         space = isl_space_add_dims(space, isl_dim_set, 1);
 758         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 759         space = isl_space_from_domain(space);
 760         space = isl_space_add_dims(space, isl_dim_out, len);
 761         space = isl_space_set_tuple_name(space, isl_dim_out, type);
 762
 763         map = isl_map_universe(space);
 764         map = isl_map_fix_si(map, isl_dim_in, 0, id);
 765         for (i = 0; i < len; ++i)
 766                 map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]);
 767
 768         gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map);
 769 }
 770
 771 /* Extract user specified "tile" sizes from the "sizes" command line option,
 772  * defaulting to option->tile_size in each dimension.
 773  * Add the effectively used sizes to gen->used_sizes.
 774  */
 775 static void read_tile_sizes(struct gpu_gen *gen)
 776 {
 777         int n;
 778         isl_set *size;
 779
 780         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 781         assert(gen->tile_size);
 782         for (n = 0; n < gen->tile_len; ++n)
 783                 gen->tile_size[n] = gen->options->tile_size;
 784
 785         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 786         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 787         set_used_sizes(gen, "tile", gen->kernel_id,
 788                         gen->tile_size, gen->tile_len);
 789
 790         if (gen->n_parallel > gen->tile_len)
 791                 gen->n_parallel = gen->tile_len;
 792 }
 793
 794 /* Extract user specified "block" sizes from the "sizes" command line option,
 795  * after filling in some potentially useful defaults.
 796  * Add the effectively used sizes to gen->used_sizes.
 797  */
 798 static void read_block_sizes(struct gpu_gen *gen)
 799 {
 800         int n;
 801         isl_set *size;
 802
 803         n = gen->n_parallel;
 804         gen->n_block = (n <= 3) ? n : 3;
 805         switch (gen->n_block) {
 806         case 1:
 807                 gen->block_dim[0] = 512;
 808                 break;
 809         case 2:
 810                 gen->block_dim[0] = 32;
 811                 gen->block_dim[1] = 16;
 812                 break;
 813         default:
 814                 gen->block_dim[0] = 32;
 815                 gen->block_dim[1] = 4;
 816                 gen->block_dim[2] = 4;
 817                 break;
 818         }
 819
 820         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 821         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 822         set_used_sizes(gen, "block", gen->kernel_id,
 823                         gen->block_dim, gen->n_block);
 824 }
 825
 826 /* Extract user specified "grid" sizes from the "sizes" command line option,
 827  * after filling in some potentially useful defaults.
 828  * Add the effectively used sizes to gen->used_sizes.
 829  */
 830 static void read_grid_sizes(struct gpu_gen *gen)
 831 {
 832         int n = gen->n_parallel;
 833         isl_set *size;
 834
 835         gen->n_grid = (n <= 2) ? n : 2;
 836         switch (gen->n_grid) {
 837         case 1:
 838                 gen->grid_dim[0] = 32768;
 839                 break;
 840         default:
 841                 gen->grid_dim[0] = 256;
 842                 gen->grid_dim[1] = 256;
 843                 break;
 844         }
 845
 846         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 847         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 848         set_used_sizes(gen, "grid", gen->kernel_id, gen->grid_dim, gen->n_grid);
 849 }
 850
 851 /* Extract user specified sizes from the "sizes" command line option
 852  * after filling in some potentially useful defaults.
 853  */
 854 static void read_sizes(struct gpu_gen *gen)
 855 {
 856         read_tile_sizes(gen);
 857         read_block_sizes(gen);
 858         read_grid_sizes(gen);
 859 }
 860
 861 static void *free_stmts(struct gpu_stmt *stmts, int n)
 862 {
 863         int i;
 864
 865         if (!stmts)
 866                 return NULL;
 867
 868         for (i = 0; i < n; ++i) {
 869                 struct gpu_stmt_access *access, *next;
 870
 871                 for (access = stmts[i].accesses; access; access = next) {
 872                         next = access->next;
 873                         isl_id_free(access->ref_id);
 874                         isl_map_free(access->access);
 875                         isl_map_free(access->tagged_access);
 876                         free(access);
 877                 }
 878
 879                 isl_id_free(stmts[i].id);
 880         }
 881         free(stmts);
 882
 883         return NULL;
 884 }
 885
 886 /* Construct a map from a domain of dimensionality "len"
 887  * to a domain of dimensionality "len" + "tile_len" that tiles
 888  * the "tile_len" coordinates starting at "first".
 889  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 890  * "dim" prescribes the parameters.
 891  */
 892 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 893         int first, int tile_len, int *tile_size)
 894 {
 895         int i;
 896         isl_basic_map *bmap;
 897         isl_constraint *c;
 898         isl_local_space *ls;
 899
 900         dim = isl_space_add_dims(dim, isl_dim_in, len);
 901         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 902         bmap = isl_basic_map_universe(isl_space_copy(dim));
 903         ls = isl_local_space_from_space(dim);
 904
 905         for (i = 0; i < len - tile_len; ++i) {
 906                 int j = i < first ? i : i + tile_len;
 907                 int k = i < first ? i : i + 2 * tile_len;
 908
 909                 c = isl_equality_alloc(isl_local_space_copy(ls));
 910                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 911                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 912                 bmap = isl_basic_map_add_constraint(bmap, c);
 913         }
 914
 915         for (i = 0; i < tile_len; ++i) {
 916                 c = isl_equality_alloc(isl_local_space_copy(ls));
 917                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 918                                                 first + i, -1);
 919                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 920                                                 first + i, tile_size[i]);
 921                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 922                                                 first + i + tile_len, 1);
 923                 bmap = isl_basic_map_add_constraint(bmap, c);
 924
 925                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 926                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 927                                                    first + i + tile_len, 1);
 928                 bmap = isl_basic_map_add_constraint(bmap, c);
 929
 930                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 931                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 932                                                    first + i + tile_len, -1);
 933                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 934                 bmap = isl_basic_map_add_constraint(bmap, c);
 935         }
 936
 937         isl_local_space_free(ls);
 938
 939         return isl_map_from_basic_map(bmap);
 940 }
 941
 942 /* Construct a map from a domain of dimensionality "len"
 943  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 944  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 945  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 946  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 947  * that are projected out at the end.
 948  * "dim" prescribes the parameters.
 949  */
 950 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 951         int first, int wrap_len, int *wrap_size)
 952 {
 953         int i;
 954         isl_basic_map *bmap;
 955         isl_constraint *c;
 956         isl_local_space *ls;
 957
 958         dim = isl_space_add_dims(dim, isl_dim_in, len);
 959         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 960         bmap = isl_basic_map_universe(isl_space_copy(dim));
 961         ls = isl_local_space_from_space(dim);
 962
 963         for (i = 0; i < len; ++i) {
 964                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 965
 966                 c = isl_equality_alloc(isl_local_space_copy(ls));
 967                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 968                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 969                 bmap = isl_basic_map_add_constraint(bmap, c);
 970         }
 971
 972         for (i = 0; i < wrap_len; ++i) {
 973                 c = isl_equality_alloc(isl_local_space_copy(ls));
 974                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 975                                                     first + i, -1);
 976                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 977                                                     first + wrap_len + i, 1);
 978                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 979                                     first + 2 * wrap_len + i, wrap_size[i]);
 980                 bmap = isl_basic_map_add_constraint(bmap, c);
 981
 982                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 983                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 984                                                     first + wrap_len + i, 1);
 985                 bmap = isl_basic_map_add_constraint(bmap, c);
 986
 987                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 988                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 989                                                     first + wrap_len + i, -1);
 990                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 991                 bmap = isl_basic_map_add_constraint(bmap, c);
 992         }
 993
 994         isl_local_space_free(ls);
 995
 996         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 997                                 first + 2 * wrap_len, wrap_len);
 998
 999         return isl_map_from_basic_map(bmap);
1000 }
1001
1002 /* Add parameters with identifiers "ids" to "set".
1003  */
1004 static __isl_give isl_set *add_params(__isl_take isl_set *set,
1005         __isl_keep isl_id_list *ids)
1006 {
1007         int i, n;
1008         unsigned nparam;
1009
1010         n = isl_id_list_n_id(ids);
1011
1012         nparam = isl_set_dim(set, isl_dim_param);
1013         set = isl_set_add_dims(set, isl_dim_param, n);
1014
1015         for (i = 0; i < n; ++i) {
1016                 isl_id *id;
1017
1018                 id = isl_id_list_get_id(ids, i);
1019                 set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
1020         }
1021
1022         return set;
1023 }
1024
1025 /* Equate the dimensions of "set" starting at "first" to
1026  * freshly created parameters with identifiers "ids".
1027  * The number of equated dimensions is equal to the number of elements in "ids".
1028  */
1029 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
1030         int first, __isl_keep isl_id_list *ids)
1031 {
1032         int i, n;
1033         unsigned nparam;
1034
1035         nparam = isl_set_dim(set, isl_dim_param);
1036
1037         set = add_params(set, ids);
1038
1039         n = isl_id_list_n_id(ids);
1040         for (i = 0; i < n; ++i)
1041                 set = isl_set_equate(set, isl_dim_param, nparam + i,
1042                                         isl_dim_set, first + i);
1043
1044         return set;
1045 }
1046
1047 /* Given a parameter space "space", create a set of dimension "len"
1048  * of which the dimensions starting at "first" are equated to
1049  * freshly created parameters with identifiers "ids".
1050  */
1051 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
1052         int len, int first, __isl_keep isl_id_list *ids)
1053 {
1054         isl_set *set;
1055
1056         space = isl_space_set_from_params(space);
1057         space = isl_space_add_dims(space, isl_dim_set, len);
1058         set = isl_set_universe(space);
1059
1060         return parametrize(set, first, ids);
1061 }
1062
1063 /* Tile the B loops over the tile sizes and then tile/wrap
1064  * the T1 loops over the blocks.
1065  */
1066 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
1067         __isl_take isl_union_map *sched)
1068 {
1069         isl_space *dim;
1070         isl_map *tiling, *block_tiling;
1071
1072         dim = isl_union_map_get_space(sched);
1073         tiling = tile(isl_space_copy(dim), gen->untiled_len,
1074                       gen->tile_first, gen->tile_len, gen->tile_size);
1075
1076         if (gen->options->wrap)
1077                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
1078                                 gen->tile_first, gen->n_grid, gen->grid_dim);
1079         else
1080                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
1081                                 gen->tile_first, gen->n_grid, gen->grid_dim);
1082
1083         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
1084
1085         tiling = isl_map_apply_range(tiling, block_tiling);
1086
1087         sched = isl_union_map_apply_range(sched,
1088                                              isl_union_map_from_map(tiling));
1089
1090         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1091
1092         return sched;
1093 }
1094
1095 /* Equate the "T1P" iterators in the tiled schedule "sched"
1096  * to the block dimensions.
1097  */
1098 static __isl_give isl_union_map *parametrize_tiled_schedule(
1099         struct gpu_gen *gen, __isl_take isl_union_map *sched)
1100 {
1101         isl_space *dim;
1102         isl_set *par;
1103
1104         dim = isl_union_map_get_space(sched);
1105         par = parametrization(dim, gen->tiled_len,
1106                 gen->tile_first + gen->n_grid, gen->kernel->block_ids);
1107         sched = isl_union_map_intersect_range(sched,
1108                                                 isl_union_set_from_set(par));
1109
1110         return sched;
1111 }
1112
1113 /* Tile/wrap the P1 loops over the threads.
1114  */
1115 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
1116         __isl_take isl_union_map *sched)
1117 {
1118         isl_space *dim;
1119         isl_map *tiling;
1120         isl_set *par;
1121
1122         dim = isl_union_map_get_space(sched);
1123
1124         if (gen->options->wrap)
1125                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1126                                 gen->shared_len, gen->n_block, gen->block_dim);
1127         else
1128                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1129                                 gen->shared_len, gen->n_block, gen->block_dim);
1130         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1131
1132         sched = isl_union_map_apply_range(sched,
1133                                              isl_union_map_from_map(tiling));
1134
1135         par = parametrization(dim, gen->thread_tiled_len,
1136                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1137                 gen->kernel->thread_ids);
1138         sched = isl_union_map_intersect_range(sched,
1139                                                 isl_union_set_from_set(par));
1140
1141         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1142
1143         return sched;
1144 }
1145
1146 /* If the user asked for it, scale the shared memory tile loops
1147  * (T1T and T2) of "sched" by gen->tile_size[i].
1148  * If we are not performing "wrapping", then additionally scale the T1P
1149  * loops by gen->grid_dim[i].
1150  */
1151 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
1152         __isl_take isl_union_map *sched)
1153 {
1154         int i;
1155         isl_space *dim;
1156         isl_basic_map *scale;
1157         isl_constraint *c;
1158         isl_local_space *ls;
1159
1160         if (!gen->options->scale_tile_loops)
1161                 return sched;
1162
1163         dim = isl_union_map_get_space(sched);
1164         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1165         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1166         scale = isl_basic_map_universe(isl_space_copy(dim));
1167         ls = isl_local_space_from_space(dim);
1168
1169         for (i = 0; i < gen->tiled_len; ++i) {
1170                 int f = 1;
1171
1172                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1173                         f = gen->tile_size[i - gen->tile_first];
1174                         if (!gen->options->wrap)
1175                                 f *= gen->grid_dim[i - gen->tile_first];
1176                 } else if (i >= gen->tile_first + gen->n_grid &&
1177                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1178                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1179                 }
1180
1181                 c = isl_equality_alloc(isl_local_space_copy(ls));
1182                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1183                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1184                 scale = isl_basic_map_add_constraint(scale, c);
1185         }
1186
1187         isl_local_space_free(ls);
1188
1189         sched = isl_union_map_apply_range(sched,
1190                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1191
1192         return sched;
1193 }
1194
1195 /* If we are not performing "wrapping" and if the user asked for it,
1196  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1197  */
1198 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1199         __isl_take isl_union_map *sched)
1200 {
1201         int i;
1202         isl_space *dim;
1203         isl_basic_map *scale;
1204         isl_constraint *c;
1205         isl_local_space *ls;
1206
1207         if (gen->options->wrap)
1208                 return sched;
1209         if (!gen->options->scale_tile_loops)
1210                 return sched;
1211
1212         dim = isl_union_map_get_space(sched);
1213         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1214         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1215         scale = isl_basic_map_universe(isl_space_copy(dim));
1216         ls = isl_local_space_from_space(dim);
1217
1218         for (i = 0; i < gen->thread_tiled_len; ++i) {
1219                 int f = 1;
1220
1221                 if (i >= gen->shared_len &&
1222                     i < gen->shared_len + gen->n_block)
1223                         f = gen->block_dim[i - gen->shared_len];
1224
1225                 c = isl_equality_alloc(isl_local_space_copy(ls));
1226                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1227                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1228                 scale = isl_basic_map_add_constraint(scale, c);
1229         }
1230
1231         isl_local_space_free(ls);
1232
1233         sched = isl_union_map_apply_range(sched,
1234                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1235
1236         return sched;
1237 }
1238
1239 /* If we are not performing "wrapping" and if the user asked for it,
1240  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1241  */
1242 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1243         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1244 {
1245         int i;
1246         isl_space *dim;
1247         isl_basic_map *scale;
1248         isl_constraint *c;
1249         isl_local_space *ls;
1250
1251         if (gen->options->wrap)
1252                 return sched;
1253         if (!gen->options->scale_tile_loops)
1254                 return sched;
1255
1256         dim = isl_union_map_get_space(sched);
1257         dim = isl_space_add_dims(dim, isl_dim_in, len);
1258         dim = isl_space_add_dims(dim, isl_dim_out, len);
1259         scale = isl_basic_map_universe(isl_space_copy(dim));
1260         ls = isl_local_space_from_space(dim);
1261
1262         for (i = 0; i < len; ++i) {
1263                 int f = 1;
1264
1265                 if (i >= first && i < first + n_tile)
1266                         f = gen->kernel->block_dim[i - first];
1267
1268                 c = isl_equality_alloc(isl_local_space_copy(ls));
1269                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1270                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1271                 scale = isl_basic_map_add_constraint(scale, c);
1272         }
1273
1274         isl_local_space_free(ls);
1275
1276         sched = isl_union_map_apply_range(sched,
1277                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1278
1279         return sched;
1280 }
1281
1282 /* Add parameters p[i] with identifiers "ids" to "set",
1283  * with bounds to 0 <= p[i] < size[i].
1284  */
1285 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1286         int *size, __isl_keep isl_id_list *ids)
1287 {
1288         int i, len;
1289         unsigned nparam;
1290
1291         len = isl_id_list_n_id(ids);
1292         nparam = isl_set_dim(set, isl_dim_param);
1293         set = isl_set_add_dims(set, isl_dim_param, len);
1294
1295         for (i = 0; i < len; ++i) {
1296                 isl_id *id;
1297
1298                 id = isl_id_list_get_id(ids, i);
1299                 set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
1300                 set = isl_set_lower_bound_si(set, isl_dim_param, nparam + i, 0);
1301                 set = isl_set_upper_bound_si(set, isl_dim_param,
1302                                             nparam + i, size[i] - 1);
1303         }
1304
1305         return set;
1306 }
1307
1308 /* Add "len" parameters p[i] with identifiers "ids" and intersect "set"
1309  * with
1310  *
1311  *      { : 0 <= p[i] < size[i] }
1312  *
1313  * or an overapproximation.
1314  */
1315 static __isl_give isl_set *add_bounded_parameters_dynamic(
1316         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1317         __isl_keep isl_id_list *ids)
1318 {
1319         int i, len;
1320         unsigned nparam;
1321         isl_space *space;
1322         isl_local_space *ls;
1323
1324         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1325         nparam = isl_set_dim(set, isl_dim_param);
1326         set = isl_set_add_dims(set, isl_dim_param, len);
1327
1328         for (i = 0; i < len; ++i) {
1329                 isl_id *id;
1330
1331                 id = isl_id_list_get_id(ids, i);
1332                 set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
1333         }
1334
1335         space = isl_space_params(isl_set_get_space(set));
1336         ls = isl_local_space_from_space(space);
1337         for (i = 0; i < len; ++i) {
1338                 isl_pw_aff *param, *size_i, *zero;
1339                 isl_set *bound;
1340
1341                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1342                                                 isl_dim_param, nparam + i);
1343
1344                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1345                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1346                 bound = isl_set_from_basic_set(isl_set_simple_hull(bound));
1347                 set = isl_set_intersect_params(set, bound);
1348
1349                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1350                 bound = isl_pw_aff_ge_set(param, zero);
1351                 set = isl_set_intersect_params(set, bound);
1352         }
1353         isl_local_space_free(ls);
1354
1355         return set;
1356 }
1357
1358 /* Construct a map from an access to group->array to the corresponding
1359  * shared/private memory tile.
1360  * The map is of the form
1361  *
1362  *      { [D[i] -> A[a]] -> T[t] }
1363  *
1364  * where D represents the initial shared_len dimensions
1365  * of the computed schedule.
1366  */
1367 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1368 {
1369         struct gpu_array_tile *tile;
1370         isl_multi_aff *tiling;
1371
1372         tile = group->private_tile;
1373         if (!tile)
1374                 tile = group->shared_tile;
1375
1376         tiling = isl_multi_aff_copy(tile->tiling);
1377
1378         return isl_map_from_multi_aff(tiling);
1379 }
1380
1381 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1382  */
1383 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1384         unsigned pos)
1385 {
1386         isl_val *v;
1387         int fixed;
1388
1389         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1390         if (!v)
1391                 return -1;
1392         fixed = isl_val_is_int(v);
1393         isl_val_free(v);
1394
1395         return fixed;
1396 }
1397
1398 /* Given a schedule that iterates over all elements in a piece of an array,
1399  * perform tiling/wrapping over the threads.
1400  *
1401  * In particular, we tile the final iterators so that the final thread
1402  * dimension runs over the final array dimension.
1403  * However, if those final iterators have only a single iteration,
1404  * we try to tile earlier iterators instead.
1405  */
1406 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1407         __isl_take isl_map *sched)
1408 {
1409         isl_space *dim;
1410         isl_union_map *usched;
1411         isl_map *tiling;
1412         isl_set *par;
1413         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1414         int n_tile;
1415         int first;
1416
1417         n_tile = gen->kernel->n_block;
1418         if (n_tile > nvar) {
1419                 int i;
1420                 sched = isl_map_insert_dims(sched,
1421                                                 isl_dim_out, 0, n_tile - nvar);
1422                 for (i = 0; i < n_tile - nvar; ++i)
1423                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1424                 nvar = n_tile;
1425         }
1426
1427         first = nvar - n_tile;
1428
1429         for (; first > 0; first --)
1430                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1431                         break;
1432
1433         dim = isl_map_get_space(sched);
1434         dim = isl_space_params(dim);
1435         if (gen->options->wrap)
1436                 tiling = wrap(isl_space_copy(dim), nvar, first,
1437                                 n_tile, gen->kernel->block_dim);
1438         else
1439                 tiling = tile(isl_space_copy(dim), nvar, first,
1440                                 n_tile, gen->kernel->block_dim);
1441         sched = isl_map_apply_range(sched, tiling);
1442
1443         par = parametrization(dim, nvar + n_tile, first + n_tile,
1444                                 gen->kernel->thread_ids);
1445         sched = isl_map_intersect_range(sched, par);
1446
1447         usched = isl_union_map_from_map(sched);
1448         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1449                                          first, n_tile);
1450         sched = isl_map_from_union_map(usched);
1451
1452         return sched;
1453 }
1454
1455 /* Return the union of all read (read = 1) and/or write (write = 1)
1456  * access relations in the group.
1457  */
1458 static __isl_give isl_union_map *group_access_relation(
1459         struct gpu_array_ref_group *group, int read, int write)
1460 {
1461         int i;
1462         isl_union_map *access;
1463
1464         access = isl_union_map_empty(isl_map_get_space(group->access));
1465         for (i = 0; i < group->n_ref; ++i) {
1466                 isl_map *map_i;
1467
1468                 if (!((read && group->refs[i]->read) ||
1469                      (write && group->refs[i]->write)))
1470                         continue;
1471                 map_i = isl_map_copy(group->refs[i]->access);
1472                 access = isl_union_map_union(access,
1473                                             isl_union_map_from_map(map_i));
1474         }
1475
1476         return access;
1477 }
1478
1479 /* Return the union of all tagged access relations in the group.
1480  */
1481 static __isl_give isl_union_map *group_tagged_access_relation(
1482         struct gpu_array_ref_group *group)
1483 {
1484         int i;
1485         isl_union_map *access;
1486
1487         access = isl_union_map_empty(isl_map_get_space(group->access));
1488         for (i = 0; i < group->n_ref; ++i) {
1489                 isl_map *map_i;
1490
1491                 map_i = isl_map_copy(group->refs[i]->tagged_access);
1492                 access = isl_union_map_union(access,
1493                                             isl_union_map_from_map(map_i));
1494         }
1495
1496         return access;
1497 }
1498
1499 /* Return the extent of "array", recomputed from the bounds.
1500  * The recomputed extent may be simpler than the original extent.
1501  */
1502 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1503 {
1504         int i;
1505         isl_id *id;
1506         isl_space *space;
1507         isl_local_space *ls;
1508         isl_set *extent;
1509
1510         id = isl_set_get_tuple_id(array->extent);
1511         space = isl_set_get_space(array->extent);
1512         extent = isl_set_universe(isl_space_copy(space));
1513         ls = isl_local_space_from_space(space);
1514         for (i = 0; i < array->n_index; ++i) {
1515                 isl_pw_aff *bound;
1516                 isl_aff *aff;
1517                 isl_pw_aff *index;
1518                 isl_set *lt;
1519
1520                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1521
1522                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1523                                                 isl_dim_set, i);
1524                 index = isl_pw_aff_from_aff(aff);
1525                 bound = isl_pw_aff_copy(array->bound[i]);
1526                 bound = isl_pw_aff_from_range(bound);
1527                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1528                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1529                                                 isl_id_copy(id));
1530                 lt = isl_pw_aff_lt_set(index, bound);
1531                 extent = isl_set_intersect(extent, lt);
1532         }
1533         isl_local_space_free(ls);
1534         isl_id_free(id);
1535
1536         return extent;
1537 }
1538
1539 /* Return a map from the first shared_len dimensions of the computed
1540  * schedule to the array tile in
1541  * global memory that corresponds to the shared memory copy.
1542  *
1543  * In particular, return a map
1544  *
1545  *      { D[i] -> A[a] }
1546  *
1547  * with constraints
1548  *
1549  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1550  *
1551  * and
1552  *
1553  *      0 <= a <= array_size - 1                                        (2)
1554  *
1555  * Note that if some stride has been detected (i.e., when
1556  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1557  * to the shifted and scaled down version.
1558  *
1559  * Constraints (1) are obtained by mapping the size constraints on the
1560  * shared/private memory tile back to the access relation.
1561  * Constraints (2) are obtained from the (recomputed) extent.
1562  */
1563 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1564 {
1565         int i;
1566         int n_index = group->array->n_index;
1567         isl_map *tile;
1568         isl_space *space;
1569         isl_set *local;
1570         isl_set *extent;
1571
1572         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1573         space = isl_space_range(space);
1574         local = isl_set_universe(space);
1575         for (i = 0; i < n_index; ++i) {
1576                 isl_val *bound;
1577
1578                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1579                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1580                 bound = isl_val_sub_ui(bound, 1);
1581                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1582         }
1583         local = isl_set_preimage_multi_aff(local,
1584                                 isl_multi_aff_copy(group->shared_tile->tiling));
1585         tile = isl_set_unwrap(local);
1586         extent = array_extent(group->array);
1587         tile = isl_map_intersect_range(tile, extent);
1588
1589         return tile;
1590 }
1591
1592 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1593  * return the corresponding mapping from the AST schedule to
1594  * to the first shared_len dimensions of the schedule computed by PPCG.
1595  */
1596 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1597         __isl_take isl_pw_multi_aff *iterator_map)
1598 {
1599         isl_union_map *umap;
1600         isl_space *space;
1601         isl_map *map, *sched;;
1602
1603         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1604         space = isl_space_from_domain(space);
1605         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1606
1607         umap = isl_union_map_copy(gen->shared_sched);
1608         umap = isl_union_map_apply_range(umap,
1609                         isl_union_map_copy(gen->shared_proj));
1610         map = isl_union_map_extract_map(umap, space);
1611         isl_union_map_free(umap);
1612
1613         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1614         sched = isl_map_detect_equalities(sched);
1615
1616         return isl_pw_multi_aff_from_map(sched);
1617 }
1618
1619 /* Set unroll[j] if the input dimension j is involved in
1620  * the index expression represented by ma.
1621  */
1622 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1623         void *user)
1624 {
1625         int i, j;
1626         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1627         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1628         int *unroll = user;
1629
1630         for (i = 0; i < n_out; ++i) {
1631                 isl_aff *aff;
1632
1633                 aff = isl_multi_aff_get_aff(ma, i);
1634                 for (j = 0; j < n_in; ++j)
1635                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1636                                 unroll[j] = 1;
1637                 isl_aff_free(aff);
1638         }
1639
1640         isl_set_free(set);
1641         isl_multi_aff_free(ma);
1642         return 0;
1643 }
1644
1645 /* Given an array pos mapping input dimensions to the corresponding
1646  * output dimension, construct the corresponding map.
1647  */
1648 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1649         int *pos, int len)
1650 {
1651         int i;
1652         isl_constraint *c;
1653         isl_basic_map *bmap;
1654         isl_local_space *ls;
1655
1656         dim = isl_space_add_dims(dim, isl_dim_in, len);
1657         dim = isl_space_add_dims(dim, isl_dim_out, len);
1658         bmap = isl_basic_map_universe(isl_space_copy(dim));
1659         ls = isl_local_space_from_space(dim);
1660
1661         for (i = 0; i < len; ++i) {
1662                 c = isl_equality_alloc(isl_local_space_copy(ls));
1663                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1664                                                       -1);
1665                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1666                                                       1);
1667                 bmap = isl_basic_map_add_constraint(bmap, c);
1668         }
1669         isl_local_space_free(ls);
1670
1671         return isl_map_from_basic_map(bmap);
1672 }
1673
1674 /* Remove the private tiles from all array reference groups,
1675  * except for the groups of arrays that are marked force_private.
1676  */
1677 static void remove_private_tiles(struct gpu_gen *gen)
1678 {
1679         int i, j;
1680
1681         for (i = 0; i < gen->prog->n_array; ++i) {
1682                 struct gpu_array_info *array = &gen->prog->array[i];
1683
1684                 if (array->force_private)
1685                         continue;
1686
1687                 for (j = 0; j < array->n_group; ++j) {
1688                         struct gpu_array_ref_group *group = array->groups[j];
1689
1690                         group->private_tile = free_tile(group->private_tile);
1691                 }
1692         }
1693 }
1694
1695 /* Find all loops involved in any of the index expressions for any of
1696  * the private accesses, move them innermost and then mark them as
1697  * requiring unrolling by setting gen->first_unroll.
1698  * The loops involved should all be parallel because of the checks
1699  * we performed in check_private_group_access.  Moving them innermost
1700  * is therefore a valid transformation.
1701  *
1702  * If any of the arrays are marked force_private, however, then
1703  * those loops may not be parallel with respect to the marked arrays.
1704  * If any of the loops would have to be moved innermost for the
1705  * (non forced) private accesses and if there are any force_private
1706  * arrays, then we revert the decision to map the selected arrays
1707  * to private memory.  An alternative solution would be to expand
1708  * the force_private arrays.
1709  *
1710  * Loops up to gen->shared_len are generated before the mapping to
1711  * threads is applied.  They should therefore be ignored.
1712  *
1713  * We compute the hidden equalities of the schedule first
1714  * since we will need them in our calls to isl_pw_multi_aff_from_map
1715  * and because we want to make sure that the same equalities
1716  * are also available to the code generator.
1717  */
1718 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1719         __isl_take isl_union_map *sched)
1720 {
1721         int i, j;
1722         int unroll[gen->thread_tiled_len];
1723         int perm[gen->thread_tiled_len];
1724         isl_space *dim;
1725         isl_map *permute;
1726         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1727
1728         gen->first_unroll = -1;
1729
1730         sched = isl_union_map_detect_equalities(sched);
1731         for (i = 0; i < gen->thread_tiled_len; ++i)
1732                 unroll[i] = 0;
1733         for (i = 0; i < gen->prog->n_array; ++i) {
1734                 struct gpu_array_info *array = &gen->prog->array[i];
1735
1736                 for (j = 0; j < array->n_group; ++j) {
1737                         isl_union_map *access;
1738                         isl_map *acc;
1739                         isl_pw_multi_aff *pma;
1740
1741                         if (!array->groups[j]->private_tile)
1742                                 continue;
1743
1744                         access = group_access_relation(array->groups[j], 1, 1);
1745                         access = isl_union_map_apply_domain(access,
1746                                                 isl_union_map_copy(sched));
1747
1748                         acc = isl_map_from_union_map(access);
1749                         pma = isl_pw_multi_aff_from_map(acc);
1750                         isl_pw_multi_aff_foreach_piece(pma,
1751                                                         &check_unroll, unroll);
1752
1753                         isl_pw_multi_aff_free(pma);
1754                 }
1755         }
1756
1757         for (i = gen->shared_len; i < len; ++i)
1758                 if (unroll[i])
1759                         break;
1760
1761         if (i >= len)
1762                 return sched;
1763
1764         for (i = len; i < gen->thread_tiled_len; ++i)
1765                 if (unroll[i])
1766                         return sched;
1767
1768         if (gen->any_force_private) {
1769                 remove_private_tiles(gen);
1770                 return sched;
1771         }
1772
1773         j = 0;
1774         for (i = 0; i < gen->shared_len; ++i)
1775                 perm[i] = j++;
1776         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1777                 if (!unroll[i])
1778                         perm[i] = j++;
1779         gen->first_unroll = j - gen->shared_len;
1780         for (i = gen->shared_len; i < len; ++i)
1781                 if (unroll[i])
1782                         perm[i] = j++;
1783
1784         dim = isl_union_map_get_space(sched);
1785         permute = permutation(dim, perm, gen->thread_tiled_len);
1786         sched = isl_union_map_apply_range(sched,
1787                                           isl_union_map_from_map(permute));
1788
1789         return sched;
1790 }
1791
1792 /* Given a constraint
1793  *
1794  *              a(p,i) + j = g f(e)
1795  *
1796  * or -a(p,i) - j = g f(e) if sign < 0,
1797  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1798  * a(p,i) is assumed to be an expression in only the parameters
1799  * and the input dimensions.
1800  */
1801 static void extract_stride(__isl_keep isl_constraint *c,
1802         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1803 {
1804         int i;
1805         isl_val *v;
1806         isl_space *space;
1807         unsigned nparam;
1808         unsigned nvar;
1809         isl_aff *aff;
1810
1811         isl_val_free(bound->stride);
1812         bound->stride = isl_val_copy(stride);
1813
1814         space = isl_constraint_get_space(c);
1815         space = isl_space_domain(space);
1816
1817         nparam = isl_space_dim(space, isl_dim_param);
1818         nvar = isl_space_dim(space, isl_dim_set);
1819
1820         v = isl_constraint_get_constant_val(c);
1821         if (sign < 0)
1822                 v = isl_val_neg(v);
1823         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1824         aff = isl_aff_set_constant_val(aff, v);
1825
1826         for (i = 0; i < nparam; ++i) {
1827                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1828                         continue;
1829                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1830                 if (sign < 0)
1831                         v = isl_val_neg(v);
1832                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1833         }
1834
1835         for (i = 0; i < nvar; ++i) {
1836                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1837                         continue;
1838                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1839                 if (sign < 0)
1840                         v = isl_val_neg(v);
1841                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1842         }
1843
1844         bound->shift = aff;
1845 }
1846
1847 /* Given an equality constraint of a map with a single output dimension j,
1848  * check if the constraint is of the form
1849  *
1850  *              a(p,i) + j = g f(e)
1851  *
1852  * with a(p,i) an expression in the parameters and input dimensions
1853  * and f(e) an expression in the existentially quantified variables.
1854  * If so, and if g is larger than any such g from a previously considered
1855  * constraint, then call extract_stride to record the stride information
1856  * in bound.
1857  */
1858 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1859 {
1860         int i;
1861         isl_ctx *ctx;
1862         isl_val *v;
1863         unsigned n_div;
1864         struct gpu_array_bound *bound = user;
1865
1866         ctx = isl_constraint_get_ctx(c);
1867         n_div = isl_constraint_dim(c, isl_dim_div);
1868         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1869
1870         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1871                 int s = isl_val_sgn(v);
1872                 isl_val *stride = isl_val_zero(ctx);
1873
1874                 isl_val_free(v);
1875                 for (i = 0; i < n_div; ++i) {
1876                         v = isl_constraint_get_coefficient_val(c,
1877                                                                 isl_dim_div, i);
1878                         stride = isl_val_gcd(stride, v);
1879                 }
1880                 if (!isl_val_is_zero(stride) &&
1881                     isl_val_gt(stride, bound->stride))
1882                         extract_stride(c, bound, stride, s);
1883
1884                 isl_val_free(stride);
1885         } else
1886                 isl_val_free(v);
1887
1888         isl_constraint_free(c);
1889         return 0;
1890 }
1891
1892 /* Given contraints on an array index i, check if we can find
1893  * a shift a(p) and a stride g such that
1894  *
1895  *      a(p) + i = 0 mod g
1896  *
1897  * If so, record the information in bound and apply the mapping
1898  * i -> (i + a(p))/g to the array index in bounds and return
1899  * the new constraints.
1900  * If not, simply return the original constraints.
1901  *
1902  * If bounds is a subset of the space
1903  *
1904  *      D -> i
1905  *
1906  * then the bound recorded in bound->shift is of the form
1907  *
1908  *      D -> s(D)
1909  *
1910  * with s(D) equal to a(p) above.
1911  * Next, we construct a mapping of the form
1912  *
1913  *      [D -> i] -> [D -> (i + S(D))/g]
1914  *
1915  * This mapping is computed as follows.
1916  * We first introduce "i" in the domain through precomposition
1917  * with [D -> i] -> D obtaining
1918  *
1919  *      [D -> i] -> s(D)
1920  *
1921  * Adding [D -> i] -> i produces
1922  *
1923  *      [D -> i] -> i + s(D)
1924  *
1925  * and the domain product with [D -> i] -> D yields
1926  *
1927  *      [D -> i] -> [D -> i + s(D)]
1928  *
1929  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1930  */
1931 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1932         __isl_take isl_basic_map *bounds)
1933 {
1934         isl_space *space;
1935         isl_basic_map *hull;
1936         isl_basic_map *shift, *id, *bmap, *scale;
1937         isl_basic_set *bset;
1938         isl_aff *aff;
1939
1940         bound->stride = NULL;
1941
1942         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1943
1944         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1945
1946         isl_basic_map_free(hull);
1947
1948         if (!bound->stride)
1949                 return bounds;
1950
1951         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1952         space = isl_basic_map_get_space(bounds);
1953         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1954         shift = isl_basic_map_apply_range(bmap, shift);
1955         space = isl_basic_map_get_space(bounds);
1956         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1957         shift = isl_basic_map_sum(id, shift);
1958         space = isl_basic_map_get_space(bounds);
1959         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1960         shift = isl_basic_map_range_product(id, shift);
1961
1962         space = isl_space_domain(isl_basic_map_get_space(bounds));
1963         id = isl_basic_map_identity(isl_space_map_from_set(space));
1964         space = isl_space_range(isl_basic_map_get_space(bounds));
1965         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1966         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1967         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1968         scale = isl_basic_map_from_aff(aff);
1969         scale = isl_basic_map_product(id, scale);
1970
1971         bmap = isl_basic_map_apply_range(shift, scale);
1972         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1973         bounds = isl_basic_set_unwrap(bset);
1974
1975         return bounds;
1976 }
1977
1978 /* Data used in compute_array_dim_size and compute_size_in_direction.
1979  *
1980  * pos is the position of the variable representing the array index,
1981  * i.e., the variable for which want to compute the size.  This variable
1982  * is also the last variable in the set.
1983  */
1984 struct gpu_size_info {
1985         isl_basic_set *bset;
1986         struct gpu_array_bound *bound;
1987         int pos;
1988 };
1989
1990 /* Given a constraint from the basic set describing the bounds on
1991  * an array index, check if it is a lower bound, say m i >= b(x), and,
1992  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1993  * upper bound.  If so, and if this bound is smaller than any bound
1994  * derived from earlier constraints, set the size to this bound on
1995  * the expression and the lower bound to ceil(b(x)/m).
1996  */
1997 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1998 {
1999         struct gpu_size_info *size = user;
2000         unsigned nparam;
2001         unsigned n_div;
2002         isl_val *v;
2003         isl_aff *aff;
2004         isl_aff *lb;
2005
2006         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
2007         n_div = isl_constraint_dim(c, isl_dim_div);
2008
2009         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
2010             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
2011                 isl_constraint_free(c);
2012                 return 0;
2013         }
2014
2015         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
2016         aff = isl_aff_ceil(aff);
2017
2018         lb = isl_aff_copy(aff);
2019
2020         aff = isl_aff_neg(aff);
2021         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
2022
2023         v = isl_basic_set_max_val(size->bset, aff);
2024         isl_aff_free(aff);
2025
2026         if (isl_val_is_int(v)) {
2027                 v = isl_val_add_ui(v, 1);
2028                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
2029                         isl_val_free(size->bound->size);
2030                         size->bound->size = isl_val_copy(v);
2031                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
2032                         isl_aff_free(size->bound->lb);
2033                         size->bound->lb = isl_aff_copy(lb);
2034                 }
2035         }
2036         isl_val_free(v);
2037         isl_aff_free(lb);
2038
2039         isl_constraint_free(c);
2040
2041         return 0;
2042 }
2043
2044 /* Given a basic map "bounds" that maps parameters and input dimensions
2045  * to a single output dimension, look for an expression in the parameters
2046  * and input dimensions such that the range of the output dimension shifted
2047  * by this expression is a constant.
2048  *
2049  * In particular, we currently only consider lower bounds on the output
2050  * dimension as candidate expressions.
2051  */
2052 static int compute_array_dim_size(struct gpu_array_bound *bound,
2053         __isl_take isl_basic_map *bounds)
2054 {
2055         struct gpu_size_info size;
2056
2057         bounds = isl_basic_map_detect_equalities(bounds);
2058         bounds = check_stride(bound, bounds);
2059
2060         bound->size = NULL;
2061         bound->lb = NULL;
2062
2063         size.bound = bound;
2064         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
2065         size.bset = isl_basic_map_wrap(bounds);
2066         size.bset = isl_basic_set_flatten(size.bset);
2067         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
2068         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
2069                                         &size);
2070         isl_basic_set_free(size.bset);
2071
2072         return bound->size ? 0 : -1;
2073 }
2074
2075 /* Check if we can find a memory tile for the given array
2076  * based on the given accesses, and if so, put the results in "tile".
2077  *
2078  * We project the accesses on each index in turn and look for a parametric
2079  * offset such that the size is constant.
2080  */
2081 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
2082 {
2083         int i;
2084
2085         for (i = 0; i < tile->n; ++i) {
2086                 isl_map *access_i;
2087                 isl_basic_map *hull;
2088
2089                 access_i = isl_map_copy(access);
2090                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
2091                 access_i = isl_map_project_out(access_i, isl_dim_out,
2092                                             1, tile->n - (i + 1));
2093                 access_i = isl_map_compute_divs(access_i);
2094                 hull = isl_map_simple_hull(access_i);
2095                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
2096                         return 0;
2097         }
2098
2099         return 1;
2100 }
2101
2102 /* Construct a map with input the shared tile loops and the loops that
2103  * will be wrapped around the threads that relates these later loops
2104  * to the thread indices and then projects them out.
2105  */
2106 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
2107 {
2108         isl_map *priv;
2109         isl_map *tiling;
2110         isl_map *proj;
2111         isl_set *par;
2112         isl_space *dim;
2113
2114         dim = isl_union_map_get_space(gen->shared_sched);
2115
2116         if (gen->options->wrap)
2117                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2118                                 gen->shared_len, gen->n_block, gen->block_dim);
2119         else
2120                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2121                                 gen->shared_len, gen->n_block, gen->block_dim);
2122
2123         priv = tiling;
2124
2125         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2126                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2127                 gen->kernel->thread_ids);
2128
2129         priv = isl_map_align_params(priv, isl_set_get_space(par));
2130         priv = isl_map_intersect_range(priv, par);
2131
2132         dim = isl_map_get_space(priv);
2133         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2134         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2135         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2136                           gen->shared_len);
2137
2138         priv = isl_map_apply_range(priv, proj);
2139
2140         return priv;
2141 }
2142
2143 /* Construct a map from domain_dim to domain_dim that increments
2144  * the dimension at position "pos" and leaves all other dimensions
2145  * constant.
2146  */
2147 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2148 {
2149         int i;
2150         int len = isl_space_dim(domain_dim, isl_dim_set);
2151         isl_space *dim;
2152         isl_basic_map *next;
2153         isl_local_space *ls;
2154
2155         dim = isl_space_map_from_set(domain_dim);
2156         next = isl_basic_map_universe(isl_space_copy(dim));
2157         ls = isl_local_space_from_space(dim);
2158
2159         for (i = 0; i < len; ++i) {
2160                 isl_constraint *c;
2161
2162                 c = isl_equality_alloc(isl_local_space_copy(ls));
2163                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2164                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2165                 if (i == pos)
2166                         c = isl_constraint_set_constant_si(c, 1);
2167                 next = isl_basic_map_add_constraint(next, c);
2168         }
2169
2170         isl_local_space_free(ls);
2171
2172         return isl_map_from_basic_map(next);
2173 }
2174
2175 /* Check if the given access is coalesced.
2176  * That is, check whether incrementing the dimension that will get
2177  * wrapped over the last thread index results in incrementing
2178  * the last array index.
2179  *
2180  * This function is only called for access relations without reuse and
2181  * kernels with at least one block dimension.
2182  */
2183 static int access_is_coalesced(struct gpu_gen *gen,
2184         __isl_keep isl_union_map *access)
2185 {
2186         isl_space *dim;
2187         isl_map *access_map;
2188         isl_map *next_thread_x;
2189         isl_map *next_element;
2190         isl_map *map;
2191         int coalesced;
2192
2193         access = isl_union_map_copy(access);
2194         access = isl_union_map_apply_domain(access,
2195                                 isl_union_map_copy(gen->tiled_sched));
2196         access_map = isl_map_from_union_map(access);
2197
2198         dim = isl_map_get_space(access_map);
2199         dim = isl_space_domain(dim);
2200         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2201
2202         dim = isl_map_get_space(access_map);
2203         dim = isl_space_range(dim);
2204         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
2205
2206         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2207         map = isl_map_apply_range(map, access_map);
2208
2209         coalesced = isl_map_is_subset(map, next_element);
2210
2211         isl_map_free(next_element);
2212         isl_map_free(map);
2213
2214         return coalesced;
2215 }
2216
2217 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2218  * dimensions of the computed schedule, check if it is bijective for
2219  * fixed values of the first gen->shared_len dimensions.
2220  * We perform this check by equating these dimensions to parameters.
2221  */
2222 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2223 {
2224         int res;
2225         isl_set *par;
2226         isl_space *space;
2227         isl_id_list *ids;
2228
2229         access = isl_map_copy(access);
2230         space = isl_space_params(isl_map_get_space(access));
2231         ids = ppcg_scop_generate_names(gen->prog->scop, gen->shared_len, "s");
2232         par = parametrization(space, gen->shared_len + gen->n_block, 0, ids);
2233         isl_id_list_free(ids);
2234         access = isl_map_intersect_domain(access, par);
2235         res = isl_map_is_bijective(access);
2236         isl_map_free(access);
2237
2238         return res;
2239 }
2240
2241 /* Look for the last shared tile loop that affects the offset of "tile"
2242  * and return the result.
2243  * If there is no such loop, then return the index of the loop
2244  * before the first shared tile loop, in particular gen->tile_first - 1.
2245  */
2246 static int compute_tile_last_shared(struct gpu_gen *gen,
2247         struct gpu_array_tile *tile)
2248 {
2249         int i, j;
2250
2251         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2252                 for (i = 0; i < tile->n; ++i) {
2253                         isl_aff *lb;
2254                         isl_aff *shift;
2255
2256                         lb = tile->bound[i].lb;
2257                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2258                                 break;
2259
2260                         shift = tile->bound[i].shift;
2261                         if (!shift)
2262                                 continue;
2263                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2264                                 break;
2265                 }
2266                 if (i < tile->n)
2267                         break;
2268         }
2269
2270         return j;
2271 }
2272
2273 /* Look for the last shared tile loop that affects the offset of the
2274  * shared or private tile and store the result in group->last_shared.
2275  * If there is no such loop, then group->last_shared is set to a value
2276  * before the first shared tile loop, in particular gen->tile_first - 1.
2277  * If there is no tile defined on the array reference group,
2278  * then set group->last_shared to gen->shared_len - 1.
2279  */
2280 static void set_last_shared(struct gpu_gen *gen,
2281         struct gpu_array_ref_group *group)
2282 {
2283         struct gpu_array_tile *tile;
2284
2285         group->last_shared = gen->shared_len - 1;
2286
2287         tile = group->private_tile;
2288         if (!tile)
2289                 tile = group->shared_tile;
2290         if (!tile)
2291                 return;
2292
2293         group->last_shared = compute_tile_last_shared(gen, tile);
2294 }
2295
2296 /* Compute the size of the tile specified by "tile"
2297  * in number of elements and return the result.
2298  */
2299 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2300 {
2301         int i;
2302         isl_val *size;
2303
2304         size = isl_val_one(ctx);
2305
2306         for (i = 0; i < tile->n; ++i)
2307                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2308
2309         return size;
2310 }
2311
2312 /* If max_shared_memory is not set to infinity (-1), then make
2313  * sure that the total amount of shared memory required by the
2314  * array reference groups mapped to shared memory is no larger
2315  * than this maximum.
2316  *
2317  * We apply a greedy approach and discard (keep in global memory)
2318  * those groups that would result in a total memory size that
2319  * is larger than the maximum.
2320  *
2321  * This function should be called after any function that may
2322  * affect the decision on whether to place a reference group
2323  * in private, shared or global memory.
2324  */
2325 static void check_shared_memory_bound(struct gpu_gen *gen)
2326 {
2327         int i, j;
2328         isl_val *left, *size;
2329
2330         if (gen->options->max_shared_memory < 0)
2331                 return;
2332
2333         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2334
2335         for (i = 0; i < gen->prog->n_array; ++i) {
2336                 struct gpu_array_info *array = &gen->prog->array[i];
2337
2338                 for (j = 0; j < array->n_group; ++j) {
2339                         struct gpu_array_ref_group *group;
2340
2341                         group = array->groups[j];
2342                         if (group->private_tile)
2343                                 continue;
2344                         if (!group->shared_tile)
2345                                 continue;
2346
2347                         size = tile_size(gen->ctx, group->shared_tile);
2348                         size = isl_val_mul_ui(size, array->size);
2349
2350                         if (isl_val_le(size, left)) {
2351                                 left = isl_val_sub(left, size);
2352                                 continue;
2353                         }
2354                         isl_val_free(size);
2355
2356                         group->shared_tile = free_tile(group->shared_tile);
2357                 }
2358         }
2359
2360         isl_val_free(left);
2361 }
2362
2363 /* Given a description of an array tile "tile" and the "space"
2364  *
2365  *      { D -> A }
2366  *
2367  * where D represents the first shared_len schedule dimensions
2368  * and A represents the array, construct an isl_multi_aff
2369  *
2370  *      { [D[i] -> A[a]] -> A'[a'] }
2371  *
2372  * with A' a scaled down copy of A according to the shifts and strides
2373  * in "tile".  In particular,
2374  *
2375  *      a' = (a + shift(i))/stride
2376  *
2377  * "insert_array" represents
2378  *
2379  *      { [D -> A] -> D }
2380  *
2381  * and is used to insert A into the domain of functions that only
2382  * reference D.
2383  */
2384 static __isl_give isl_multi_aff *strided_tile(
2385         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2386         __isl_keep isl_multi_aff *insert_array)
2387 {
2388         int i;
2389         isl_ctx *ctx;
2390         isl_multi_aff *shift;
2391         isl_multi_val *stride;
2392         isl_space *space2;
2393         isl_local_space *ls;
2394         isl_multi_aff *tiling;
2395
2396         ctx = isl_space_get_ctx(space);
2397         space2 = isl_space_domain(isl_space_copy(space));
2398         ls = isl_local_space_from_space(space2);
2399         space2 = isl_space_range(isl_space_copy(space));
2400         stride = isl_multi_val_zero(space2);
2401         shift = isl_multi_aff_zero(isl_space_copy(space));
2402
2403         for (i = 0; i < tile->n; ++i) {
2404                 struct gpu_array_bound *bound = &tile->bound[i];
2405                 isl_val *stride_i;
2406                 isl_aff *shift_i;
2407
2408                 if (tile->bound[i].shift) {
2409                         stride_i = isl_val_copy(bound->stride);
2410                         shift_i = isl_aff_copy(bound->shift);
2411                 } else {
2412                         stride_i = isl_val_one(ctx);
2413                         shift_i = isl_aff_zero_on_domain(
2414                                         isl_local_space_copy(ls));
2415                 }
2416
2417                 stride = isl_multi_val_set_val(stride, i, stride_i);
2418                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2419         }
2420         isl_local_space_free(ls);
2421
2422         shift = isl_multi_aff_pullback_multi_aff(shift,
2423                                     isl_multi_aff_copy(insert_array));
2424
2425         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2426         tiling = isl_multi_aff_add(tiling, shift);
2427         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2428
2429         return tiling;
2430 }
2431
2432 /* Compute a tiling for the array reference group "group".
2433  *
2434  * The tiling is of the form
2435  *
2436  *      { [D[i] -> A[a]] -> T[t] }
2437  *
2438  * where D represents the first shared_len schedule dimensions,
2439  * A represents the global array and T represents the shared or
2440  * private memory tile.  The name of T is the name of the local
2441  * array.
2442  *
2443  * If there is any stride in the accesses, then the mapping is
2444  *
2445  *      t = (a + shift(i))/stride - lb(i)
2446  *
2447  * otherwise, it is simply
2448  *
2449  *      t = a - lb(i)
2450  */
2451 static void compute_group_tiling(struct gpu_array_ref_group *group)
2452 {
2453         int i;
2454         struct gpu_array_tile *tile;
2455         struct gpu_array_info *array = group->array;
2456         isl_space *space;
2457         isl_multi_aff *tiling, *lb, *insert_array;
2458         isl_printer *p;
2459         char *local_name;
2460
2461         tile = group->private_tile;
2462         if (!tile)
2463                 tile = group->shared_tile;
2464         if (!tile)
2465                 return;
2466
2467         space = isl_map_get_space(group->access);
2468         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2469
2470         for (i = 0; i < tile->n; ++i)
2471                 if (tile->bound[i].shift)
2472                         break;
2473
2474         if (i < tile->n)
2475                 tiling = strided_tile(tile, space, insert_array);
2476         else
2477                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2478
2479         lb = isl_multi_aff_zero(space);
2480         for (i = 0; i < tile->n; ++i) {
2481                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2482                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2483         }
2484         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2485
2486         tiling = isl_multi_aff_sub(tiling, lb);
2487
2488         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2489         p = print_array_name(p, group);
2490         local_name = isl_printer_get_str(p);
2491         isl_printer_free(p);
2492         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2493         free(local_name);
2494
2495         tile->tiling = tiling;
2496 }
2497
2498 /* Compute a tiling for all the array reference groups.
2499  */
2500 static void compute_group_tilings(struct gpu_gen *gen)
2501 {
2502         int i, j;
2503
2504         for (i = 0; i < gen->prog->n_array; ++i) {
2505                 struct gpu_array_info *array = &gen->prog->array[i];
2506
2507                 for (j = 0; j < array->n_group; ++j)
2508                         compute_group_tiling(array->groups[j]);
2509         }
2510 }
2511
2512 /* Fill up the groups array with singleton groups, i.e., one group
2513  * per reference, initializing the array, access, write, n_ref and refs fields.
2514  * In particular the access field is initialized to the scheduled
2515  * access relation of the array reference.
2516  *
2517  * Return the number of elements initialized, i.e., the number of
2518  * active references in the current kernel.
2519  */
2520 static int populate_array_references(struct gpu_array_info *array,
2521         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2522 {
2523         int i;
2524         int n;
2525         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2526
2527         n = 0;
2528         for (i = 0; i < array->n_ref; ++i) {
2529                 isl_union_map *umap;
2530                 isl_map *map;
2531                 struct gpu_array_ref_group *group;
2532                 struct gpu_stmt_access *access = array->refs[i];
2533
2534                 map = isl_map_copy(access->access);
2535                 umap = isl_union_map_from_map(map);
2536                 umap = isl_union_map_apply_domain(umap,
2537                                 isl_union_map_copy(sched));
2538
2539                 if (isl_union_map_is_empty(umap)) {
2540                         isl_union_map_free(umap);
2541                         continue;
2542                 }
2543
2544                 map = isl_map_from_union_map(umap);
2545                 map = isl_map_detect_equalities(map);
2546
2547                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2548                 assert(group);
2549                 group->array = array;
2550                 group->access = map;
2551                 group->write = access->write;
2552                 group->exact_write = access->exact_write;
2553                 group->slice = access->n_index < array->n_index;
2554                 group->refs = &array->refs[i];
2555                 group->n_ref = 1;
2556
2557                 groups[n++] = group;
2558         }
2559
2560         return n;
2561 }
2562
2563 /* If group->n_ref == 1, then group->refs was set by
2564  * populate_array_references to point directly into
2565  * group->array->refs and should not be freed.
2566  * If group->n_ref > 1, then group->refs was set by join_groups
2567  * to point to a newly allocated array.
2568  */
2569 static void free_array_ref_group(struct gpu_array_ref_group *group)
2570 {
2571         if (!group)
2572                 return;
2573         free_tile(group->shared_tile);
2574         free_tile(group->private_tile);
2575         isl_map_free(group->access);
2576         if (group->n_ref > 1)
2577                 free(group->refs);
2578         free(group);
2579 }
2580
2581 /* Given a map where the input dimensions represent the tile loops,
2582  * eliminate the innermost of those that have a fixed value
2583  * until we reach one that does not (obviously) have a fixed value.
2584  */
2585 static __isl_give isl_map *eliminate_fixed_inner_loops(
2586         __isl_take isl_map *access)
2587 {
2588         int i, n;
2589
2590         n = isl_map_dim(access, isl_dim_in);
2591
2592         for (i = n - 1; i >= 0; --i) {
2593                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2594                         break;
2595                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2596         }
2597         return access;
2598 }
2599
2600 /* Check if the access relations of group1 and group2 overlap within
2601  * the innermost loop.  In particular, ignore any inner dimension
2602  * with a fixed value.
2603  * The copying to and from shared memory will be performed within
2604  * the innermost actual loop so we are only allowed to consider
2605  * the dimensions up to that innermost loop while checking whether
2606  * two access relations overlap.
2607  */
2608 static int accesses_overlap(struct gpu_array_ref_group *group1,
2609         struct gpu_array_ref_group *group2)
2610 {
2611         int empty;
2612         isl_map *access1, *access2;
2613
2614         access1 = isl_map_copy(group1->access);
2615         access1 = eliminate_fixed_inner_loops(access1);
2616         access2 = isl_map_copy(group2->access);
2617         access2 = eliminate_fixed_inner_loops(access2);
2618         access1 = isl_map_intersect(access1, access2);
2619         empty = isl_map_is_empty(access1);
2620         isl_map_free(access1);
2621
2622         return !empty;
2623 }
2624
2625 /* Combine the given two groups into a single group, containing
2626  * the references of both groups.
2627  */
2628 static struct gpu_array_ref_group *join_groups(
2629         struct gpu_array_ref_group *group1,
2630         struct gpu_array_ref_group *group2)
2631 {
2632         int i;
2633         isl_ctx *ctx;
2634         struct gpu_array_ref_group *group;
2635
2636         ctx = isl_map_get_ctx(group1->access);
2637         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2638         assert(group);
2639         group->array = group1->array;
2640         group->access = isl_map_union(isl_map_copy(group1->access),
2641                                         isl_map_copy(group2->access));
2642         group->write = group1->write || group2->write;
2643         group->exact_write = group1->exact_write && group2->exact_write;
2644         group->slice = group1->slice || group2->slice;
2645         group->n_ref = group1->n_ref + group2->n_ref;
2646         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2647                                         group->n_ref);
2648         assert(group->refs);
2649         for (i = 0; i < group1->n_ref; ++i)
2650                 group->refs[i] = group1->refs[i];
2651         for (i = 0; i < group2->n_ref; ++i)
2652                 group->refs[group1->n_ref + i] = group2->refs[i];
2653
2654         return group;
2655 }
2656
/* Merge "group1" and "group2" into a newly created group,
 * free the two original groups and return the new group.
 */
static struct gpu_array_ref_group *join_groups_and_free(
        struct gpu_array_ref_group *group1,
        struct gpu_array_ref_group *group2)
{
        struct gpu_array_ref_group *merged = join_groups(group1, group2);

        free_array_ref_group(group1);
        free_array_ref_group(group2);

        return merged;
}
2671
2672 /* Report that the array reference group with the given access relation
2673  * is not mapped to shared memory in the given kernel because
2674  * it does not exhibit any reuse and is considered to be coalesced.
2675  */
2676 static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel,
2677         __isl_keep isl_union_map *access)
2678 {
2679         isl_ctx *ctx;
2680         isl_printer *p;
2681
2682         ctx = isl_union_map_get_ctx(access);
2683         p = isl_printer_to_file(ctx, stdout);
2684         p = isl_printer_print_str(p, "Array reference group ");
2685         p = isl_printer_print_union_map(p, access);
2686         p = isl_printer_print_str(p,
2687             " not considered for mapping to shared memory in kernel");
2688         p = isl_printer_print_int(p, kernel->id);
2689         p = isl_printer_print_str(p,
2690             " because it exhibits no reuse and is considered to be coalesced");
2691         p = isl_printer_end_line(p);
2692         isl_printer_free(p);
2693 }
2694
2695 /* Compute the private and/or shared memory tiles for the array
2696  * reference group "group" of array "array".
2697  * Return 0 on success and -1 on error.
2698  *
2699  * If the array is a read-only scalar or if the user requested
2700  * not to use shared or private memory, then we do not need to do anything.
2701  *
2702  * If any reference in the reference group accesses more than one element,
2703  * then we would have to make sure that the layout in shared memory
2704  * is the same as that in global memory.  Since we do not handle this yet
2705  * (and it may not even be possible), we refuse to map to private or
2706  * shared memory in such cases.
2707  *
2708  * If the array group involves any may writes (that are not must writes),
2709  * then we would have to make sure that we load the data into shared/private
2710  * memory first in case the data is not written by the kernel
2711  * (but still written back out to global memory).
2712  * Since we don't have any such mechanism at the moment, we don't
2713  * compute shared/private tiles for groups involving may writes.
2714  *
2715  * We only try to compute a shared memory tile if there is any reuse
2716  * or if the access is not coalesced.
2717  *
2718  * For computing a private memory tile, we also require that there is
2719  * some reuse.  Moreover, we require that the access is private
2720  * to the thread.  That is, we check that any given array element
2721  * is only accessed by a single thread.
2722  * We compute an access relation that maps the shared tile loop iterators
2723  * and the shared point loop iterators that will be wrapped over the
2724  * threads to the array elements.
2725  * We actually check that those iterators that will be wrapped
2726  * partition the array space.  This check is stricter than necessary
2727  * since several iterations may be mapped onto the same thread
2728  * and then they could be allowed to access the same memory elements,
2729  * but our check does not allow this situation.
2730  *
2731  * We also check that the index expression only depends on parallel
2732  * loops.  That way, we can move those loops innermost and unroll them.
2733  * Again, we use a test that is stricter than necessary.
2734  * We actually check whether the index expression only depends
2735  * on the iterators that are wrapped over the threads.
2736  * These are necessarily parallel, but there may be more parallel loops.
2737  *
2738  * Combining the injectivity of the first test with the single-valuedness
2739  * of the second test, we simply test for bijectivity.
2740  *
2741  * If the array is marked force_private, then we bypass all checks
2742  * and assume we can (and should) use registers.
2743  *
2744  * If it turns out we can (or have to) use registers, we compute
2745  * the private memory tile size using can_tile, after introducing a dependence
2746  * on the thread indices.
2747  */
2748 static int compute_group_bounds_core(struct gpu_gen *gen,
2749         struct gpu_array_ref_group *group)
2750 {
2751         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2752         isl_union_map *access;
2753         int n_index = group->array->n_index;
2754         int no_reuse, coalesced;
2755         isl_map *acc;
2756         int force_private = group->array->force_private;
2757         int use_shared = gen->options->use_shared_memory && gen->n_block > 0;
2758         int use_private = force_private || gen->options->use_private_memory;
2759
2760         if (!use_shared && !use_private)
2761                 return 0;
2762         if (gpu_array_is_read_only_scalar(group->array))
2763                 return 0;
2764         if (!force_private && !group->exact_write)
2765                 return 0;
2766         if (group->slice)
2767                 return 0;
2768
2769         access = group_access_relation(group, 1, 1);
2770         no_reuse = isl_union_map_is_injective(access);
2771         if (use_shared && no_reuse)
2772                 coalesced = access_is_coalesced(gen, access);
2773
2774         if (gen->options->debug->verbose && use_shared && no_reuse && coalesced)
2775                 report_no_reuse_and_coalesced(gen->kernel, access);
2776
2777         if (use_shared && (!no_reuse || !coalesced)) {
2778                 group->shared_tile = create_tile(ctx, group->array->n_index);
2779                 if (!can_tile(group->access, group->shared_tile))
2780                         group->shared_tile = free_tile(group->shared_tile);
2781         }
2782
2783         if (!force_private && (!use_private || no_reuse)) {
2784                 isl_union_map_free(access);
2785                 return 0;
2786         }
2787
2788         access = isl_union_map_apply_domain(access,
2789                                         isl_union_map_copy(gen->shared_sched));
2790
2791         acc = isl_map_from_union_map(access);
2792
2793         if (!force_private && !access_is_bijective(gen, acc)) {
2794                 isl_map_free(acc);
2795                 return 0;
2796         }
2797
2798         group->private_tile = create_tile(gen->ctx, n_index);
2799         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2800         if (!can_tile(acc, group->private_tile))
2801                 group->private_tile = free_tile(group->private_tile);
2802
2803         isl_map_free(acc);
2804
2805         if (force_private && !group->private_tile)
2806                 isl_die(ctx, isl_error_internal,
2807                         "unable to map array reference group to registers",
2808                         return -1);
2809
2810         return 0;
2811 }
2812
/* Compute the private and/or shared memory tiles for the array
 * reference group "group" through compute_group_bounds_core and,
 * if that succeeds, call set_last_shared on the group.
 * Return 0 on success and -1 on error.
 */
static int compute_group_bounds(struct gpu_gen *gen,
        struct gpu_array_ref_group *group)
{
        int r = compute_group_bounds_core(gen, group);

        if (r < 0)
                return -1;

        set_last_shared(gen, group);
        return 0;
}
2826
2827 /* If two groups have overlapping access relations (as determined by
2828  * the "overlap" function) and if one of them involves a write,
2829  * then merge the two groups into one.
2830  * If "compute_bounds" is set, then call compute_group_bounds
2831  * on the merged groups.
2832  *
2833  * Return the updated number of groups.
2834  * Return -1 on error.
2835  */
2836 static int group_writes(struct gpu_gen *gen,
2837         int n, struct gpu_array_ref_group **groups,
2838         int (*overlap)(struct gpu_array_ref_group *group1,
2839                 struct gpu_array_ref_group *group2), int compute_bounds)
2840 {
2841         int i, j;
2842
2843         for (i = 0; i < n; ++i) {
2844                 for (j = n - 1; j > i; --j) {
2845                         if (!groups[i]->write && !groups[j]->write)
2846                                 continue;
2847
2848                         if (!overlap(groups[i], groups[j]))
2849                                 continue;
2850
2851                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2852                         if (j != n - 1)
2853                                 groups[j] = groups[n - 1];
2854                         groups[n - 1] = NULL;
2855                         n--;
2856
2857                         if (compute_bounds &&
2858                             compute_group_bounds(gen, groups[i]) < 0)
2859                                 return -1;
2860                 }
2861         }
2862
2863         return n;
2864 }
2865
2866 /* If two groups have overlapping access relations (within the innermost
2867  * loop) and if one of them involves a write, then merge the two groups
2868  * into one.
2869  *
2870  * Return the updated number of groups.
2871  */
2872 static int group_overlapping_writes(struct gpu_gen *gen,
2873         int n, struct gpu_array_ref_group **groups)
2874 {
2875         return group_writes(gen, n, groups, &accesses_overlap, 0);
2876 }
2877
2878 /* Check if the access relations of group1 and group2 overlap within
2879  * the outermost min(group1->last_shared, group2->last_shared) loops.
2880  */
2881 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2882         struct gpu_array_ref_group *group2)
2883 {
2884         int last_shared;
2885         int dim;
2886         int empty;
2887         isl_map *map_i, *map_j, *map;
2888
2889         last_shared = group1->last_shared;
2890         if (group2->last_shared < last_shared)
2891                 last_shared = group2->last_shared;
2892         map_i = isl_map_copy(group1->access);
2893         dim = isl_map_dim(map_i, isl_dim_in);
2894         map_i = isl_map_eliminate(map_i, isl_dim_in,
2895                                 last_shared + 1, dim - (last_shared + 1));
2896         map_j = isl_map_copy(group2->access);
2897         map_j = isl_map_eliminate(map_j, isl_dim_in,
2898                                 last_shared + 1, dim - (last_shared + 1));
2899         map = isl_map_intersect(map_i, map_j);
2900         empty = isl_map_is_empty(map);
2901         isl_map_free(map);
2902
2903         return !empty;
2904 }
2905
2906 /* If two groups have overlapping access relations (within the outer
2907  * last_shared loops) and if one of them involves a write,
2908  * then merge the two groups into one.
2909  *
2910  * Return the updated number of groups.
2911  */
2912 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2913         struct gpu_array_ref_group **groups)
2914 {
2915         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2916 }
2917
2918 /* Is the size of the tile specified by "tile" smaller than the sum of
2919  * the sizes of the tiles specified by "tile1" and "tile2"?
2920  */
2921 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2922         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2923 {
2924         int smaller;
2925         isl_val *size, *size1, *size2;
2926
2927         size = tile_size(ctx, tile);
2928         size1 = tile_size(ctx, tile1);
2929         size2 = tile_size(ctx, tile2);
2930
2931         size = isl_val_sub(size, size1);
2932         size = isl_val_sub(size, size2);
2933         smaller = isl_val_is_neg(size);
2934
2935         isl_val_free(size);
2936
2937         return smaller;
2938 }
2939
2940 /* Given an initial grouping of array references and shared memory tiles
2941  * for each group that allows for a shared memory tile, merge two groups
2942  * if both have a shared memory tile, the merged group also has
2943  * a shared memory tile and the size of the tile for the merge group
2944  * is smaller than the sum of the tile sizes of the individual groups.
2945  *
2946  * If merging two groups decreases the "last_shared" dimension of
2947  * one or both of the two groups, then we need to check for overlapping
2948  * writes again.
2949  *
2950  * Return the number of groups after merging.
2951  * Return -1 on error.
2952  */
2953 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2954         struct gpu_array_info *array, int n,
2955         struct gpu_array_ref_group **groups)
2956 {
2957         int i, j;
2958         int recompute_overlap = 0;
2959         isl_ctx *ctx = isl_space_get_ctx(array->space);
2960
2961         for (i = 0; i < n; ++i) {
2962                 if (!groups[i]->shared_tile)
2963                         continue;
2964                 for (j = n - 1; j > i; --j) {
2965                         isl_map *map;
2966                         int empty;
2967                         struct gpu_array_ref_group *group;
2968
2969                         if (!groups[j]->shared_tile)
2970                                 continue;
2971
2972                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
2973                                             isl_map_copy(groups[j]->access));
2974                         empty = isl_map_is_empty(map);
2975                         isl_map_free(map);
2976
2977                         if (empty)
2978                                 continue;
2979
2980                         group = join_groups(groups[i], groups[j]);
2981                         if (compute_group_bounds(gen, group) < 0) {
2982                                 free_array_ref_group(group);
2983                                 return -1;
2984                         }
2985                         if (!group->shared_tile ||
2986                             !smaller_tile(ctx, group->shared_tile,
2987                                         groups[i]->shared_tile,
2988                                         groups[j]->shared_tile)) {
2989                                 free_array_ref_group(group);
2990                                 continue;
2991                         }
2992
2993                         if (group->last_shared < groups[i]->last_shared ||
2994                             group->last_shared < groups[j]->last_shared)
2995                                 recompute_overlap = 1;
2996                         free_array_ref_group(groups[i]);
2997                         free_array_ref_group(groups[j]);
2998                         groups[i] = group;
2999                         if (j != n - 1)
3000                                 groups[j] = groups[n - 1];
3001                         n--;
3002                 }
3003         }
3004
3005         if (recompute_overlap)
3006                 n = group_last_shared_overlapping_writes(gen, n, groups);
3007         return n;
3008 }
3009
3010 /* Set array->n_group and array->groups to n and groups.
3011  *
3012  * Additionally, set the "nr" field of each group
3013  * and the "group" field of each reference in each group.
3014  */
3015 static void set_array_groups(struct gpu_array_info *array,
3016         int n, struct gpu_array_ref_group **groups)
3017 {
3018         int i, j;
3019
3020         array->n_group = n;
3021         array->groups = groups;
3022
3023         for (i = 0; i < n; ++i) {
3024                 groups[i]->nr = i;
3025
3026                 for (j = 0; j < groups[i]->n_ref; ++j)
3027                         groups[i]->refs[j]->group = i;
3028         }
3029 }
3030
3031 /* Group array references that should be considered together when
3032  * deciding whether to access them from private, shared or global memory.
3033  * Return -1 on error.
3034  *
3035  * In particular, if two array references overlap and if one of them
3036  * is a write, then the two references are grouped together.
3037  * We first perform an initial grouping based only on the access relation.
3038  * After computing shared and private memory tiles, we check for
3039  * overlapping writes again, but this time taking into account
3040  * the "last_shared" property.
3041  *
3042  * Furthermore, if two groups admit a shared memory tile and if the
3043  * combination of the two also admits a shared memory tile, we merge
3044  * the two groups.
3045  *
3046  * If the array contains structures, then there is no need to compute
3047  * reference groups since we do not map such arrays to private or shared
3048  * memory.
3049  */
3050 static int group_array_references(struct gpu_gen *gen,
3051         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
3052 {
3053         int i;
3054         int n;
3055         isl_ctx *ctx = isl_union_map_get_ctx(sched);
3056         struct gpu_array_ref_group **groups;
3057
3058         if (array->has_compound_element)
3059                 return 0;
3060
3061         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
3062                                         array->n_ref);
3063         if (!groups)
3064                 return -1;
3065
3066         n = populate_array_references(array, sched, groups);
3067
3068         n = group_overlapping_writes(gen, n, groups);
3069
3070         for (i = 0; i < n; ++i)
3071                 if (compute_group_bounds(gen, groups[i]) < 0)
3072                         n = -1;
3073
3074         n = group_last_shared_overlapping_writes(gen, n, groups);
3075
3076         n = group_common_shared_memory_tile(gen, array, n, groups);
3077
3078         set_array_groups(array, n, groups);
3079
3080         if (n >= 0)
3081                 return 0;
3082
3083         for (i = 0; i < array->n_ref; ++i)
3084                 free_array_ref_group(groups[i]);
3085         return -1;
3086 }
3087
3088 /* Take tiled_sched, project it onto the shared tile loops and
3089  * the loops that will be wrapped over the threads and
3090  * store the result in gen->shared_sched.
3091  * Also compute a projection that projects out the loops that will be
3092  * wrapped over the threads and store this projection in gen->shared_proj.
3093  */
3094 static void compute_shared_sched(struct gpu_gen *gen)
3095 {
3096         isl_space *dim;
3097         isl_map *proj;
3098         isl_set *par;
3099         isl_union_map *sched;
3100
3101         sched = isl_union_map_copy(gen->tiled_sched);
3102
3103         dim = isl_union_map_get_space(sched);
3104         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
3105         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3106
3107         dim = isl_union_map_get_space(sched);
3108         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
3109
3110         gen->shared_sched = sched;
3111         gen->shared_proj = isl_union_map_from_map(proj);
3112 }
3113
3114 /* For each scalar in the input program, check if there are any
3115  * order dependences active inside the current kernel, within
3116  * the same iteration of the host schedule.
3117  * If so, mark the scalar as force_private so that it will be
3118  * mapped to a register.
3119  */
3120 static void check_scalar_live_ranges(struct gpu_gen *gen)
3121 {
3122         int i;
3123         isl_map *proj;
3124         isl_union_map *sched;
3125         isl_union_set *domain;
3126         isl_union_map *same_host_iteration;
3127
3128         gen->any_force_private = 0;
3129
3130         if (!gen->options->live_range_reordering)
3131                 return;
3132
3133         sched = gen->shared_sched;
3134         sched = isl_union_map_universe(isl_union_map_copy(sched));
3135         domain = isl_union_map_domain(sched);
3136
3137         sched = isl_union_map_copy(gen->sched);
3138         proj = projection(isl_union_map_get_space(sched),
3139                             gen->untiled_len, gen->tile_first);
3140         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3141         same_host_iteration = isl_union_map_apply_range(sched,
3142                             isl_union_map_reverse(isl_union_map_copy(sched)));
3143
3144         for (i = 0; i < gen->prog->n_array; ++i) {
3145                 struct gpu_array_info *array = &gen->prog->array[i];
3146                 isl_union_map *order;
3147
3148                 array->force_private = 0;
3149                 if (array->n_index != 0)
3150                         continue;
3151                 order = isl_union_map_copy(array->dep_order);
3152                 order = isl_union_map_intersect_domain(order,
3153                                                     isl_union_set_copy(domain));
3154                 order = isl_union_map_intersect_range(order,
3155                                                     isl_union_set_copy(domain));
3156                 order = isl_union_map_intersect(order,
3157                                     isl_union_map_copy(same_host_iteration));
3158                 if (!isl_union_map_is_empty(order)) {
3159                         array->force_private = 1;
3160                         gen->any_force_private = 1;
3161                 }
3162                 isl_union_map_free(order);
3163         }
3164
3165         isl_union_map_free(same_host_iteration);
3166         isl_union_set_free(domain);
3167 }
3168
3169 /* Group references of all arrays in the program.
3170  */
3171 static int group_references(struct gpu_gen *gen)
3172 {
3173         int i;
3174         int r = 0;
3175         isl_union_map *sched;
3176
3177         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3178                                           isl_union_map_copy(gen->shared_proj));
3179
3180         for (i = 0; i < gen->prog->n_array; ++i) {
3181                 r = group_array_references(gen, &gen->prog->array[i], sched);
3182                 if (r < 0)
3183                         break;
3184         }
3185
3186         isl_union_map_free(sched);
3187
3188         return r;
3189 }
3190
3191 /* Free all array information that is local to the current kernel.
3192  */
3193 static void free_local_array_info(struct gpu_gen *gen)
3194 {
3195         int i, j;
3196
3197         for (i = 0; i < gen->prog->n_array; ++i) {
3198                 struct gpu_array_info *array = &gen->prog->array[i];
3199
3200                 for (j = 0; j < array->n_group; ++j)
3201                         free_array_ref_group(array->groups[j]);
3202                 free(array->groups);
3203         }
3204 }
3205
3206 /* Compute the size of a bounding box around the origin and "set",
3207  * where "set" is assumed to contain only non-negative elements.
3208  * In particular, compute the maximal value of "set" in each direction
3209  * and add one.
3210  */
3211 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
3212         __isl_keep isl_set *context)
3213 {
3214         int i, n;
3215         isl_multi_pw_aff *mpa;
3216
3217         n = isl_set_dim(set, isl_dim_set);
3218         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
3219         for (i = 0; i < n; ++i) {
3220                 isl_space *space;
3221                 isl_aff *one;
3222                 isl_pw_aff *bound;
3223
3224                 bound = isl_set_dim_max(isl_set_copy(set), i);
3225                 bound = isl_pw_aff_coalesce(bound);
3226                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3227
3228                 space = isl_pw_aff_get_domain_space(bound);
3229                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3230                 one = isl_aff_add_constant_si(one, 1);
3231                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3232                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
3233         }
3234         isl_set_free(set);
3235
3236         return mpa;
3237 }
3238
3239 /* Compute the effective grid size as a list of the sizes in each dimension.
3240  *
3241  * The grid size specified by the user or set by default
3242  * in read_grid_sizes() and applied in tile_schedule(),
3243  * may be too large for the given code in the sense that
3244  * it may contain blocks that don't need to execute anything.
3245  * We therefore don't return this grid size, but instead the
3246  * smallest grid size that ensures that all blocks that actually
3247  * execute code are included in the grid.
3248  *
3249  * We first extract a description of the grid, i.e., the possible values
3250  * of the block ids, from gen->tiled_sched.
3251  * The block ids are parameters in gen->tiled_sched.
3252  * We simply need to change them into set dimensions.
3253  *
3254  * Then, for each block dimension, we compute the maximal value of the block id
3255  * and add one.
3256  */
3257 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
3258         struct ppcg_kernel *kernel)
3259 {
3260         int i;
3261         isl_set *grid;
3262
3263         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3264         grid = isl_set_from_params(grid);
3265         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3266         for (i = 0; i < gen->n_grid; ++i) {
3267                 int pos;
3268                 isl_id *id;
3269
3270                 id = isl_id_list_get_id(kernel->block_ids, i);
3271                 pos = isl_set_find_dim_by_id(grid, isl_dim_param, id);
3272                 isl_id_free(id);
3273                 assert(pos >= 0);
3274                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3275                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3276         }
3277
3278         return extract_size(grid, kernel->context);
3279 }
3280
3281 /* Compute the size of a fixed bounding box around the origin and "set",
3282  * where "set" is assumed to contain only non-negative elements,
3283  * and store the results in "size".
3284  * In particular, compute the maximal value of "set" in each direction
3285  * and add one.
3286  */
3287 static void extract_fixed_size(__isl_take isl_set *set, int *size)
3288 {
3289         int i, n;
3290         isl_local_space *ls;
3291         isl_aff *obj;
3292
3293         n = isl_set_dim(set, isl_dim_set);
3294         ls = isl_local_space_from_space(isl_set_get_space(set));
3295         obj = isl_aff_zero_on_domain(ls);
3296         for (i = 0; i < n; ++i) {
3297                 isl_val *max;
3298
3299                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
3300                 max = isl_set_max_val(set, obj);
3301                 size[i] = isl_val_get_num_si(max) + 1;
3302                 isl_val_free(max);
3303                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
3304         }
3305         isl_aff_free(obj);
3306         isl_set_free(set);
3307 }
3308
3309 /* Compute the effective block size as a list of the sizes in each dimension
3310  * and store the sizes in kernel->block_dim.
3311  *
3312  * The block size specified by the user or set by default
3313  * in read_block_sizes() and applied in thread_tile_schedule(),
3314  * may be too large for the given code in the sense that
3315  * it may contain threads that don't need to execute anything.
3316  * We therefore don't store this block size in kernel->block_dim,
3317  * but instead the smallest block size that ensures that all threads
3318  * that actually execute code are included in the block.
3319  *
3320  * The current implementation eliminates all parameters, ensuring
3321  * that the size is a fixed constant in each dimension.
3322  * In principle we could also compute parametric sizes.
3323  * We would have to make sure to project out all b%d and t%d parameters,
3324  * however.
3325  */
3326 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3327 {
3328         int i;
3329         int nparam;
3330         isl_set *block;
3331         isl_multi_pw_aff *mpa;
3332
3333         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3334         block = isl_set_from_params(block);
3335         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3336         kernel->n_block = gen->n_block;
3337         for (i = 0; i < gen->n_block; ++i) {
3338                 int pos;
3339                 isl_id *id;
3340
3341                 id = isl_id_list_get_id(kernel->thread_ids, i);
3342                 pos = isl_set_find_dim_by_id(block, isl_dim_param, id);
3343                 isl_id_free(id);
3344                 assert(pos >= 0);
3345                 block = isl_set_equate(block, isl_dim_param, pos,
3346                                         isl_dim_set, i);
3347         }
3348         nparam = isl_set_dim(block, isl_dim_param);
3349         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3350
3351         extract_fixed_size(block, kernel->block_dim);
3352 }
3353
3354 void ppcg_kernel_free(void *user)
3355 {
3356         struct ppcg_kernel *kernel = user;
3357         int i;
3358
3359         if (!kernel)
3360                 return;
3361
3362         isl_id_list_free(kernel->block_ids);
3363         isl_id_list_free(kernel->thread_ids);
3364         isl_multi_pw_aff_free(kernel->grid_size);
3365         isl_set_free(kernel->context);
3366         isl_union_set_free(kernel->arrays);
3367         isl_space_free(kernel->space);
3368         isl_ast_node_free(kernel->tree);
3369
3370         for (i = 0; i < kernel->n_array; ++i)
3371                 isl_pw_aff_list_free(kernel->array[i].bound);
3372         free(kernel->array);
3373
3374         for (i = 0; i < kernel->n_var; ++i) {
3375                 free(kernel->var[i].name);
3376                 isl_vec_free(kernel->var[i].size);
3377         }
3378         free(kernel->var);
3379
3380         free(kernel);
3381 }
3382
3383 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3384         struct ppcg_kernel_var *var)
3385 {
3386         int j;
3387         struct gpu_array_tile *tile;
3388         isl_printer *p;
3389         char *name;
3390
3391         var->array = group->array;
3392
3393         tile = group->private_tile;
3394         var->type = ppcg_access_private;
3395         if (!tile) {
3396                 tile = group->shared_tile;
3397                 var->type = ppcg_access_shared;
3398         }
3399
3400         p = isl_printer_to_str(ctx);
3401         p = print_array_name(p, group);
3402         var->name = isl_printer_get_str(p);
3403         isl_printer_free(p);
3404
3405         var->size = isl_vec_alloc(ctx, group->array->n_index);
3406
3407         for (j = 0; j < group->array->n_index; ++j)
3408                 var->size = isl_vec_set_element_val(var->size, j,
3409                                             isl_val_copy(tile->bound[j].size));
3410 }
3411
3412 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3413 {
3414         int i, j, n;
3415
3416         n = 0;
3417         for (i = 0; i < gen->prog->n_array; ++i) {
3418                 struct gpu_array_info *array = &gen->prog->array[i];
3419
3420                 for (j = 0; j < array->n_group; ++j) {
3421                         struct gpu_array_ref_group *group = array->groups[j];
3422                         if (group->private_tile || group->shared_tile)
3423                                 ++n;
3424                 }
3425         }
3426
3427         kernel->n_var = n;
3428         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3429         assert(kernel->var);
3430
3431         n = 0;
3432         for (i = 0; i < gen->prog->n_array; ++i) {
3433                 struct gpu_array_info *array = &gen->prog->array[i];
3434
3435                 for (j = 0; j < array->n_group; ++j) {
3436                         struct gpu_array_ref_group *group = array->groups[j];
3437                         if (!group->private_tile && !group->shared_tile)
3438                                 continue;
3439                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3440                         ++n;
3441                 }
3442         }
3443 }
3444
3445 /* Replace "pa" by the zero function defined over the universe domain
3446  * in the space of "pa".
3447  */
3448 static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa)
3449 {
3450         isl_space *space;
3451         isl_aff *zero;
3452
3453         space = isl_space_domain(isl_pw_aff_get_space(pa));
3454         isl_pw_aff_free(pa);
3455         zero = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3456
3457         return isl_pw_aff_from_aff(zero);
3458 }
3459
3460 /* The sizes of the arrays on the host that have been computed by
3461  * extract_array_info may depend on the parameters.  Use the extra
3462  * constraints on the parameters that are valid at "host_domain"
3463  * to simplify these expressions and store the results in kernel->array.
3464  *
3465  * We only need these localized bounds for arrays that are accessed
3466  * by the current kernel.  If we have found at least one reference group
3467  * then the array is accessed by the kernel.  If the array has compound
3468  * elements then we skipped the construction of array reference groups.
3469  *
3470  * The resulting sizes may be functions that are nowhere defined
3471  * in case the access function cannot possibly access anything inside
3472  * the kernel for some reason.  If so, they are replaced by the zero
3473  * function.  Since the access function cannot actually access anything,
3474  * there is no harm in printing the array sizes as zero.
3475  */
3476 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3477         __isl_keep isl_set *host_domain)
3478 {
3479         int i, j;
3480         isl_set *context;
3481
3482         kernel->array = isl_calloc_array(gen->ctx,
3483                             struct gpu_local_array_info, gen->prog->n_array);
3484         assert(kernel->array);
3485         kernel->n_array = gen->prog->n_array;
3486
3487         context = isl_set_copy(host_domain);
3488         context = isl_set_params(context);
3489
3490         for (i = 0; i < gen->prog->n_array; ++i) {
3491                 struct gpu_array_info *array = &gen->prog->array[i];
3492                 isl_pw_aff_list *local;
3493
3494                 if (array->n_group == 0 && !array->has_compound_element)
3495                         continue;
3496
3497                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3498
3499                 for (j = 0; j < array->n_index; ++j) {
3500                         isl_pw_aff *pwaff;
3501                         int empty;
3502
3503                         pwaff = isl_pw_aff_copy(array->bound[j]);
3504                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3505                         empty = isl_pw_aff_is_empty(pwaff);
3506                         if (empty < 0)
3507                                 pwaff = isl_pw_aff_free(pwaff);
3508                         else if (empty)
3509                                 pwaff = set_universally_zero(pwaff);
3510                         local = isl_pw_aff_list_add(local, pwaff);
3511                 }
3512
3513                 kernel->array[i].n_index = array->n_index;
3514                 kernel->array[i].bound = local;
3515         }
3516         isl_set_free(context);
3517 }
3518
3519 /* Find the element in gen->stmt that has the given "id".
3520  * Return NULL if no such gpu_stmt can be found.
3521  */
3522 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3523 {
3524         int i;
3525
3526         for (i = 0; i < prog->n_stmts; ++i) {
3527                 if (id == prog->stmts[i].id)
3528                         break;
3529         }
3530
3531         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3532 }
3533
3534 /* Set gen->tile_len and gen->n_parallel to those of the statement
3535  * affected by the first map (part of the schedule)
3536  * on which this function is called.
3537  * Because of the way the schedule is constructed, the other statements
3538  * in the list, if any, should have the same values for these properties.
3539  */
3540 static int extract_tile_len(__isl_take isl_map *map, void *user)
3541 {
3542         struct gpu_gen *gen = (struct gpu_gen *) user;
3543         isl_id *id;
3544         struct gpu_stmt *stmt;
3545
3546         id = isl_map_get_tuple_id(map, isl_dim_in);
3547         stmt = find_stmt(gen->prog, id);
3548         isl_id_free(id);
3549
3550         isl_map_free(map);
3551
3552         if (!stmt)
3553                 isl_die(gen->ctx, isl_error_unknown,
3554                         "statement not found", return -1);
3555
3556         gen->tile_len = stmt->tile_len;
3557         gen->n_parallel = stmt->n_parallel;
3558
3559         return -1;
3560 }
3561
3562 void ppcg_kernel_stmt_free(void *user)
3563 {
3564         int i;
3565         struct ppcg_kernel_stmt *stmt = user;
3566
3567         if (!stmt)
3568                 return;
3569
3570         switch (stmt->type) {
3571         case ppcg_kernel_copy:
3572                 isl_ast_expr_free(stmt->u.c.index);
3573                 isl_ast_expr_free(stmt->u.c.local_index);
3574                 break;
3575         case ppcg_kernel_domain:
3576                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3577                 break;
3578         case ppcg_kernel_sync:
3579                 break;
3580         }
3581
3582         free(stmt);
3583 }
3584
3585 /* Set the options of "context" to
3586  *
3587  *      { space -> [x] : x >= first }
3588  */
3589 static __isl_give isl_ast_build *set_unroll(
3590         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3591         int first)
3592 {
3593         isl_ctx *ctx;
3594         isl_map *unroll;
3595         isl_union_map *opt;
3596
3597         ctx = isl_ast_build_get_ctx(build);
3598
3599         space = isl_space_from_domain(space);
3600         space = isl_space_add_dims(space, isl_dim_out, 1);
3601         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3602         unroll = isl_map_universe(space);
3603         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3604         opt = isl_union_map_from_map(unroll);
3605
3606         build = isl_ast_build_set_options(build, opt);
3607
3608         return build;
3609 }
3610
3611 /* Extend the schedule "schedule" with the part of "extension"
3612  * starting at "first" up to "len".
3613  */
3614 static __isl_give isl_union_map *extend_schedule(
3615         __isl_take isl_union_map *schedule,
3616         __isl_take isl_union_map *extension, int first, int len)
3617 {
3618         isl_space *space;
3619         isl_map *proj;
3620         isl_union_map *umap;
3621         isl_set *set;
3622
3623         space = isl_union_map_get_space(schedule);
3624         space = isl_space_set_from_params(space);
3625         space = isl_space_add_dims(space, isl_dim_set, len);
3626         proj = isl_set_identity(isl_set_universe(space));
3627         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3628         extension = isl_union_map_apply_range(extension,
3629                                                 isl_union_map_from_map(proj));
3630
3631         schedule = isl_union_map_range_product(schedule, extension);
3632
3633         return schedule;
3634 }
3635
3636 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3637  * to "ref_id".
3638  */
3639 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3640         __isl_keep isl_id *ref_id)
3641 {
3642         struct gpu_stmt_access *access;
3643
3644         for (access = accesses; access; access = access->next)
3645                 if (access->ref_id == ref_id)
3646                         return access;
3647
3648         return NULL;
3649 }
3650
3651 /* Return the index of the array called "name" in the list of arrays.
3652  */
3653 static int find_array_index(struct gpu_gen *gen, const char *name)
3654 {
3655         int i;
3656
3657         for (i = 0; i < gen->prog->n_array; ++i)
3658                 if (!strcmp(name, gen->prog->array[i].name))
3659                         return i;
3660
3661         return -1;
3662 }
3663
3664 /* Internal data structure for the index and AST expression transformation
3665  * callbacks for pet_stmt_build_ast_exprs.
3666  *
3667  * "accesses" is the list of gpu_stmt_access in the statement.
3668  * "iterator_map" expresses the statement iterators in terms of
3669  * the AST loop iterators.
3670  * "sched2shared" expresses the first shared_len dimensions of
3671  * the computed schedule in terms of the AST loop iterators.
3672  *
3673  * The following fields are set in transform_index and used in transform_expr.
3674  * "array" is the array that is being accessed.
3675  * "global" is set if the global array is accessed (rather than
3676  * shared/private memory).
3677  * "local_array" refers to information on the array specialized
3678  * to the current kernel.
3679  */
3680 struct ppcg_transform_data {
3681         struct gpu_gen *gen;
3682         struct gpu_stmt_access *accesses;
3683         isl_pw_multi_aff *iterator_map;
3684         isl_pw_multi_aff *sched2shared;
3685
3686         struct gpu_array_info *array;
3687         int global;
3688         struct gpu_local_array_info *local_array;
3689 };
3690
3691 /* Return the name of the outer array (of structs) accessed by "access".
3692  */
3693 static const char *get_outer_array_name(__isl_keep isl_map *access)
3694 {
3695         isl_space *space;
3696         const char *name;
3697
3698         space = isl_space_range(isl_map_get_space(access));
3699         while (space && isl_space_is_wrapping(space))
3700                 space = isl_space_domain(isl_space_unwrap(space));
3701         name = isl_space_get_tuple_name(space, isl_dim_set);
3702         isl_space_free(space);
3703
3704         return name;
3705 }
3706
3707 /* Index transformation callback for pet_stmt_build_ast_exprs.
3708  *
3709  * "index" expresses the array indices in terms of statement iterators
3710  *
3711  * We first reformulate "index" in terms of the AST loop iterators.
3712  * Then we check if we are accessing the global array or
3713  * a shared/private copy.  In the former case, we simply return
3714  * the updated index.  If "index" is an affine expression rather
3715  * than an array access, then we also return the updated index here.
3716  *
3717  * If no reference groups have been computed for the array,
3718  * then we can only be accessing the global array.
3719  *
3720  * Otherwise, we apply the tiling to the index.
3721  * This tiling is of the form
3722  *
3723  *      [D -> A] -> T
3724  *
3725  * The index is of the form
3726  *
3727  *      L -> A
3728  *
3729  * We update the tiling to refer to the AST loop iterators
3730  *
3731  *      [L -> A] -> T
3732  *
3733  * and modify index to keep track of those iterators
3734  *
3735  *      L -> [L -> A]
3736  *
3737  * Combining these two yields a tiled index expression in terms
3738  * of the AST loop iterators
3739  *
3740  *      L -> T
3741  */
3742 static __isl_give isl_multi_pw_aff *transform_index(
3743         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3744         void *user)
3745 {
3746         struct ppcg_transform_data *data = user;
3747         struct gpu_stmt_access *access;
3748         struct gpu_array_ref_group *group;
3749         struct gpu_array_tile *tile;
3750         isl_pw_multi_aff *iterator_map;
3751         int i;
3752         const char *name;
3753         isl_space *space;
3754         isl_multi_pw_aff *tiling;
3755         isl_pw_multi_aff *pma;
3756         isl_multi_pw_aff *mpa;
3757
3758         data->array = NULL;
3759
3760         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3761         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3762
3763         access = find_access(data->accesses, ref_id);
3764         if (!access)
3765                 return index;
3766         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3767                 return index;
3768
3769         name = get_outer_array_name(access->access);
3770         i = find_array_index(data->gen, name);
3771         if (i < 0)
3772                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3773                         "cannot find array",
3774                         return isl_multi_pw_aff_free(index));
3775         data->array = &data->gen->prog->array[i];
3776         data->local_array = &data->gen->kernel->array[i];
3777
3778         if (access->group < 0) {
3779                 data->global = 1;
3780                 return index;
3781         }
3782
3783         group = data->array->groups[access->group];
3784         tile = group->private_tile;
3785         if (!tile)
3786                 tile = group->shared_tile;
3787         data->global = !tile;
3788         if (!tile)
3789                 return index;
3790
3791         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3792         space = isl_space_map_from_set(space);
3793         pma = isl_pw_multi_aff_identity(space);
3794         pma = isl_pw_multi_aff_product(
3795                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3796         tiling = isl_multi_pw_aff_from_multi_aff(
3797                                     isl_multi_aff_copy(tile->tiling));
3798         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3799
3800         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3801         space = isl_space_map_from_set(space);
3802         mpa = isl_multi_pw_aff_identity(space);
3803         index = isl_multi_pw_aff_range_product(mpa, index);
3804         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3805
3806         return index;
3807 }
3808
3809 /* Dereference "expr" by adding an index [0].
3810  * The original "expr" is assumed not to have any indices.
3811  *
3812  * If "expr" is a member access, then the dereferencing needs
3813  * to be applied to the structure argument of this member access.
3814  */
3815 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3816 {
3817         isl_ctx *ctx;
3818         isl_ast_expr *arg0, *res;
3819         isl_ast_expr_list *list;
3820
3821         arg0 = isl_ast_expr_get_op_arg(expr, 0);
3822         if (!arg0)
3823                 return isl_ast_expr_free(expr);
3824         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
3825             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
3826                 isl_ast_expr *arg;
3827
3828                 arg = isl_ast_expr_get_op_arg(arg0, 0);
3829                 arg = dereference(arg);
3830                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
3831                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
3832
3833                 return expr;
3834         }
3835         isl_ast_expr_free(arg0);
3836
3837         ctx = isl_ast_expr_get_ctx(expr);
3838         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3839         list = isl_ast_expr_list_from_ast_expr(res);
3840         res = isl_ast_expr_get_op_arg(expr, 0);
3841         res = isl_ast_expr_access(res, list);
3842         isl_ast_expr_free(expr);
3843
3844         return res;
3845 }
3846
3847 /* Linearize the index expression "expr" based on the array bounds
3848  * of "array".
3849  *
3850  * That is, transform expression
3851  *
3852  *      A[i_0][i_1]...[i_n]
3853  *
3854  * to
3855  *
3856  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3857  *
3858  * where b_0, b_1, ..., b_n are the bounds on the array.
3859  *
3860  * If the base of "expr" is a member access, then the linearization needs
3861  * to be applied to the structure argument of this member access.
3862  *
3863  * In the base case, if "expr" has no arguments (other than the name of
3864  * the array), then we are passing an entire array to a function.
3865  * In this case, there is nothing to linearize.
3866  * Note that at this point an expression with no arguments can
3867  * only be an entire array because the scalar case and
3868  * the case of single struct are handled by the caller.
3869  *
3870  * If the number of specified index expressions in "expr"
3871  * is smaller than the dimension of the accessed array,
3872  * then the missing i_j also do not appear in the linearized expression.
3873  * Furthermore, since such an expression does not refer to a single
3874  * element while the default linearized expression would refer to
3875  * a single element, we return the expression
3876  *
3877  *      A + (..((i_0 * b_1 + i_1) ... ) * b_n]
3878  *
3879  * instead.  Note that because of the special case handling above,
3880  * we can assume here that here that there is at least one index expression.
3881  */
3882 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3883         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3884 {
3885         int i, n;
3886         isl_ctx *ctx;
3887         isl_set *context;
3888         isl_ast_expr *arg0;
3889         isl_ast_expr *res;
3890         isl_ast_expr_list *list;
3891         isl_ast_build *build;
3892
3893         arg0 = isl_ast_expr_get_op_arg(expr, 0);
3894         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
3895             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
3896                 isl_ast_expr *arg;
3897
3898                 arg = isl_ast_expr_get_op_arg(arg0, 0);
3899                 arg = gpu_local_array_info_linearize_index(array, arg);
3900                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
3901                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
3902
3903                 return expr;
3904         }
3905         isl_ast_expr_free(arg0);
3906
3907         if (isl_ast_expr_get_op_n_arg(expr) == 1)
3908                 return expr;
3909
3910         ctx = isl_ast_expr_get_ctx(expr);
3911         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3912         build = isl_ast_build_from_context(context);
3913
3914         n = isl_ast_expr_get_op_n_arg(expr);
3915         res = isl_ast_expr_get_op_arg(expr, 1);
3916         for (i = 1; i < array->n_index; ++i) {
3917                 isl_pw_aff *bound_i;
3918                 isl_ast_expr *expr_i;
3919
3920                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i);
3921                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3922                 res = isl_ast_expr_mul(res, expr_i);
3923
3924                 if (i + 1 >= n)
3925                         continue;
3926                 expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
3927                 res = isl_ast_expr_add(res, expr_i);
3928         }
3929
3930         isl_ast_build_free(build);
3931
3932         if (1 + array->n_index > n) {
3933                 res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res);
3934         } else {
3935                 list = isl_ast_expr_list_from_ast_expr(res);
3936                 res = isl_ast_expr_get_op_arg(expr, 0);
3937                 res = isl_ast_expr_access(res, list);
3938         }
3939
3940         isl_ast_expr_free(expr);
3941
3942         return res;
3943 }
3944
3945 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3946  *
3947  * If the AST expression refers to an array that is not accessed
3948  * at all, then this means the value of the expression is not used,
3949  * so we might as well print zero (NULL pointer) instead.
3950  *
3951  * If the AST expression refers to a global scalar that is not
3952  * a read-only scalar, then its address was passed to the kernel and
3953  * we need to dereference it.
3954  *
3955  * If the AST expression refers to an access to a global array,
3956  * then we linearize the access exploiting the bounds in data->local_array.
3957  */
3958 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3959         __isl_keep isl_id *id, void *user)
3960 {
3961         struct ppcg_transform_data *data = user;
3962
3963         if (!data->array)
3964                 return expr;
3965         if (!data->array->accessed) {
3966                 isl_ctx *ctx;
3967
3968                 ctx = isl_ast_expr_get_ctx(expr);
3969                 isl_ast_expr_free(expr);
3970                 return isl_ast_expr_from_val(isl_val_zero(ctx));
3971         }
3972         if (gpu_array_is_read_only_scalar(data->array))
3973                 return expr;
3974         if (!data->global)
3975                 return expr;
3976         if (data->array->n_index == 0)
3977                 return dereference(expr);
3978         if (!data->array->linearize)
3979                 return expr;
3980
3981         return gpu_local_array_info_linearize_index(data->local_array, expr);
3982 }
3983
3984 /* This function is called for each instance of a user statement
3985  * in the kernel.
3986  *
3987  * We attach a struct ppcg_kernel_stmt to the "node", containing
3988  * a computed AST expression for each access.
3989  * These AST expressions are computed from iterator_map,
3990  * which expresses the domain
3991  * elements in terms of the generated loops, and sched2shared,
3992  * which expresses the first shared_len dimensions of the schedule
3993  * computed by PPCG in terms of the generated loops.
3994  */
3995 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3996         __isl_keep isl_ast_build *build, void *user)
3997 {
3998         struct ppcg_transform_data data;
3999         struct gpu_gen *gen = (struct gpu_gen *) user;
4000         struct ppcg_kernel_stmt *stmt;
4001         isl_id *id;
4002         isl_pw_multi_aff *sched2shared;
4003         isl_map *map;
4004         isl_pw_multi_aff *iterator_map;
4005         isl_ast_expr *expr, *arg;
4006         isl_union_map *schedule;
4007
4008         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4009         if (!stmt)
4010                 return isl_ast_node_free(node);
4011
4012         expr = isl_ast_node_user_get_expr(node);
4013         arg = isl_ast_expr_get_op_arg(expr, 0);
4014         id = isl_ast_expr_get_id(arg);
4015
4016         schedule = isl_ast_build_get_schedule(build);
4017         map = isl_map_reverse(isl_map_from_union_map(schedule));
4018         iterator_map = isl_pw_multi_aff_from_map(map);
4019         sched2shared = compute_sched_to_shared(gen,
4020                                         isl_pw_multi_aff_copy(iterator_map));
4021
4022         stmt->type = ppcg_kernel_domain;
4023         stmt->u.d.stmt = find_stmt(gen->prog, id);
4024         if (!stmt->u.d.stmt)
4025                 isl_die(gen->ctx, isl_error_internal,
4026                         "statement not found", goto error);
4027
4028         data.gen = gen;
4029         data.accesses = stmt->u.d.stmt->accesses;
4030         data.iterator_map = iterator_map;
4031         data.sched2shared = sched2shared;
4032         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
4033                                             build, &transform_index, &data,
4034                                             &transform_expr, &data);
4035
4036         isl_id_free(id);
4037         isl_pw_multi_aff_free(iterator_map);
4038         isl_pw_multi_aff_free(sched2shared);
4039         isl_ast_expr_free(arg);
4040         isl_ast_expr_free(expr);
4041
4042         id = isl_id_alloc(gen->ctx, NULL, stmt);
4043         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4044         return isl_ast_node_set_annotation(node, id);
4045 error:
4046         isl_id_free(id);
4047         isl_pw_multi_aff_free(iterator_map);
4048         ppcg_kernel_stmt_free(stmt);
4049         isl_pw_multi_aff_free(sched2shared);
4050         return isl_ast_node_free(node);
4051 }
4052
4053 /* This function is called when code has been generated for the shared
4054  * tile loops.  The "schedule" refers only to the original statements.
4055  *
4056  * We extend the schedule with that part of gen->local_sched that hasn't
4057  * been taken into account yet.  This introduces parameters referring
4058  * to thread ids in the schedule, so we add them (with the appropriate
4059  * bounds to the context as well).
4060  * Finally, we set the appropriate unrolling options
4061  * if gen->first_unroll is set.
4062  */
4063 static __isl_give isl_ast_node *create_domain_leaf(
4064         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
4065         void *user)
4066 {
4067         struct gpu_gen *gen = (struct gpu_gen *) user;
4068         isl_space *space;
4069         isl_union_map *sched;
4070         isl_ast_node *tree;
4071         isl_set *set;
4072         isl_id_list *iterators;
4073         int n;
4074
4075         schedule = extend_schedule(schedule,
4076                         isl_union_map_copy(gen->local_sched),
4077                         gen->shared_len, gen->thread_tiled_len);
4078
4079         space = isl_ast_build_get_schedule_space(build);
4080         set = isl_set_universe(space);
4081         set = add_bounded_parameters(set, gen->kernel->block_dim,
4082                                         gen->kernel->thread_ids);
4083         build = isl_ast_build_restrict(build, set);
4084
4085         n = gen->thread_tiled_len - gen->shared_len;
4086
4087         if (gen->first_unroll >= 0) {
4088                 space = isl_space_set_alloc(gen->ctx, 0, n);
4089                 build = set_unroll(build, space, gen->first_unroll);
4090         }
4091         iterators = ppcg_scop_generate_names(gen->prog->scop, n, "c");
4092         build = isl_ast_build_set_iterators(build, iterators);
4093         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
4094         tree = isl_ast_build_ast_from_schedule(build, schedule);
4095         isl_ast_build_free(build);
4096
4097         return tree;
4098 }
4099
4100 /* This function is called for each statement node in the AST of the code
4101  * for copying to or from shared/private memory.
4102  * Attach a pointer to a ppcg_kernel_stmt representing the copy
4103  * statement to the node.
4104  * The statement name is "read" or "write", depending on whether we are
4105  * reading from global memory or writing to global memory.
4106  * The name of the T space is {shared,private}_<array>.
4107  *
4108  * The schedule is of the form
4109  *
4110  *      type[A -> T] -> L
4111  *
4112  * where A refers to a piece of an array and T to the corresponding
4113  * shifted tile.  We split this schedule into mappings L -> A and L -> T
4114  * and store the corresponding expressions in stmt->index and stmt->local_index,
4115  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
4116  */
4117 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
4118         __isl_keep isl_ast_build *build, void *user)
4119 {
4120         struct gpu_gen *gen = (struct gpu_gen *) user;
4121         struct ppcg_kernel_stmt *stmt;
4122         isl_id *id;
4123         isl_ast_expr *expr;
4124         isl_space *space;
4125         isl_map *access, *local_access, *map;
4126         isl_pw_multi_aff *pma;
4127         const char *type;
4128         int array_index;
4129
4130         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4131         if (!stmt)
4132                 return isl_ast_node_free(node);
4133
4134         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
4135         type = isl_map_get_tuple_name(access, isl_dim_in);
4136         stmt->u.c.read = !strcmp(type, "read");
4137         access = isl_map_reverse(access);
4138         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
4139         local_access = isl_map_copy(access);
4140
4141         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
4142         id = isl_map_get_tuple_id(access, isl_dim_out);
4143         map = isl_map_set_tuple_id(map, isl_dim_in, id);
4144         access = isl_map_apply_range(access, map);
4145         pma = isl_pw_multi_aff_from_map(access);
4146         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
4147         stmt->u.c.index = expr;
4148
4149         map = isl_map_range_map(isl_map_universe(space));
4150         id = isl_map_get_tuple_id(local_access, isl_dim_out);
4151         map = isl_map_set_tuple_id(map, isl_dim_in, id);
4152         local_access = isl_map_apply_range(local_access, map);
4153         pma = isl_pw_multi_aff_from_map(local_access);
4154         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
4155         stmt->u.c.local_index = expr;
4156
4157         stmt->u.c.array = gen->copy_group->array;
4158         array_index = stmt->u.c.array - gen->prog->array;
4159         stmt->u.c.local_array = &gen->kernel->array[array_index];
4160         stmt->type = ppcg_kernel_copy;
4161
4162         id = isl_id_alloc(gen->ctx, NULL, stmt);
4163         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4164         return isl_ast_node_set_annotation(node, id);
4165 }
4166
4167 /* Given a schedule of the form
4168  *
4169  *      [S -> A] -> L
4170  *
4171  * (with S the first shared_len dimensions of the computed schedule,
4172  * A the array and L the schedule correponding to the generated loops),
4173  * indicating where to copy the array elements that need to be copied,
4174  * construct code for performing the copying.
4175  *
4176  * "group" is the array reference group that is being copied
4177  * "type" is either "read" or "write"
4178  * private is set if copying needs to be performed to/from registers
4179  *
4180  * We first construct a mapping to a shifted tile of the array,
4181  *
4182  *      [S -> A] -> T(S,A)                                      (1)
4183  *
4184  * If private is set, then we also use this mapping as a schedule
4185  * (which is already thread-specific and will be completely unrolled).
4186  * Otherwise, we wrap/tile the range over the threads.
4187  * The result is
4188  *
4189  *      [S -> A] -> T'(S,A)
4190  *
4191  * Combined with the given schedule, we have
4192  *
4193  *      [S -> A] -> [L -> T'(S,A)]                              (2)
4194  *
4195  * From the shifted tile mapping, we construct a mapping
4196  *
4197  *      [S -> A] -> [A -> T(S,A)]
4198  *
4199  * and apply it to the schedule (2), obtaining
4200  *
4201  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
4202  *
4203  * Note that we can project out S because it is uniquely defined by L.
4204  */
4205 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
4206         __isl_take isl_map *sched,
4207         const char *type, struct gpu_array_ref_group *group,
4208         __isl_take isl_ast_build *build, int private)
4209 {
4210         isl_space *space;
4211         isl_ast_node *tree;
4212         isl_map *schedule, *shift, *map;
4213         isl_set *set;
4214         isl_id_list *iterators;
4215         int n;
4216
4217         shift = shift_access(group);
4218
4219         schedule = isl_map_copy(shift);
4220         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
4221         if (!private)
4222                 schedule = tile_access_schedule(gen, schedule);
4223
4224         n = isl_map_dim(schedule, isl_dim_out);
4225         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
4226         set = add_bounded_parameters(set, gen->kernel->block_dim,
4227                                         gen->kernel->thread_ids);
4228
4229         schedule = isl_map_range_product(sched, schedule);
4230
4231         space = isl_space_domain(isl_map_get_space(shift));
4232         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
4233         map = isl_map_range_product(map, shift);
4234
4235         schedule = isl_map_apply_domain(schedule, map);
4236
4237         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
4238
4239         build = isl_ast_build_restrict(build, set);
4240
4241         gen->copy_group = group;
4242
4243         if (private) {
4244                 space = isl_space_range(isl_map_get_space(schedule));
4245                 space = isl_space_range(isl_space_unwrap(space));
4246                 build = set_unroll(build, space, 0);
4247         }
4248         iterators = ppcg_scop_generate_names(gen->prog->scop, n, "c");
4249         build = isl_ast_build_set_iterators(build, iterators);
4250         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
4251         tree = isl_ast_build_ast_from_schedule(build,
4252                                             isl_union_map_from_map(schedule));
4253         isl_ast_build_free(build);
4254
4255         return tree;
4256 }
4257
4258 /* Return code for reading into or writing from shared memory
4259  * the given array reference group.
4260  *
4261  * If we are performing a read from global memory to shared memory and
4262  * if the array involved is not a scalar, then we copy
4263  * the entire tile to shared memory.  This may result in some extra
4264  * elements getting copied, but it should lead to simpler code
4265  * (which means that fewer registers may be needed) and less divergence.
4266  *
4267  * Otherwise, we only copy the elements that will be read or have been written
4268  * in the kernel.
4269  *
4270  *
4271  * The input "sched" is of the form.
4272  *
4273  *      type[S -> A] -> L
4274  *
4275  * with S the first shared_len dimensions of the computed schedule,
4276  * A the array and L the schedule correponding to the generated loops.
4277  *
4278  * We first drop "type",
4279  *
4280  *      [S -> A] -> L
4281  *
4282  * If the above conditions are satisfied, we project out A,
4283  * resulting in
4284  *
4285  *      S -> L
4286  *
4287  * and then introduce the group tile [S -> T], resulting in
4288  *
4289  *      [S -> T] -> L
4290  */
4291 static __isl_give isl_ast_node *copy_group_shared_accesses(
4292         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4293         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4294 {
4295         const char *type;
4296         int read;
4297         isl_union_map *access;
4298
4299         type = isl_map_get_tuple_name(sched, isl_dim_in);
4300         read = !strcmp(type, "read");
4301
4302         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4303
4304         if (read && !gpu_array_is_scalar(group->array)) {
4305                 isl_space *space;
4306                 isl_map *map;
4307
4308                 space = isl_space_domain(isl_map_get_space(sched));
4309                 space = isl_space_unwrap(space);
4310                 map = isl_map_domain_map(isl_map_universe(space));
4311                 sched = isl_map_apply_domain(sched, map);
4312
4313                 map = group_tile(group);
4314                 map = isl_map_reverse(isl_map_domain_map(map));
4315                 sched = isl_map_apply_domain(sched, map);
4316         }
4317
4318         return copy_access(gen, sched, type, group, build, 0);
4319 }
4320
4321 /* Return code for reading into or writing from private memory
4322  * the given array reference group.
4323  *
4324  * Let S be the first shared_len dimensions of the computed schedule,
4325  * D the iteration domains, A the array and L the schedule correponding
4326  * to the generated loops.
4327  * "sched" is of the form
4328  *
4329  *      type[S -> A] -> L
4330  *
4331  * where type is either "read" or "write".
4332  * We apply the privatization D -> S(t), with t the thread ids,
4333  * to the access relation D -> A to obtain the privatized access relation
4334  *
4335  *      S(t) -> A
4336  *
4337  * We drop the type from "sched" and intersect with the privatized access
4338  * relation to obtain
4339  *
4340  *      [S(t) -> A] -> L
4341  */
4342 static __isl_give isl_ast_node *copy_group_private_accesses(
4343         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4344         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4345 {
4346         const char *type;
4347         int read;
4348         isl_union_map *priv;
4349         isl_union_map *access;
4350         isl_map *access_map;
4351
4352         type = isl_map_get_tuple_name(sched, isl_dim_in);
4353         read = !strcmp(type, "read");
4354
4355         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
4356         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
4357                                         priv);
4358
4359         access = group_access_relation(group, read, !read);
4360         access = isl_union_map_apply_domain(access, priv);
4361         access_map = isl_map_from_union_map(access);
4362
4363         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4364         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
4365
4366         return copy_access(gen, sched, type, group, build, 1);
4367 }
4368
4369 /* Return code for reading into or writing from shared or private memory.
4370  *
4371  * "schedule" is of the form
4372  *
4373  *      type[S -> A] -> L
4374  *
4375  * with S be the first shared_len dimensions of the computed schedule,
4376  * A the array and L the schedule correponding to the generated loops.
4377  * The array reference group is attached to "type".
4378  */
4379 static __isl_give isl_ast_node *create_access_leaf(
4380         struct gpu_gen *gen, __isl_take isl_map *schedule,
4381         __isl_take isl_ast_build *build)
4382 {
4383         struct gpu_array_ref_group *group;
4384         isl_id *id;
4385
4386         id = isl_map_get_tuple_id(schedule, isl_dim_in);
4387         group = isl_id_get_user(id);
4388         isl_id_free(id);
4389
4390         if (group->private_tile)
4391                 return copy_group_private_accesses(gen, group, schedule,
4392                                                         build);
4393         else
4394                 return copy_group_shared_accesses(gen, group, schedule,
4395                                                         build);
4396 }
4397
4398 /* Create a domain node representing a synchronization.
4399  */
4400 static __isl_give isl_ast_node *create_sync_leaf(
4401         struct gpu_gen *gen, __isl_take isl_map *schedule,
4402         __isl_take isl_ast_build *build)
4403 {
4404         struct ppcg_kernel_stmt *stmt;
4405         isl_id *id;
4406         isl_space *space;
4407         isl_ast_node *node;
4408         isl_ast_expr *expr;
4409
4410         isl_map_free(schedule);
4411
4412         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4413         if (!stmt)
4414                 return NULL;
4415
4416         stmt->type = ppcg_kernel_sync;
4417
4418         space = isl_ast_build_get_schedule_space(build);
4419         space = isl_space_from_domain(space);
4420         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
4421         expr = isl_ast_build_call_from_pw_multi_aff(build,
4422                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
4423         node = isl_ast_node_alloc_user(expr);
4424         isl_ast_build_free(build);
4425
4426         id = isl_id_alloc(gen->ctx, NULL, stmt);
4427         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4428         return isl_ast_node_set_annotation(node, id);
4429 }
4430
4431 /* This function is called during the code generation at the point
4432  * where the schedule domain element is completely determined by
4433  * the generated code.  The input schedule contains the original
4434  * statements as well as synchronization and copy "statements".
4435  * The latter are scheduled at different points than any of the original
4436  * statements, so they will only arrive here in isolation.
4437  *
4438  * If the current schedule only refers to a single statement,
4439  * we check if it is a copy or synchronization statement and
4440  * call the appropriate functions.
4441  * Otherwise, we assume we are dealing with the original statements
4442  * and we call create_domain_leaf.
4443  */
4444 static __isl_give isl_ast_node *create_kernel_leaf(
4445         __isl_take isl_ast_build *build, void *user)
4446 {
4447         struct gpu_gen *gen = (struct gpu_gen *) user;
4448         isl_map *map;
4449         isl_union_map *schedule;
4450         const char *name;
4451
4452         schedule = isl_ast_build_get_schedule(build);
4453
4454         if (isl_union_map_n_map(schedule) != 1)
4455                 return create_domain_leaf(schedule, build, user);
4456
4457         map = isl_map_from_union_map(schedule);
4458         name = isl_map_get_tuple_name(map, isl_dim_in);
4459         if (!strcmp(name, "read") || !strcmp(name, "write"))
4460                 return create_access_leaf(gen, map, build);
4461         if (!strcmp(name, "sync"))
4462                 return create_sync_leaf(gen, map, build);
4463
4464         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4465 }
4466
4467 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4468  * have value 0) and all even schedule dimensions as "unroll".
4469  *
4470  * That is, the options look as follows
4471  *
4472  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4473  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4474  *
4475  * The even positions are used to be able to schedule copying blocks
4476  * and synchronization before or after each level of the shared memory
4477  * tile loops and we want to make sure that code for these is generated
4478  * separately (within each level).
4479  */
4480 static __isl_give isl_ast_build *set_atomic_and_unroll(
4481         __isl_take isl_ast_build *build,
4482         __isl_take isl_space *space, int sched_len)
4483 {
4484         isl_ctx *ctx;
4485         isl_map *map;
4486         isl_constraint *c;
4487         isl_union_map *opt;
4488         isl_local_space *ls;
4489         int i, n;
4490
4491         ctx = isl_ast_build_get_ctx(build);
4492
4493         space = isl_space_params(space);
4494         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4495         space = isl_space_from_domain(space);
4496         space = isl_space_add_dims(space, isl_dim_out, 2);
4497         map = isl_map_universe(isl_space_copy(space));
4498         for (i = 0; i < sched_len; i += 2)
4499                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4500         ls = isl_local_space_from_space(isl_map_get_space(map));
4501         c = isl_equality_alloc(ls);
4502         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4503         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4504         c = isl_constraint_set_constant_si(c, 1);
4505         map = isl_map_add_constraint(map, c);
4506         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4507         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4508         opt = isl_union_map_from_map(map);
4509
4510         map = isl_map_universe(space);
4511         ls = isl_local_space_from_space(isl_map_get_space(map));
4512         c = isl_equality_alloc(ls);
4513         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4514         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4515         map = isl_map_add_constraint(map, c);
4516         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4517         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4518         opt = isl_union_map_add_map(opt, map);
4519
4520         build = isl_ast_build_set_options(build, opt);
4521
4522         return build;
4523 }
4524
4525 /* Return a map that maps a space of dimension gen->shared_len
4526  * to its last dimensions starting at gen->tile_first.
4527  * The range is of dimension
4528  *
4529  *      2 * (gen->shared_len - gen->tile_first) + 1
4530  *
4531  * The input dimensions are mapped to the odd dimensions in the output,
4532  * while the even dimensions (except 2*pos) are fixed to 0.
4533  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4534  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4535  * are mapped to the output.  The remaining input dimensions are projected
4536  * out and the corresponding output dimensions are fixed to 0.
4537  */
4538 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4539         __isl_take isl_space *space, int pos, int val)
4540 {
4541         int i, n;
4542         isl_map *proj;
4543
4544         space = isl_space_set_from_params(space);
4545         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4546         space = isl_space_map_from_set(space);
4547         proj = isl_map_identity(space);
4548         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4549         n = gen->shared_len - gen->tile_first;
4550         for (i = 0; i <= n; ++i) {
4551                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4552                 if (i == pos)
4553                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4554                 else
4555                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4556         }
4557
4558         if (pos < 0)
4559                 return proj;
4560
4561         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4562                                 gen->shared_len - (gen->tile_first + pos));
4563         for (i = pos; i < n; ++i)
4564                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4565
4566         return proj;
4567 }
4568
4569 /* Given the AST context schedule "schedule" and the mapping from
4570  * domains to the shared tile loops "shared_sched", add a schedule
4571  * for a synchronization operation at position "val" of loop level "pos".
4572  *
4573  * schedule is of the form
4574  *
4575  *      D -> L
4576  *
4577  * (with D the iteration domains and L the already generated loops),
4578  * while shared_sched is of the form
4579  *
4580  *      D -> S
4581  *
4582  * We combine them into
4583  *
4584  *      L -> S
4585  *
4586  * apply a mapping
4587  *
4588  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4589  *
4590  * and use the result as a schedule for "sync".
4591  */
4592 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4593         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4594         __isl_keep isl_union_map *shared_sched, int pos, int val)
4595 {
4596         isl_space *space;
4597         isl_map *proj, *map;
4598
4599         shared_sched = isl_union_map_copy(shared_sched);
4600         schedule = isl_union_map_copy(schedule);
4601
4602         space = isl_union_map_get_space(shared_sched);
4603         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4604         map = isl_map_from_union_map(schedule);
4605
4606         proj = insert_even(gen, space, pos, val);
4607         map = isl_map_apply_range(map, proj);
4608         map = isl_map_from_range(isl_map_wrap(map));
4609         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4610
4611         res = isl_union_map_add_map(res, map);
4612
4613         return res;
4614 }
4615
4616 /* Given a set of wrapped references "ref", return the corresponding
4617  * access relations based on the tagged access relations "tagged".
4618  *
4619  * The elements of "ref" are of the form
4620  *
4621  *      [D -> R]
4622  *
4623  * with D an iteration domains and R a reference.
4624  * The elements of "tagged" are of the form
4625  *
4626  *      [D -> R] -> A
4627  *
4628  * with A an array.
4629  *
4630  * Extend "tagged" to include the iteration domain in the range, i.e.,
4631  *
4632  *      [D -> R] -> [D -> A]
4633  *
4634  * apply the result to "ref" and then unwrap the resulting set
4635  * to obtain relations of the form
4636  *
4637  *      D -> A
4638  */
4639 static __isl_give isl_union_map *wrapped_reference_to_access(
4640         __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
4641 {
4642         isl_union_map *tag2access;
4643
4644         tag2access = isl_union_map_copy(tagged);
4645         tag2access = isl_union_map_universe(tag2access);
4646         tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
4647         tag2access = isl_union_map_domain_map(tag2access);
4648         tag2access = isl_union_map_range_product(tag2access, tagged);
4649
4650         ref = isl_union_set_coalesce(ref);
4651         ref = isl_union_set_apply(ref, tag2access);
4652
4653         return isl_union_set_unwrap(ref);
4654 }
4655
4656 /* Given an access relation "access" from "group", remove those reads
4657  * if ("read" is 1) or writes (if "read" is 0) that are only needed to
4658  * communicate data within the same iteration of the last_shared dimension
4659  * of the group.
4660  *
4661  * If the access is a read then it is either an element of
4662  *
4663  *      live_in union (range flow)
4664  *
4665  * where live_in and flow may be overapproximations, or
4666  * it reads an uninitialized value (that is not live-in because
4667  * there is an intermediate kill) or it reads a value that was
4668  * written within the same (compound) statement instance.
4669  * If the access is a write then it is either an element of
4670  *
4671  *      live_out union (domain flow)
4672  *
4673  * or it writes a value that is never read (and is not live-out
4674  * because of an intermediate kill) or only
4675  * within the same (compound) statement instance.
4676  * In both cases, the access relation is also a subset of
4677  * the group access relation.
4678  *
4679  * The cases where an uninitialized value is read or a value is written
4680  * that is never read or where the dataflow occurs within a statement
4681  * instance are also considered local and may also be removed.
4682  *
4683  * Essentially, we compute the intersection of "access" with either
4684  *
4685  *      live_in union (range non-local-flow)
4686  *
4687  * or
4688  *
4689  *      live_out union (domain non-local-flow)
4690  *
4691  * We first construct a relation "local"
4692  *
4693  *      [[D -> R] -> [D' -> R']]
4694  *
4695  * of pairs of domain iterations accessing the reference group
4696  * and references in the group that are scheduled to the same iteration
4697  * of the last_shared dimension.
4698  *
4699  * If this relation does not intersect the dataflow dependences,
4700  * then there is nothing we can possibly remove, unless the dataflow
4701  * dependences themselves only relate a subset of the accesses.
4702  * In particular, the accesses may not be involved in any dataflow
4703  * dependences, either because they are uninitialized reads/dead writes
4704  * or because the dataflow occurs inside a statement instance.
4705  *
4706  * Since the computation below may break up the access relation
4707  * into smaller pieces, we only perform the intersection with
4708  * the non-local dependent accesses if the local pairs
4709  * intersect the dataflow dependences.  Otherwise, we intersect
4710  * with the universe of the non-local dependent accesses.
4711  * This should at least remove accesses from statements that
4712  * do not participate in any dependences.
4713  *
4714  * In particular, we remove the "local" dataflow dependences from
4715  * the set of all dataflow dependences.
4716  * Note that if the potential dataflow dependences are an overapproximation
4717  * of the actual dataflow dependences, then the result remains an
4718  * overapproximation of the non-local dataflow dependences.
4719  * Copying to/from global memory is only needed for the references
4720  * in the domain/range of the result or for accesses that are live out/in
4721  * for the entire scop.
4722  *
4723  * We therefore map the domain/range of the "external" relation
4724  * to the corresponding access relation and take the union with
4725  * the live out/in relation.
4726  */
4727 static __isl_give isl_union_map *remove_local_accesses(struct gpu_gen *gen,
4728         struct gpu_array_ref_group *group, __isl_take isl_union_map *access,
4729         int read)
4730 {
4731         int empty;
4732         isl_union_map *tagger;
4733         isl_union_set *domain;
4734         isl_space *space;
4735         isl_union_map *sched, *local, *tagged, *external;
4736         isl_union_set *tag_set;
4737         isl_map *proj;
4738
4739         if (isl_union_map_is_empty(access))
4740                 return access;
4741
4742         tagged = group_tagged_access_relation(group);
4743
4744         sched = isl_union_map_copy(gen->sched);
4745
4746         space = isl_union_map_get_space(sched);
4747         proj = projection(space, gen->untiled_len, group->last_shared + 1);
4748         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4749
4750         tagger = isl_union_map_copy(gen->prog->scop->tagger);
4751         domain = isl_union_map_domain(isl_union_map_copy(tagged));
4752         tagger = isl_union_map_intersect_range(tagger, domain);
4753         sched = isl_union_map_apply_domain(sched, tagger);
4754
4755         local = isl_union_map_apply_range(sched,
4756                             isl_union_map_reverse(isl_union_map_copy(sched)));
4757         local = isl_union_map_intersect(local,
4758                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow));
4759
4760         empty = isl_union_map_is_empty(local);
4761
4762         external = isl_union_map_copy(gen->prog->scop->tagged_dep_flow);
4763         external = isl_union_map_intersect_params(external,
4764                                 isl_set_copy(gen->prog->scop->context));
4765         external = isl_union_map_subtract(external, local);
4766
4767         if (read) {
4768                 tag_set = isl_union_map_range(external);
4769                 external = wrapped_reference_to_access(tag_set, tagged);
4770                 external = isl_union_map_union(external,
4771                                 isl_union_map_copy(gen->prog->scop->live_in));
4772         } else {
4773                 tag_set = isl_union_map_domain(external);
4774                 external = wrapped_reference_to_access(tag_set, tagged);
4775                 external = isl_union_map_union(external,
4776                                 isl_union_map_copy(gen->prog->scop->live_out));
4777         }
4778
4779         if (empty < 0)
4780                 external = isl_union_map_free(external);
4781         else if (empty)
4782                 external = isl_union_map_universe(external);
4783
4784         access = isl_union_map_intersect(access, external);
4785
4786         return access;
4787 }
4788
4789 /* Given the AST context schedule "schedule" and the mapping from
4790  * domains to the shared tile loops "shared_sched", add a schedule
4791  * for copying an array reference group to/from shared/private memory.
4792  * "read" is set if data should be copied from global memory
4793  * to shared/private memory.
4794  * "k" represents the current group
4795  * "s" is the total number of groups
4796  *
4797  * We schedule an operation before or after the innermost loop
4798  * of "shared_sched" that affects the tile of the array reference group.
4799  *
4800  * schedule is of the form
4801  *
4802  *      D -> L
4803  *
4804  * (with D the iteration domains and L the already generated loops),
4805  * while shared_sched is of the form
4806  *
4807  *      D -> S
4808  *
4809  * We first compute the access relation for the reference group
4810  *
4811  *      D -> A
4812  *
4813  * and remove from this access relation those reads or writes
4814  * that only needed to communicate data within the same iteration
4815  * of the last_shared dimension of the group.
4816  * We then combine what is left with shared_sched into
4817  *
4818  *      D -> [S -> A]
4819  *
4820  * If this results in an empty relation, no copying needs to be performed
4821  * at this point.
4822  * Otherwise, we invert the relation and combine it with "schedule" into
4823  *
4824  *      [S -> A] -> L
4825  *
4826  * The actual additional piece of the schedule is obtained from combining
4827  *
4828  *      [S -> A] -> S
4829  *
4830  * with a mapping
4831  *
4832  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4833  *
4834  * The position of "val" corresponds to the innermost loop that affects
4835  * the tile and the value indicates where the copying is scheduled
4836  * with respect to the actual kernel code (at value 0).
4837  * Reads are schedule before the code, writes to global memory from
4838  * private memory are scheduled at values 1 to s, writes to global
4839  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4840  *
4841  * If we are scheduling a read from global memory to shared memory,
4842  * we insert a synchronization before the kernel code (at the innermost
4843  * level).
4844  * If we are scheduling a write to global memory, then we add
4845  * a synchronization after all writes (at value 2 *s + 2).
4846  * However, there is no need for a synchronization after the outermost loop.
4847  * A write to global memory from private memory at the innermost level
4848  * does not require a synchronization, because it is covered by
4849  * the synchronization after the kernel inserted by body_schedule.
4850  */
4851 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4852         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4853         __isl_keep isl_union_map *shared_sched,
4854         struct gpu_array_ref_group *group, int read, int k, int s)
4855 {
4856         int n;
4857         int pos, val;
4858         isl_space *space;
4859         isl_union_map *access;
4860         isl_map *map, *proj, *access_map;
4861         isl_id *id;
4862
4863         access = group_access_relation(group, read, !read);
4864         access = remove_local_accesses(gen, group, access, read);
4865         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4866                                                 access);
4867
4868         if (isl_union_map_is_empty(access)) {
4869                 isl_union_map_free(access);
4870                 return res;
4871         }
4872
4873         access = isl_union_map_reverse(access);
4874         access = isl_union_map_apply_range(access,
4875                                             isl_union_map_copy(schedule));
4876         access_map = isl_map_from_union_map(access);
4877
4878         space = isl_space_copy(group->array->space);
4879         space = isl_space_from_range(space);
4880         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4881         map = isl_map_domain_map(isl_map_universe(space));
4882
4883         space = isl_union_map_get_space(schedule);
4884         pos = group->last_shared + 1 - gen->tile_first;
4885         assert(pos >= 0);
4886         if (read)
4887                 val = -2 - k;
4888         else if (group->private_tile)
4889                 val = 1 + k;
4890         else
4891                 val = 1 + s + 1 + k;
4892         proj = insert_even(gen, space, pos, val);
4893         map = isl_map_apply_range(map, proj);
4894
4895         access_map = isl_map_range_product(access_map, map);
4896
4897         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4898         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4899
4900         res = isl_union_map_add_map(res, access_map);
4901
4902         n = gen->shared_len - gen->tile_first;
4903         if (read) {
4904                 if (!group->private_tile)
4905                         res = add_sync_schedule(gen, res, schedule,
4906                                                 shared_sched, n, -1);
4907         } else {
4908                 if (pos == 0)
4909                         return res;
4910                 if (pos == n && group->private_tile)
4911                         return res;
4912                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4913                                         pos, 2 * s + 2);
4914         }
4915
4916         return res;
4917 }
4918
4919 /* Return a schedule for the shared tile loops based on the current
4920  * AST context schedule.
4921  *
4922  * We create a "shared_sched" that maps the domains to the first
4923  * shared_len dimensions of the computed schedule, project out the
4924  * first tile_first dimensions (as these are already covered by
4925  * the host code) and insert "statement-level" dimensions at even
4926  * positions so that we can schedule copy blocks and synchronization
4927  * before/after each level.
4928  *
4929  * In particular, copy blocks are inserted inside the innermost
4930  * level that affect the tile.  For the copying to global memory,
4931  * those from private memory are scheduled before those from shared
4932  * memory such that synchronization can be inserted between the two
4933  * at the innermost level.
4934  * Synchronization is inserted at the innermost level before the
4935  * actual kernel code if there is any copying from global memory
4936  * to shared memory.  It is inserted unconditionally at the innermost
4937  * level after the actual kernel code and the copying to global memory
4938  * from private memory (if any).  Finally, it is inserted after
4939  * any copying to global memory, except at the outermost level
4940  * and at the innermost level if there is no copying from shared
4941  * memory.  The copying from private memory is covered by the unconditional
4942  * synchronization at the innermost level.
4943  */
4944 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4945         __isl_take isl_union_map *schedule)
4946 {
4947         isl_space *space;
4948         isl_union_map *res;
4949         isl_union_map *shared_sched;
4950         isl_union_map *sched;
4951         isl_map *proj, *map;
4952         int i, j, k, s;
4953
4954         shared_sched = isl_union_map_copy(gen->tiled_sched);
4955         proj = projection(isl_union_map_get_space(shared_sched),
4956                                 gen->tiled_len, gen->shared_len);
4957         shared_sched = isl_union_map_apply_range(shared_sched,
4958                                 isl_union_map_from_map(proj));
4959         space = isl_union_map_get_space(shared_sched);
4960         proj = insert_even(gen, space, -1, 0);
4961         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4962                                 isl_union_map_from_map(proj));
4963
4964         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4965
4966         s = 0;
4967         for (i = 0; i < gen->prog->n_array; ++i)
4968                 s += gen->prog->array[i].n_group;
4969
4970         k = 0;
4971         for (i = 0; i < gen->prog->n_array; ++i) {
4972                 struct gpu_array_info *array = &gen->prog->array[i];
4973
4974                 for (j = 0; j < array->n_group; ++j) {
4975                         struct gpu_array_ref_group *group;
4976
4977                         group = array->groups[j];
4978                         if (!group->private_tile && !group->shared_tile)
4979                                 continue;
4980                         res = add_group_schedule(gen, res, schedule,
4981                                                 shared_sched, group, 0, k, s);
4982                         res = add_group_schedule(gen, res, schedule,
4983                                                 shared_sched, group, 1, k, s);
4984                         ++k;
4985                 }
4986         }
4987
4988         res = add_sync_schedule(gen, res, schedule, shared_sched,
4989                             gen->shared_len - gen->tile_first, 1 + s);
4990
4991         isl_union_map_free(shared_sched);
4992         isl_union_map_free(schedule);
4993
4994         return res;
4995 }
4996
4997 /* Generate code for "kernel" in the given "context".
4998  *
4999  * We first generate code for the shared tile loops (T1T, T1P and T2)
5000  * in a context that includes the block ids.
5001  * Within each iteration of these loops an additional code generation
5002  * is performed (within create_kernel_leaf) for the rest of the schedule
5003  * in a context that includes the thread ids.
5004  */
5005 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
5006         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
5007         __isl_keep isl_multi_pw_aff *grid_size)
5008 {
5009         isl_space *space;
5010         isl_set *set;
5011         isl_id_list *iterators;
5012         isl_union_map *schedule;
5013         isl_ast_node *tree;
5014         int sched_len;
5015
5016         schedule = isl_ast_build_get_schedule(build);
5017
5018         build = isl_ast_build_copy(build);
5019         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
5020         space = isl_ast_build_get_schedule_space(build);
5021         set = isl_set_universe(isl_space_copy(space));
5022         set = add_bounded_parameters_dynamic(set, grid_size,
5023                                                 gen->kernel->block_ids);
5024         build = isl_ast_build_restrict(build, set);
5025
5026         schedule = body_schedule(gen, schedule);
5027
5028         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
5029
5030         build = set_atomic_and_unroll(build, space, sched_len);
5031         iterators = ppcg_scop_generate_names(gen->prog->scop, sched_len, "g");
5032         build = isl_ast_build_set_iterators(build, iterators);
5033         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
5034         tree = isl_ast_build_ast_from_schedule(build, schedule);
5035         isl_ast_build_free(build);
5036
5037         return tree;
5038 }
5039
5040 /* Attach "id" to the given node.
5041  */
5042 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
5043         __isl_keep isl_ast_build *build, void *user)
5044 {
5045         isl_id *id = user;
5046
5047         node = isl_ast_node_set_annotation(node, id);
5048
5049         return node;
5050 }
5051
5052 /* Construct an AST node for performing a kernel launch and attach
5053  * the information about the kernel to that node.
5054  *
5055  * The kernel AST has been constructed in the context of the range
5056  * of "schedule".  In particular, the grid size has been computed
5057  * in the context.  We therefore still need to make sure that these
5058  * constraints are expressed in the code.  We do this by creating a schedule
5059  *
5060  *      kernel[] -> [S -> []]
5061  *
5062  * where S is the schedule domain, i.e., the range of "schedule".
5063  * The AST generation will then create a single call surrounded by
5064  * all the condition in "S" that have not been expressed yet.
5065  *
5066  * The kernel information is attached to this node in attach_id.
5067  */
5068 static __isl_give isl_ast_node *construct_launch(
5069         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
5070         __isl_take struct ppcg_kernel *kernel)
5071 {
5072         isl_id *id;
5073         isl_ctx *ctx;
5074         isl_union_set *domain;
5075         isl_set *set;
5076         isl_map *map;
5077         isl_ast_node *node;
5078
5079         ctx = isl_ast_build_get_ctx(build);
5080
5081         id = isl_id_alloc(ctx, NULL, kernel);
5082         id = isl_id_set_free_user(id, &ppcg_kernel_free);
5083
5084         domain = isl_union_map_range(schedule);
5085         set = isl_set_from_union_set(domain);
5086         map = isl_map_from_domain(set);
5087         map = isl_map_from_range(isl_map_wrap(map));
5088         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
5089         schedule = isl_union_map_from_map(map);
5090
5091         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
5092         node = isl_ast_build_ast_from_schedule(build, schedule);
5093         isl_ast_build_free(build);
5094
5095         return node;
5096 }
5097
5098 /* This function is called for each leaf in the AST of the host code.
5099  * We first specialize the schedule to the site of the leaf, compute
5100  * the size of shared memory and then construct the body of the host code
5101  * and the associated kernel.
5102  *
5103  * The necessary information for printing the kernel launch is
5104  * stored in a struct ppcg_kernel and attached to the leaf node
5105  * created to represent the launch.
5106  */
5107 static __isl_give isl_ast_node *create_host_leaf(
5108         __isl_take isl_ast_build *build, void *user)
5109 {
5110         struct gpu_gen *gen = (struct gpu_gen *) user;
5111         isl_id *id;
5112         isl_ast_node *node;
5113         struct ppcg_kernel *kernel;
5114         isl_set *host_domain;
5115         isl_union_map *schedule;
5116         isl_union_map *local_sched;
5117         isl_union_map *access;
5118         isl_union_set *domain;
5119         int i;
5120
5121         schedule = isl_ast_build_get_schedule(build);
5122
5123         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
5124         read_sizes(gen);
5125
5126         domain = isl_union_map_domain(isl_union_map_copy(schedule));
5127
5128         local_sched = isl_union_map_copy(gen->sched);
5129         local_sched = isl_union_map_intersect_domain(local_sched, domain);
5130         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
5131                                      isl_union_map_copy(gen->prog->may_write));
5132         access = isl_union_map_apply_domain(access,
5133                                             isl_union_map_copy(local_sched));
5134
5135         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
5136         if (!kernel)
5137                 goto error;
5138         kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop,
5139                                                 gen->n_grid, "b");
5140         kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop,
5141                                                 gen->n_block, "t");
5142
5143         gen->tiled_sched = tile_schedule(gen, local_sched);
5144         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
5145         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
5146
5147         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
5148         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
5149         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
5150
5151         kernel->id = gen->kernel_id++;
5152         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
5153         kernel->grid_size = extract_grid_size(gen, kernel);
5154         extract_block_size(gen, kernel);
5155         kernel->arrays = isl_union_map_range(access);
5156         kernel->arrays = isl_union_set_apply(kernel->arrays,
5157                                 isl_union_map_copy(gen->prog->to_outer));
5158         kernel->space = isl_ast_build_get_schedule_space(build);
5159
5160         compute_shared_sched(gen);
5161         gen->privatization = compute_privatization(gen);
5162         check_scalar_live_ranges(gen);
5163         if (group_references(gen) < 0)
5164                 schedule = isl_union_map_free(schedule);
5165         host_domain = isl_set_from_union_set(isl_union_map_range(
5166                                                 isl_union_map_copy(schedule)));
5167         localize_bounds(gen, kernel, host_domain);
5168
5169         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
5170         check_shared_memory_bound(gen);
5171         compute_group_tilings(gen);
5172
5173         kernel->tree = generate_kernel(gen, build, host_domain,
5174                                         kernel->grid_size);
5175         create_kernel_vars(gen, kernel);
5176
5177         free_local_array_info(gen);
5178         isl_map_free(gen->privatization);
5179         isl_union_map_free(gen->local_sched);
5180         isl_union_map_free(gen->tiled_sched);
5181         isl_union_map_free(gen->shared_sched);
5182         isl_union_map_free(gen->shared_proj);
5183         isl_set_free(host_domain);
5184         free(gen->tile_size);
5185
5186         node = construct_launch(build, schedule, kernel);
5187
5188         return node;
5189 error:
5190         isl_union_map_free(schedule);
5191         return NULL;
5192 }
5193
5194 /* Use isl to generate code for the outer gen->tile_first loops
5195  * of the global schedule in gen->sched, resulting in the host code.
5196  * Within each iteration of this partial schedule, i.e., for each kernel
5197  * launch, create_host_leaf takes care of generating the kernel code.
5198  */
5199 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
5200 {
5201         isl_ast_build *build;
5202         isl_ast_node *tree;
5203         isl_union_map *sched;
5204         isl_map *proj;
5205         isl_id_list *iterators;
5206
5207         sched = isl_union_map_copy(gen->sched);
5208         proj = projection(isl_union_map_get_space(sched),
5209                             gen->untiled_len, gen->tile_first);
5210         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
5211
5212         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
5213         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
5214         iterators = ppcg_scop_generate_names(gen->prog->scop,
5215                                                 gen->tile_first, "h");
5216         build = isl_ast_build_set_iterators(build, iterators);
5217         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
5218         tree = isl_ast_build_ast_from_schedule(build, sched);
5219         isl_ast_build_free(build);
5220
5221         return tree;
5222 }
5223
5224 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
5225 {
5226         if (!str)
5227                 return NULL;
5228         return isl_union_map_read_from_str(ctx, str);
5229 }
5230
/* Information about the outermost tilable bands in the forest of bands.
 *
 * gen points to the code generator; set_stmt_tile_len uses it
 * to look up the statements of the program.
 *
 * tile_len and n_parallel are only set on band_info structures
 * that correspond to outermost bands.  For other bands (in particular,
 * ancestors of the outermost bands), n_parallel is set to 0.
 *
 * prefix is the (padded) schedule leading up to the outermost tilable bands.
 *
 * tile_first is the number of schedule dimensions in prefix.
 *
 * suffix is the schedule of the outermost tilable bands and their descendants.
 */
struct band_info {
        struct gpu_gen *gen;
        int tile_first;
        int tile_len;
        int n_parallel;
        isl_union_map *prefix;
        isl_union_map *suffix;
};
5251
5252 /* Set tile_len and n_parallel of the statement to that of
5253  * their outermost band, recorded in the band_info.
5254  */
5255 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
5256 {
5257         struct band_info *info = user;
5258         struct gpu_stmt *stmt;
5259         isl_id *id;
5260
5261         id = isl_map_get_tuple_id(map, isl_dim_in);
5262         stmt = find_stmt(info->gen->prog, id);
5263         isl_id_free(id);
5264
5265         stmt->tile_len = info->tile_len;
5266         stmt->n_parallel = info->n_parallel;
5267
5268         isl_map_free(map);
5269
5270         return 0;
5271 }
5272
5273 static void select_outer_band(struct gpu_gen *gen,
5274         __isl_take isl_schedule_node *node, int pos, struct band_info *info);
5275
5276 /* Check if this band node is tilable and has any parallel loops.  If so,
5277  * take it as the outermost tilable band.  If not, continue looking for the
5278  * outermost tilable band in the children of the current band.
5279  */
5280 static void band_select_outer_band(struct gpu_gen *gen,
5281         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
5282 {
5283         int n = isl_schedule_node_band_n_member(node);
5284         int n_parallel;
5285
5286         for (n_parallel = 0; n_parallel < n; ++n_parallel)
5287                 if (!isl_schedule_node_band_member_get_coincident(node,
5288                                                                 n_parallel))
5289                         break;
5290
5291         if (!isl_schedule_node_band_get_permutable(node) || n_parallel == 0) {
5292                 node = isl_schedule_node_child(node, 0);
5293                 select_outer_band(gen, node, pos + n, info);
5294                 return;
5295         }
5296
5297         info->n_parallel = n_parallel;
5298         gen->any_parallelism = 1;
5299         info->gen = gen;
5300         info->tile_first = pos;
5301         info->tile_len = n;
5302         info->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
5303         info->suffix = isl_schedule_node_get_subtree_schedule_union_map(node);
5304         isl_union_map_foreach_map(info->prefix, &set_stmt_tile_len, info);
5305
5306         isl_schedule_node_free(node);
5307 }
5308
5309 /* Comparison function that returns a non-zero value for band_infos
5310  * with different tile_len fields or different n_parallel fields.
5311  */
5312 static int cmp_band(const void *p1, const void *p2)
5313 {
5314         const struct band_info *info1 = p1;
5315         const struct band_info *info2 = p2;
5316
5317         if (info1->tile_len != info2->tile_len)
5318                 return info1->tile_len - info2->tile_len;
5319
5320         return info1->n_parallel - info2->n_parallel;
5321 }
5322
5323 /* Extend "umap" with coordinates with fixed value "val"
5324  * to a total length of "dst_len", assuming the original dimension is "src_len".
5325  */
5326 static __isl_give isl_union_map *extend_range(
5327         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
5328 {
5329         isl_space *dim;
5330         isl_map *map;
5331         int i;
5332
5333         dim = isl_union_map_get_space(umap);
5334         map = isl_map_reverse(projection(dim, dst_len, src_len));
5335         for (i = src_len; i < dst_len; ++i)
5336                 map = isl_map_fix_si(map, isl_dim_out, i, val);
5337
5338         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
5339
5340         return umap;
5341 }
5342
5343 /* Insert a new dimension at position "pos" in the range of "umap"
5344  * with fixed value "val", assuming the original dimension of the range
5345  * of "umap" is "src_len".
5346  */
5347 static __isl_give isl_union_map *insert_range(__isl_take isl_union_map *umap,
5348         int src_len, int pos, int val)
5349 {
5350         isl_space *space;
5351         isl_map *map;
5352
5353         space = isl_union_map_get_space(umap);
5354         map = project_out(space, src_len + 1, pos, 1);
5355         map = isl_map_reverse(map);
5356         map = isl_map_fix_si(map, isl_dim_out, pos, val);
5357
5358         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
5359
5360         return umap;
5361 }
5362
5363 /* Group bands with the same values for tile_len and n_parallel.
5364  * The prefix schedule is then extended with a fixed coordinate that
5365  * is different for each such group.
5366  * Note that the actual values for this coordinate are not important.
5367  * The bands have already been effectively separated at a higher level
5368  * or they are independent and may be executed in parallel.
5369  * The list of band_info has been sorted before this functions is called.
5370  */
5371 static void separate_bands(struct band_info *info, int n)
5372 {
5373         int i;
5374         int j = 0;
5375
5376         for (i = 0; i < n; ++i) {
5377                 int l = info[i].tile_first;
5378
5379                 if (i &&
5380                     (info[i].tile_len != info[i - 1].tile_len ||
5381                      info[i].n_parallel != info[i - 1].n_parallel))
5382                         j++;
5383
5384                 info[i].prefix = extend_range(info[i].prefix,
5385                                                 l, l + 1, j);
5386                 info[i].tile_first = l + 1;
5387         }
5388 }
5389
5390 /* Select the outermost bands in the elements of the sequence or set
5391  * node "node", align their prefix schedules.  Separate all bands
5392  * if "serialize" is set and otherwise separate bands with different values
5393  * for tile_len and/or n_parallel.  Finally, combine the resulting
5394  * prefix and suffix schedules into a single pair of prefix and
5395  * suffix schedules for the entire list.
5396  */
5397 static void list_select_outer_band(struct gpu_gen *gen,
5398         __isl_take isl_schedule_node *node, int pos,
5399         struct band_info *list_info, int serialize)
5400 {
5401         int i;
5402         int n = isl_schedule_node_n_children(node);
5403         isl_ctx *ctx = isl_schedule_node_get_ctx(node);
5404         struct band_info *info;
5405         int max_tile_first;
5406         isl_union_map *prefix;
5407         isl_union_map *suffix;
5408
5409         assert(n >= 1);
5410         info = isl_calloc_array(ctx, struct band_info, n);
5411         assert(info);
5412
5413         max_tile_first = 0;
5414         for (i = 0; i < n; ++i) {
5415                 isl_schedule_node *child;
5416                 child = isl_schedule_node_get_child(node, i);
5417                 select_outer_band(gen, child, pos, &info[i]);
5418                 if (info[i].tile_first > max_tile_first)
5419                         max_tile_first = info[i].tile_first;
5420         }
5421
5422         for (i = 0; i < n; ++i) {
5423                 if (info[i].tile_first == max_tile_first)
5424                         continue;
5425                 info[i].prefix = extend_range(info[i].prefix,
5426                                         info[i].tile_first, max_tile_first, 0);
5427                 info[i].tile_first = max_tile_first;
5428         }
5429
5430         if (serialize) {
5431                 for (i = 0; i < n; ++i) {
5432                         int l = info[i].tile_first;
5433                         info[i].prefix = insert_range(info[i].prefix, l,
5434                                                         pos, i);
5435                         info[i].tile_first = l + 1;
5436                 }
5437         } else {
5438                 qsort(info, n, sizeof(struct band_info), &cmp_band);
5439
5440                 for (i = 0; i < n - 1; ++i)
5441                         if (info[i].tile_len != info[i + 1].tile_len ||
5442                             info[i].n_parallel != info[i + 1].n_parallel)
5443                                 break;
5444
5445                 if (i < n - 1)
5446                         separate_bands(info, n);
5447         }
5448
5449         prefix = info[0].prefix;
5450         suffix = info[0].suffix;
5451
5452         for (i = 1; i < n; ++i) {
5453                 prefix = isl_union_map_union(prefix, info[i].prefix);
5454                 suffix = isl_union_map_union(suffix, info[i].suffix);
5455         }
5456
5457         list_info->tile_first = info[0].tile_first;
5458         list_info->tile_len = -1;
5459         list_info->prefix = prefix;
5460         list_info->suffix = suffix;
5461
5462         isl_schedule_node_free(node);
5463         free(info);
5464 }
5465
5466 /* Select the outermost bands in the elements of the set node "node".
5467  * If the schedule_separate_components is set, then separate all bands.
5468  */
5469 static void set_select_outer_band(struct gpu_gen *gen,
5470         __isl_take isl_schedule_node *node, int pos,
5471         struct band_info *list_info)
5472 {
5473         isl_ctx *ctx = isl_schedule_node_get_ctx(node);
5474         int serialize;
5475
5476         serialize = isl_options_get_schedule_separate_components(ctx);
5477         list_select_outer_band(gen, node, pos, list_info, serialize);
5478 }
5479
5480 /* Select the outermost bands in the elements of the sequence node "node",
5481  * separating all bands.
5482  */
5483 static void sequence_select_outer_band(struct gpu_gen *gen,
5484         __isl_take isl_schedule_node *node, int pos,
5485         struct band_info *list_info)
5486 {
5487         list_select_outer_band(gen, node, pos, list_info, 1);
5488 }
5489
5490 /* If we reach a leaf node, then we have not found any outer tilable
5491  * band with parallel loops, so consider the leaf node as the outermost
5492  * tilable band.
5493  */
5494 static void leaf_select_outer_band(struct gpu_gen *gen,
5495         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
5496 {
5497         info->gen = gen;
5498         info->tile_first = pos;
5499         info->tile_len = 0;
5500         info->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
5501         info->suffix = isl_schedule_node_get_subtree_schedule_union_map(node);
5502         isl_union_map_foreach_map(info->prefix, &set_stmt_tile_len, info);
5503
5504         isl_schedule_node_free(node);
5505 }
5506
5507 /* Select the outermost tilable band in the subtree that "node" points to.
5508  */
5509 static void select_outer_band(struct gpu_gen *gen,
5510         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
5511 {
5512         enum isl_schedule_node_type type;
5513
5514         type = isl_schedule_node_get_type(node);
5515         switch (type) {
5516         case isl_schedule_node_domain:
5517         case isl_schedule_node_filter:
5518                 node = isl_schedule_node_child(node, 0);
5519                 select_outer_band(gen, node, pos, info);
5520                 return;
5521         case isl_schedule_node_leaf:
5522                 leaf_select_outer_band(gen, node, pos, info);
5523                 return;
5524         case isl_schedule_node_band:
5525                 band_select_outer_band(gen, node, pos, info);
5526                 return;
5527         case isl_schedule_node_set:
5528                 set_select_outer_band(gen, node, pos, info);
5529                 return;
5530         case isl_schedule_node_sequence:
5531                 sequence_select_outer_band(gen, node, pos, info);
5532                 return;
5533         default:
5534                 isl_die(isl_schedule_node_get_ctx(node),
5535                         isl_error_unsupported, "unhandled schedule node type",
5536                         node = node);
5537         case isl_schedule_node_error:
5538                 info->prefix = NULL;
5539                 info->suffix = NULL;
5540                 break;
5541         }
5542
5543         isl_schedule_node_free(node);
5544 }
5545
5546 /* Select the outermost tilable band that (by construction)
5547  * has at least one parallel loop.
5548  * The starting position of the aligned band is stored in the pair
5549  * gen->tile_first.
5550  * The sizes and number of parallel loops may be different in different
5551  * parts of the band forest and are therefore stored in the gpu_stmts.
5552  *
5553  * Return the complete schedule, with the tilable bands aligned
5554  * at gen->tile_first and padded with zero, if needed.
5555  */
5556 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
5557         __isl_keep isl_schedule *schedule)
5558 {
5559         isl_schedule_node *node;
5560         struct band_info info;
5561
5562         gen->n_parallel = 0;
5563         gen->tile_len = -1;
5564
5565         node = isl_schedule_get_root(schedule);
5566         select_outer_band(gen, node, 0, &info);
5567
5568         gen->tile_first = info.tile_first;
5569         info.suffix = align_range(info.suffix);
5570
5571         return isl_union_map_flat_range_product(info.prefix, info.suffix);
5572 }
5573
5574 /* Set gen->untiled_len to the number of scheduling dimensions
5575  * for the schedule of the first domain.
5576  * We assume here that this number is the same for all domains.
5577  */
5578 static int set_untiled_len(__isl_take isl_map *map, void *user)
5579 {
5580         unsigned *untiled_len = user;
5581
5582         *untiled_len = isl_map_dim(map, isl_dim_out);
5583
5584         isl_map_free(map);
5585         return -1;
5586 }
5587
5588 /* Compute an appropriate schedule based on the accesses in
5589  * gen->read and gen->write.
5590  *
5591  * We use the dependences in gen->prog->scop to compute
5592  * a schedule that has a parallel loop in each tilable band.
5593  * Finally, we select the outermost tilable band.
5594  *
5595  * If live range reordering is allowed, then we need to make sure
5596  * that live ranges on arrays are not run in parallel since doing
5597  * so would require array expansion.  We therefore add the array
5598  * order dependences to the coincidence dependences.  Non-zero array
5599  * order dependences will then prevent a schedule dimension from being
5600  * considered parallel.
5601  * Live ranges derived from scalars are allowed to be run in parallel
5602  * since we force the scalars to be mapped to private memory in
5603  * check_scalar_live_ranges.
5604  * If live range reordering is allowed, then the false dependences
5605  * are not added to the validity constraints as that would prevent
5606  * reordering.  Instead, the external false dependences that enforce that reads
5607  * from potentially live-in data precede any later write and
5608  * that writes of potentially live-out data follow any other earlier write
5609  * are added to the validity and the coincidence constraints.
5610  * The false dependences are still added to the proximity constraints
5611  * for consistency with the case where live range reordering is not allowed.
5612  * The coincidence constraints then consist of flow dependences,
5613  * external false dependences and array order dependences.
5614  * The independences can be filtered out from the first two sets.
5615  * They have already been filtered out from the array order dependences
5616  * on a per array basis in collect_order_dependences.
5617  * There is no need for a per array handling of the other two sets
5618  * as there should be no flow or external false dependence on local
5619  * variables that can be filtered out.
5620  */
5621 static void compute_schedule(struct gpu_gen *gen)
5622 {
5623         isl_union_set *domain;
5624         isl_union_map *dep_raw, *dep;
5625         isl_union_map *validity, *proximity, *coincidence;
5626         isl_union_map *sched;
5627         isl_schedule_constraints *sc;
5628         isl_schedule *schedule;
5629
5630         domain = isl_union_set_copy(gen->prog->scop->domain);
5631         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
5632         sc = isl_schedule_constraints_set_context(sc,
5633                                 isl_set_copy(gen->prog->scop->context));
5634         if (gen->options->live_range_reordering) {
5635                 sc = isl_schedule_constraints_set_conditional_validity(sc,
5636                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow),
5637                         isl_union_map_copy(gen->prog->scop->tagged_dep_order));
5638                 proximity = isl_union_map_copy(gen->prog->scop->dep_flow);
5639                 validity = isl_union_map_copy(proximity);
5640                 validity = isl_union_map_union(validity,
5641                             isl_union_map_copy(gen->prog->scop->dep_external));
5642                 proximity = isl_union_map_union(proximity,
5643                             isl_union_map_copy(gen->prog->scop->dep_false));
5644                 coincidence = isl_union_map_copy(validity);
5645                 coincidence = isl_union_map_subtract(coincidence,
5646                         isl_union_map_copy(gen->prog->scop->independence));
5647                 coincidence = isl_union_map_union(coincidence,
5648                                 isl_union_map_copy(gen->prog->array_order));
5649         } else {
5650                 dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
5651                 dep = isl_union_map_copy(gen->prog->scop->dep_false);
5652                 dep = isl_union_map_union(dep, dep_raw);
5653                 dep = isl_union_map_coalesce(dep);
5654                 proximity = isl_union_map_copy(dep);
5655                 coincidence = isl_union_map_copy(dep);
5656                 validity = dep;
5657         }
5658         sc = isl_schedule_constraints_set_validity(sc, validity);
5659         sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
5660         sc = isl_schedule_constraints_set_proximity(sc, proximity);
5661
5662         if (gen->options->debug->dump_schedule_constraints)
5663                 isl_schedule_constraints_dump(sc);
5664         schedule = isl_schedule_constraints_compute_schedule(sc);
5665         if (gen->options->debug->dump_schedule)
5666                 isl_schedule_dump(schedule);
5667
5668         sched = select_outer_tilable_band(gen, schedule);
5669
5670         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
5671         sched = isl_union_map_intersect_domain(sched, domain);
5672         gen->sched = sched;
5673
5674         isl_schedule_free(schedule);
5675 }
5676
5677 /* Compute the sets of outer array elements that need to be copied in and out.
5678  *
5679  * In particular, for each array that is possibly written anywhere in
5680  * gen->prog and that is visible outside the corresponding scop,
5681  * we copy out its entire extent.
5682  *
5683  * Any array elements that is read without first being written needs
5684  * to be copied in. Furthermore, if there are any array elements that
5685  * are copied out, but that may not be written inside gen->prog, then
5686  * they also need to be copied in to ensure that the value after execution
5687  * is the same as the value before execution, at least for those array
5688  * elements that may have their values preserved by the scop.
5689  * In case the array elements are structures, we need to take into
5690  * account that all members of the structures need to be written
5691  * by gen->prog before we can avoid copying the data structure in.
5692  *
5693  * While computing the set of array elements that are copied out but
5694  * not necessarily written, we intersect both sets with the context.
5695  * This helps in those cases where the arrays are declared with a fixed size,
5696  * while the accesses are parametric and the context assigns a fixed value
5697  * to the parameters.
5698  *
5699  * If an element from a local array is read without first being written,
5700  * then there is no point in copying it in since it cannot have been
5701  * written prior to the scop.  Warn about the uninitialized read instead.
5702  */
5703 static void compute_copy_in_and_out(struct gpu_gen *gen)
5704 {
5705         int i;
5706         isl_union_set *local;
5707         isl_union_set *may_write, *must_write;
5708         isl_union_set *copy_in, *copy_out;
5709         isl_union_set *not_written;
5710         isl_union_map *uninitialized;
5711         isl_union_map *local_uninitialized;
5712
5713         must_write = isl_union_map_range(
5714                                 isl_union_map_copy(gen->prog->must_write));
5715         must_write = isl_union_set_intersect_params(must_write,
5716                                             isl_set_copy(gen->prog->context));
5717         may_write = isl_union_map_range(
5718                                 isl_union_map_copy(gen->prog->may_write));
5719         may_write = isl_union_set_intersect_params(may_write,
5720                                             isl_set_copy(gen->prog->context));
5721         may_write = isl_union_set_universe(may_write);
5722         may_write = isl_union_set_apply(may_write,
5723                                     isl_union_map_copy(gen->prog->to_outer));
5724         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
5725         local = isl_union_set_copy(copy_out);
5726
5727         for (i = 0; i < gen->prog->n_array; ++i) {
5728                 isl_space *space;
5729                 isl_set *write_i;
5730                 int empty;
5731
5732                 space = isl_space_copy(gen->prog->array[i].space);
5733
5734                 if (gen->prog->array[i].local) {
5735                         isl_set *set;
5736
5737                         set = isl_set_universe(space);
5738                         local = isl_union_set_add_set(local, set);
5739                         continue;
5740                 }
5741
5742                 write_i = isl_union_set_extract_set(may_write, space);
5743                 empty = isl_set_plain_is_empty(write_i);
5744                 isl_set_free(write_i);
5745                 if (empty)
5746                         continue;
5747
5748                 write_i = isl_set_copy(gen->prog->array[i].extent);
5749                 copy_out = isl_union_set_add_set(copy_out, write_i);
5750         }
5751         isl_union_set_free(may_write);
5752
5753         copy_out = isl_union_set_intersect_params(copy_out,
5754                                             isl_set_copy(gen->prog->context));
5755
5756         gen->prog->copy_out = isl_union_set_copy(copy_out);
5757
5758         copy_out = isl_union_set_apply(copy_out,
5759                                     isl_union_map_copy(gen->prog->to_inner));
5760         copy_out = isl_union_set_intersect(copy_out,
5761                                     isl_union_set_copy(gen->prog->may_persist));
5762         not_written = isl_union_set_subtract(copy_out, must_write);
5763
5764         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
5765         local_uninitialized = isl_union_map_copy(uninitialized);
5766
5767         local = isl_union_set_apply(local,
5768                                     isl_union_map_copy(gen->prog->to_inner));
5769         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
5770                                                             local);
5771         if (!isl_union_map_is_empty(local_uninitialized)) {
5772                 fprintf(stderr,
5773                         "possibly uninitialized reads (not copied in):\n");
5774                 isl_union_map_dump(local_uninitialized);
5775         }
5776         uninitialized = isl_union_map_subtract(uninitialized,
5777                                                 local_uninitialized);
5778         copy_in = isl_union_map_range(uninitialized);
5779         copy_in = isl_union_set_union(copy_in, not_written);
5780         copy_in = isl_union_set_apply(copy_in,
5781                                     isl_union_map_copy(gen->prog->to_outer));
5782
5783         gen->prog->copy_in = copy_in;
5784 }
5785
5786 /* Internal data structure for extract_access.
5787  * "next_access" points to the end of a linked list that is extended
5788  * by extract_access.
5789  * "single_expression" is set if the access expressions belong to
5790  * an expression statement (i.e., a statement without internal control).
5791  * "any_to_outer" maps all intermediate arrays to their outer arrays.
5792  */
5793 struct ppcg_extract_access_data {
5794         struct gpu_stmt_access **next_access;
5795         int single_expression;
5796         isl_union_map *any_to_outer;
5797 };
5798
5799 /* Given a tagged access relation to a single array "tagged", extract it
5800  * as a map, taking into account that the input may be empty.
5801  * If the access relation is empty, then it does not contain
5802  * any space information, so we try to recover it from the index
5803  * expression.
5804  * The space of the index expression is of the form I -> A,
5805  * with I the statement instances and A the array, or [I -> F] -> A,
5806  * with F the filters corresponding to arguments.
5807  * We first drop F, if present, obtaining I -> A.
5808  * Then we construct I -> R, with R the reference tag,
5809  * combine the two into I -> [R -> A] and uncurry to obtain
5810  * the final result [I -> R] -> A.
5811  * Note that the index expression may have a lower dimension
5812  * than that of the array, but this dimension is not used
5813  * if the access relation is empty.
5814  */
5815 static __isl_give isl_map *extract_single_tagged_access(
5816         __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr)
5817 {
5818         int empty;
5819         isl_id *id;
5820         isl_space *space, *space2;
5821         isl_multi_pw_aff *index;
5822
5823         empty = isl_union_map_is_empty(tagged);
5824         if (empty < 0)
5825                 goto error;
5826         if (!empty)
5827                 return isl_map_from_union_map(tagged);
5828         isl_union_map_free(tagged);
5829
5830         index = pet_expr_access_get_index(expr);
5831         space = isl_multi_pw_aff_get_space(index);
5832         isl_multi_pw_aff_free(index);
5833         if (isl_space_domain_is_wrapping(space))
5834                 space = isl_space_domain_factor_domain(space);
5835         space2 = isl_space_copy(space);
5836         space2 = isl_space_from_domain(isl_space_domain(space));
5837         id = pet_expr_access_get_ref_id(expr);
5838         space2 = isl_space_set_tuple_id(space2, isl_dim_out, id);
5839         space = isl_space_range_product(space2, space);
5840         space = isl_space_uncurry(space);
5841
5842         return isl_map_empty(space);
5843 error:
5844         isl_union_map_free(tagged);
5845         return NULL;
5846 }
5847
5848 /* Extract a gpu_stmt_access from "expr", append it to the list
5849  * that ends in *data->next_access and update the end of the list.
5850  * If the access expression performs a write, then it is considered
5851  * exact only if it appears in a single expression statement and
5852  * if its may access relation is equal to its must access relation.
5853  *
5854  * The combined set of may accesses may be union if member accesses
5855  * are involved, but the entire set is derived from a single reference and
5856  * therefore from a single index expression.  These accesses therefore
5857  * all map to the same outer array.
5858  */
5859 static int extract_access(__isl_keep pet_expr *expr, void *user)
5860 {
5861         struct ppcg_extract_access_data *data = user;
5862         isl_union_map *tagged;
5863         struct gpu_stmt_access *access;
5864         isl_ctx *ctx = pet_expr_get_ctx(expr);
5865         isl_multi_pw_aff *index;
5866
5867         access = isl_alloc_type(ctx, struct gpu_stmt_access);
5868         assert(access);
5869         access->next = NULL;
5870         access->read = pet_expr_access_is_read(expr);
5871         access->write = pet_expr_access_is_write(expr);
5872         tagged = pet_expr_access_get_tagged_may_read(expr);
5873         tagged = isl_union_map_union(tagged,
5874                                 pet_expr_access_get_tagged_may_write(expr));
5875         tagged = isl_union_map_apply_range(tagged,
5876                                         isl_union_map_copy(data->any_to_outer));
5877         if (!access->write) {
5878                 access->exact_write = 1;
5879         } else if (!data->single_expression) {
5880                 access->exact_write = 0;
5881         } else {
5882                 isl_union_map *must, *may;
5883                 may = isl_union_map_copy(tagged);
5884                 may = isl_union_map_domain_factor_domain(may);
5885                 must = pet_expr_access_get_must_write(expr);
5886                 access->exact_write = isl_union_map_is_equal(must, may);
5887                 isl_union_map_free(must);
5888                 isl_union_map_free(may);
5889         }
5890         index = pet_expr_access_get_index(expr);
5891         access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
5892         isl_multi_pw_aff_free(index);
5893         access->ref_id = pet_expr_access_get_ref_id(expr);
5894         access->group = -1;
5895         access->tagged_access = extract_single_tagged_access(tagged, expr);
5896         access->access = isl_map_copy(access->tagged_access);
5897         access->access = isl_map_domain_factor_domain(access->access);
5898
5899         *data->next_access = access;
5900         data->next_access = &(*data->next_access)->next;
5901
5902         if (!access->access)
5903                 return -1;
5904
5905         return 0;
5906 }
5907
5908 /* Construct a linked list of gpu_stmt_access objects,
5909  * one for each access expression in the statement body.
5910  * "any_to_outer" maps all intermediate arrays to their outer arrays.
5911  */
5912 static int pet_stmt_extract_accesses(struct gpu_stmt *stmt,
5913         __isl_keep isl_union_map *any_to_outer)
5914 {
5915         struct ppcg_extract_access_data data;
5916
5917         stmt->accesses = NULL;
5918         data.next_access = &stmt->accesses;
5919         data.single_expression =
5920                 pet_tree_get_type(stmt->stmt->body) == pet_tree_expr;
5921         data.any_to_outer = any_to_outer;
5922         return pet_tree_foreach_access_expr(stmt->stmt->body,
5923                                                 &extract_access, &data);
5924 }
5925
5926 /* Return an array of gpu_stmt representing the statements in "scop".
5927  */
5928 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5929         __isl_keep isl_set *context, __isl_keep isl_union_map *any_to_outer)
5930 {
5931         int i;
5932         struct gpu_stmt *stmts;
5933
5934         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt);
5935         if (!stmts)
5936                 return NULL;
5937
5938         for (i = 0; i < scop->pet->n_stmt; ++i) {
5939                 struct gpu_stmt *s = &stmts[i];
5940
5941                 s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain);
5942                 s->stmt = scop->pet->stmts[i];
5943                 if (pet_stmt_extract_accesses(s, any_to_outer) < 0)
5944                         return free_stmts(stmts, i + 1);
5945         }
5946
5947         return stmts;
5948 }
5949
5950 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5951  */
5952 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5953 {
5954         struct gpu_gen *gen = user;
5955
5956         return gen->print(p, gen->prog, gen->tree, &gen->types,
5957                             gen->print_user);
5958 }
5959
5960 /* Generate CUDA code for "scop" and print it to "p".
5961  * After generating an AST for the transformed scop as explained below,
5962  * we call "gen->print" to print the AST in the desired output format
5963  * to "p".
5964  *
5965  * If it turns out that it does not make sense to generate GPU code,
5966  * then we generate CPU code instead.
5967  *
5968  * The GPU code is generated in a context where at least one
5969  * statement instance is executed.  The corresponding guard (if any) is printed
5970  * around the entire generated GPU code, except for the declaration
5971  * of the arrays that are visible outside of the scop and that therefore
5972  * cannot be declared inside the body of any possible guard.
5973  *
5974  * We first compute a schedule that respects the dependences
5975  * of the original program and select the outermost band
5976  * of tilable dimensions that has at least one parallel loop.
5977  * We then have three blocks of dimensions
5978  *
5979  *      H               B                       G
5980  *
5981  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5982  * in
5983  *
5984  *      H       T               P               G
5985  *
5986  * For each iteration of the T loop and for each array, we compute
5987  * the array elements accessed by that iteration, construct a rectangular
5988  * box around it and shift it to the origin.  The result is used
5989  * as shared memory for the array.
5990  *
5991  * We then split off at most 2 parallel loops from the T loops and
5992  * at most 3 parallel loops from the P loops
5993  *
5994  *      H       T1      T2      P1      P2      G
5995  *
5996  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5997  * according to "grid"/"block" sizes.
5998  *
5999  *      H       T1T T1P T2      P1T P1P P2      G
6000  *
6001  * Finally, the T1P and P1P iterators are equated to the block and
6002  * thread dimensions respectively and so are effectively removed.
6003  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
6004  * are run on the GPU.
6005  *
6006  * Code is generated in three stages.  We first generate code for the
6007  * host (the H loops), with iterators h%d.  Then, for each leaf node
6008  * of the resulting AST, we generate code for the shared loops (up to
6009  * and including T2), with iterators g%d and after equating the H loops
6010  * to h%d parameters and the T1P loops to the block dimensions.
6011  * Finally, we generate code for the remaining loops in a similar fashion.
6012  */
6013 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
6014         struct gpu_gen *gen, struct ppcg_scop *scop,
6015         struct ppcg_options *options)
6016 {
6017         struct gpu_prog *prog;
6018         isl_ctx *ctx;
6019         isl_set *context, *guard;
6020
6021         if (!scop)
6022                 return isl_printer_free(p);
6023
6024         ctx = isl_printer_get_ctx(p);
6025         prog = gpu_prog_alloc(ctx, scop);
6026         if (!prog)
6027                 return isl_printer_free(p);
6028
6029         context = isl_set_copy(prog->context);
6030         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
6031         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
6032
6033         gen->prog = prog;
6034         gen->any_parallelism = 0;
6035         compute_schedule(gen);
6036
6037         if (!gen->any_parallelism) {
6038                 isl_set_free(context);
6039                 isl_set_free(guard);
6040                 p = print_cpu(p, scop, options);
6041         } else {
6042                 compute_copy_in_and_out(gen);
6043                 gen->tree = generate_host_code(gen);
6044                 p = ppcg_print_exposed_declarations(p, prog->scop);
6045                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
6046                 isl_ast_node_free(gen->tree);
6047         }
6048
6049         isl_union_map_free(gen->sched);
6050
6051         gpu_prog_free(prog);
6052
6053         return p;
6054 }
6055
6056 /* Wrapper around generate for use as a ppcg_transform callback.
6057  */
6058 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
6059         struct ppcg_scop *scop, void *user)
6060 {
6061         struct gpu_gen *gen = user;
6062
6063         return generate(p, gen, scop, gen->options);
6064 }
6065
6066 /* Transform the code in the file called "input" by replacing
6067  * all scops by corresponding GPU code and write the results to "out".
6068  */
6069 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
6070         struct ppcg_options *options,
6071         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
6072                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
6073                 struct gpu_types *types, void *user), void *user)
6074 {
6075         struct gpu_gen gen;
6076         int r;
6077         int i;
6078
6079         gen.ctx = ctx;
6080         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
6081         gen.options = options;
6082         gen.kernel_id = 0;
6083         gen.print = print;
6084         gen.print_user = user;
6085         gen.types.n = 0;
6086         gen.types.name = NULL;
6087
6088         if (options->debug->dump_sizes) {
6089                 isl_space *space = isl_space_params_alloc(ctx, 0);
6090                 gen.used_sizes = isl_union_map_empty(space);
6091         }
6092
6093         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
6094
6095         if (options->debug->dump_sizes) {
6096                 isl_union_map_dump(gen.used_sizes);
6097                 isl_union_map_free(gen.used_sizes);
6098         }
6099
6100         isl_union_map_free(gen.sizes);
6101         for (i = 0; i < gen.types.n; ++i)
6102                 free(gen.types.name[i]);
6103         free(gen.types.name);
6104
6105         return r;
6106 }
6107
6108 /* Compute the set of inner array elements that may have their values
6109  * preserved by "prog".  In particular, collect the array elements of
6110  * arrays that are not local to "prog" and remove those elements that
6111  * are definitely killed or definitely written by "prog".
6112  */
6113 static __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog)
6114 {
6115         int i;
6116         isl_union_set *may_persist, *killed;
6117         isl_union_map *must_kill;
6118
6119         may_persist = isl_union_set_empty(isl_set_get_space(prog->context));
6120         for (i = 0; i < prog->n_array; ++i) {
6121                 isl_set *extent;
6122
6123                 if (prog->array[i].local)
6124                         continue;
6125
6126                 extent = isl_set_copy(prog->array[i].extent);
6127                 may_persist = isl_union_set_add_set(may_persist, extent);
6128         }
6129
6130         may_persist = isl_union_set_intersect_params(may_persist,
6131                                                 isl_set_copy(prog->context));
6132         may_persist = isl_union_set_apply(may_persist,
6133                                         isl_union_map_copy(prog->to_inner));
6134         must_kill = isl_union_map_copy(prog->tagged_must_kill);
6135         killed = isl_union_map_range(must_kill);
6136         must_kill = isl_union_map_copy(prog->must_write);
6137         killed = isl_union_set_union(killed, isl_union_map_range(must_kill));
6138
6139         may_persist = isl_union_set_subtract(may_persist, killed);
6140         return may_persist;
6141 }
6142
6143 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
6144 {
6145         struct gpu_prog *prog;
6146         isl_space *space;
6147         isl_map *id;
6148
6149         if (!scop)
6150                 return NULL;
6151
6152         prog = isl_calloc_type(ctx, struct gpu_prog);
6153         assert(prog);
6154
6155         prog->ctx = ctx;
6156         prog->scop = scop;
6157         prog->context = isl_set_copy(scop->context);
6158         prog->n_stmts = scop->pet->n_stmt;
6159         prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet);
6160         prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer);
6161         space = isl_union_map_get_space(prog->any_to_outer);
6162         space = isl_space_set_from_params(space);
6163         space = isl_space_add_dims(space, isl_dim_set, 1);
6164         space = isl_space_map_from_set(space);
6165         id = isl_map_identity(space);
6166         prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id);
6167         prog->stmts = extract_stmts(ctx, scop,
6168                                         prog->context, prog->any_to_outer);
6169         prog->read = isl_union_map_copy(scop->reads);
6170         prog->may_write = isl_union_map_copy(scop->may_writes);
6171         prog->must_write = isl_union_map_copy(scop->must_writes);
6172         prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills);
6173         prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet);
6174         prog->to_outer = isl_union_map_copy(prog->to_inner);
6175         prog->to_outer = isl_union_map_reverse(prog->to_outer);
6176
6177         if (!prog->stmts)
6178                 return gpu_prog_free(prog);
6179
6180         if (collect_array_info(prog) < 0)
6181                 return gpu_prog_free(prog);
6182         prog->may_persist = compute_may_persist(prog);
6183
6184         return prog;
6185 }
6186
6187 void *gpu_prog_free(struct gpu_prog *prog)
6188 {
6189         if (!prog)
6190                 return NULL;
6191         free_array_info(prog);
6192         free_stmts(prog->stmts, prog->n_stmts);
6193         isl_union_map_free(prog->any_to_outer);
6194         isl_union_map_free(prog->to_outer);
6195         isl_union_map_free(prog->to_inner);
6196         isl_union_set_free(prog->copy_in);
6197         isl_union_set_free(prog->copy_out);
6198         isl_union_map_free(prog->read);
6199         isl_union_map_free(prog->may_write);
6200         isl_union_map_free(prog->must_write);
6201         isl_union_map_free(prog->tagged_must_kill);
6202         isl_union_map_free(prog->array_order);
6203         isl_union_set_free(prog->may_persist);
6204         isl_set_free(prog->context);
6205         free(prog);
6206         return NULL;
6207 }