gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
/* Bound information for a single index of an array tile.
 *
 * "size" and "lb" describe the size and the lower bound of the tile
 * along this index.
 *
 * The fields stride, shift and shift_map only contain valid information
 * if shift != NULL.
 * If so, they express that the current index is such that if you add shift,
 * then the result is always a multiple of stride.
 * shift_map contains the mapping
 *
 *      i -> (i + shift)/stride
 *
 * Let D represent the initial shared_len dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 *      D -> [b]
 *
 * "shift_map" is of the form
 *
 *      [D -> i] -> [D -> (i + shift(D))/stride]
 */
struct gpu_array_bound {
        /* Size and lower bound of the tile along this index. */
        isl_val *size;
        isl_aff *lb;

        /* Optional stride information; only meaningful if shift != NULL. */
        isl_val *stride;
        isl_aff *shift;
        isl_basic_map *shift_map;
};
  58
/* A tile of an array.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 *      and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * shared/private memory tile and is of the form
 *
 *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial shared_len dimensions
 * of the computed schedule.
 *
 * Objects of this type are created by create_tile (which leaves all
 * bound fields NULL) and released by free_tile.
 */
struct gpu_array_tile {
        /* Number of array indices, i.e., number of entries in "bound". */
        int n;
        /* Per-index lower bound, size and optional stride information. */
        struct gpu_array_bound *bound;
        /* Mapping from global array elements to tile elements. */
        isl_multi_aff *tiling;
};
  78
/* Forward declaration; only pointers to this type are needed below. */
struct gpu_array_info;

/* A group of array references in a kernel that should be handled together.
 * If private_tile is not NULL, then it is mapped to registers.
 * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 */
struct gpu_array_ref_group {
        /* The references in this group access this array. */
        struct gpu_array_info *array;
        /* Position of this group in the list of reference groups of array. */
        int nr;

        /* The following fields are used during the construction of the groups.
         * access is the combined access relation relative to the shared
         * memory tiling.  In particular, the domain of the map corresponds
         * to the first shared_len dimensions of the computed schedule.
         * write is set if any access in the group is a write.
         * exact_write is set if all writes are definite writes.
         */
        isl_map *access;
        int write;
        int exact_write;

        /* The shared memory tile, NULL if none. */
        struct gpu_array_tile *shared_tile;

        /* The private memory tile, NULL if none. */
        struct gpu_array_tile *private_tile;

        /* References in this group; point to elements of a linked list.
         * n_ref is the number of entries in refs.
         */
        int n_ref;
        struct gpu_stmt_access **refs;

        /* Last shared memory tile dimension that affects tile of this group. */
        int last_shared;
};
 116
/* State shared by the routines in this file while generating GPU code.
 *
 * Several fields (e.g., the tile, block and grid sizes) are filled in
 * per kernel by the read_*_sizes functions based on "sizes" and "options".
 */
struct gpu_gen {
        /* isl context; used for the allocations performed on behalf of "gen". */
        isl_ctx *ctx;
        /* User options; options->tile_size is the default tile size. */
        struct ppcg_options *options;

        /* Callback for printing of AST in appropriate format. */
        __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
                struct gpu_types *types, void *user);
        /* NOTE(review): presumably passed as "user" to "print" - confirm. */
        void *print_user;

        /* NOTE(review): presumably the program being processed - confirm. */
        struct gpu_prog *prog;
        /* The generated AST. */
        isl_ast_node *tree;

        /* The sequence of types for which a definition has been printed. */
        struct gpu_types types;

        /* tile, grid and block sizes for each kernel */
        isl_union_map *sizes;

        /* Identifier of current kernel. */
        int kernel_id;
        /* Pointer to the current kernel. */
        struct ppcg_kernel *kernel;
        /* Does the computed schedule exhibit any parallelism? */
        int any_parallelism;

        /* First tile dimension. */
        int tile_first;
        /* Number of tile dimensions. */
        int tile_len;
        /* Number of initial parallel loops among tile dimensions. */
        int n_parallel;

        /* Number of dimensions determining shared memory. */
        int shared_len;

        /* Number of rows in the untiled schedule. */
        int untiled_len;
        /* Number of rows in the tiled schedule. */
        int tiled_len;
        /* Number of rows in schedule after tiling/wrapping over threads. */
        int thread_tiled_len;

        /* Global untiled schedule. */
        isl_union_map *sched;
        /* Local (per kernel launch) tiled schedule. */
        isl_union_map *tiled_sched;
        /* Local schedule per shared memory tile loop iteration. */
        isl_union_map *local_sched;

        /* Local tiled schedule projected onto the shared tile loops and
         * the loops that will be wrapped over the threads,
         * with all shared tile loops parametrized.
         */
        isl_union_map *shared_sched;
        /* Projects out the loops that will be wrapped over the threads
         * from shared_sched.
         */
        isl_union_map *shared_proj;

        /* A map that takes the range of shared_sched as input,
         * wraps the appropriate loops over the threads and then projects
         * out these loops.
         */
        isl_map *privatization;

        /* A map from the shared memory tile loops and the thread indices
         * (as parameters) to the set of accessed memory elements that
         * will be accessed through private copies.
         */
        isl_union_map *private_access;

        /* The schedule for the current private/shared access
         * (within print_private_access or print_shared_access).
         */
        isl_map *copy_sched;
        /* The array reference group corresponding to copy_sched. */
        struct gpu_array_ref_group *copy_group;

        /* First loop to unroll (or -1 if none) in the current part of the
         * schedule.
         */
        int first_unroll;

        /* Number of grid dimensions (at most 2) and block dimensions
         * (at most 3) in use, as set by read_grid_sizes and read_block_sizes.
         */
        int n_grid;
        int n_block;
        /* Note: in the input file, the sizes of the grid and the blocks
         * are specified in the order x, y, z, but internally, the sizes
         * are stored in reverse order, so that the last element always
         * refers to the x dimension.
         */
        int grid_dim[2];
        int block_dim[3];
        /* Tile sizes; allocated by read_tile_sizes with tile_len entries. */
        int *tile_size;
};
 213
 214 /* Print the name of the local copy of a given group of array references.
 215  */
 216 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 217         struct gpu_array_ref_group *group)
 218 {
 219         int global = 0;
 220
 221         if (group->private_tile)
 222                 p = isl_printer_print_str(p, "private_");
 223         else if (group->shared_tile)
 224                 p = isl_printer_print_str(p, "shared_");
 225         else
 226                 global = 1;
 227         p = isl_printer_print_str(p, group->array->name);
 228         if (!global && group->array->n_group > 1) {
 229                 p = isl_printer_print_str(p, "_");
 230                 p = isl_printer_print_int(p, group->nr);
 231         }
 232
 233         return p;
 234 }
 235
 236 /* Collect all references to the given array and store pointers to them
 237  * in array->refs.
 238  *
 239  * If the array contains structures, then there is no need to collect
 240  * the references since we will not be computing any reference groups.
 241  */
 242 static void collect_references(struct gpu_prog *prog,
 243         struct gpu_array_info *array)
 244 {
 245         int i;
 246         int n;
 247
 248         if (array->has_compound_element)
 249                 return;
 250
 251         n = 0;
 252         for (i = 0; i < prog->n_stmts; ++i) {
 253                 struct gpu_stmt *stmt = &prog->stmts[i];
 254                 struct gpu_stmt_access *access;
 255
 256                 for (access = stmt->accesses; access; access = access->next) {
 257                         const char *name;
 258                         name = isl_map_get_tuple_name(access->access,
 259                                                       isl_dim_out);
 260                         if (name && !strcmp(array->name, name))
 261                                 n++;
 262                 }
 263         }
 264
 265         array->n_ref = n;
 266         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 267         assert(array->refs);
 268
 269         n = 0;
 270         for (i = 0; i < prog->n_stmts; ++i) {
 271                 struct gpu_stmt *stmt = &prog->stmts[i];
 272                 struct gpu_stmt_access *access;
 273
 274                 for (access = stmt->accesses; access; access = access->next) {
 275                         const char *name;
 276                         name = isl_map_get_tuple_name(access->access,
 277                                                       isl_dim_out);
 278                         if (!name || strcmp(array->name, name))
 279                                 continue;
 280
 281                         array->refs[n++] = access;
 282                 }
 283         }
 284 }
 285
 286 /* Create a gpu_array_tile for an array of dimension "n_index".
 287  */
 288 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 289 {
 290         int i;
 291         struct gpu_array_tile *tile;
 292
 293         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 294         assert(tile);
 295
 296         tile->n = n_index;
 297
 298         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 299         assert(tile->bound);
 300
 301         for (i = 0; i < n_index; ++i) {
 302                 tile->bound[i].size = NULL;
 303                 tile->bound[i].lb = NULL;
 304                 tile->bound[i].stride = NULL;
 305                 tile->bound[i].shift = NULL;
 306                 tile->bound[i].shift_map = NULL;
 307         }
 308
 309         return tile;
 310 }
 311
 312 static void *free_tile(struct gpu_array_tile *tile)
 313 {
 314         int j;
 315
 316         if (!tile)
 317                 return NULL;
 318
 319         for (j = 0; j < tile->n; ++j) {
 320                 isl_val_free(tile->bound[j].size);
 321                 isl_val_free(tile->bound[j].stride);
 322                 isl_aff_free(tile->bound[j].lb);
 323                 isl_aff_free(tile->bound[j].shift);
 324                 isl_basic_map_free(tile->bound[j].shift_map);
 325         }
 326         free(tile->bound);
 327         isl_multi_aff_free(tile->tiling);
 328         free(tile);
 329
 330         return NULL;
 331 }
 332
 333 static struct pet_array *find_array(struct ppcg_scop *scop,
 334         __isl_keep isl_set *accessed)
 335 {
 336         int i;
 337         isl_id *id;
 338
 339         id = isl_set_get_tuple_id(accessed);
 340
 341         for (i = 0; i < scop->n_array; ++i) {
 342                 isl_id *id_i;
 343
 344                 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
 345                 isl_id_free(id_i);
 346                 if (id == id_i)
 347                         break;
 348         }
 349         isl_id_free(id);
 350
 351         return i < scop->n_array ? scop->arrays[i] : NULL;
 352 }
 353
 354 /* Compute and return the extent of "array", taking into account the set of
 355  * accessed elements.
 356  *
 357  * In particular, the extent in the outer dimension is taken
 358  * from "accessed", while then extent in the remaing dimensions
 359  * are taken from array->extent.
 360  *
 361  * The extent in the outer dimension cannot be taken from array->extent
 362  * because that may be unbounded.  Furthermore, even if it is bounded,
 363  * it may be larger than the piece of the array that is being accessed.
 364  */
 365 static __isl_give isl_set *compute_extent(struct pet_array *array,
 366         __isl_keep isl_set *accessed)
 367 {
 368         int n_index;
 369         isl_id *id;
 370         isl_set *outer;
 371         isl_set *extent;
 372
 373         extent = isl_set_copy(array->extent);
 374
 375         n_index = isl_set_dim(accessed, isl_dim_set);
 376         if (n_index == 0)
 377                 return extent;
 378
 379         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 380         outer = isl_set_copy(accessed);
 381         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 382         extent = isl_set_flat_product(outer, extent);
 383         id = isl_set_get_tuple_id(accessed);
 384         extent = isl_set_set_tuple_id(extent, id);
 385
 386         return extent;
 387 }
 388
 389 /* Is the array "array" being extracted a read-only scalar?
 390  *
 391  * That is, is "array" a scalar that is never possibly written to.
 392  * An array containing structures is never considered to be a scalar.
 393  */
 394 static int is_read_only_scalar(struct gpu_array_info *array,
 395         struct gpu_prog *prog)
 396 {
 397         isl_set *space;
 398         isl_union_map *write;
 399         int empty;
 400
 401         if (array->has_compound_element)
 402                 return 0;
 403         if (array->n_index != 0)
 404                 return 0;
 405
 406         write = isl_union_map_copy(prog->may_write);
 407         space = isl_set_universe(isl_space_copy(array->space));
 408         write = isl_union_map_intersect_range(write,
 409                                                 isl_union_set_from_set(space));
 410         empty = isl_union_map_is_empty(write);
 411         isl_union_map_free(write);
 412
 413         return empty;
 414 }
 415
 416 /* Compute bounds on the host arrays based on the accessed elements
 417  * and collect all references to the array.
 418  *
 419  * If the array is zero-dimensional and does not contain structures,
 420  * i.e., if the array is a scalar, we check whether it is read-only.
 421  */
 422 static int extract_array_info(__isl_take isl_set *array, void *user)
 423 {
 424         int i;
 425         struct gpu_prog *prog = (struct gpu_prog *)user;
 426         const char *name;
 427         int n_index;
 428         isl_pw_aff **bounds;
 429         struct pet_array *pa;
 430         struct gpu_array_info *info;
 431         isl_set *extent;
 432
 433         info = &prog->array[prog->n_array];
 434         prog->n_array++;
 435
 436         n_index = isl_set_dim(array, isl_dim_set);
 437         name = isl_set_get_tuple_name(array);
 438         bounds = isl_alloc_array(isl_set_get_ctx(array),
 439                                  isl_pw_aff *, n_index);
 440         if (!bounds)
 441                 goto error;
 442
 443         info->space = isl_set_get_space(array);
 444         info->name = strdup(name);
 445         info->n_index = n_index;
 446         info->bound = bounds;
 447         info->linearize = prog->scop->options->linearize_device_arrays;
 448
 449         pa = find_array(prog->scop, array);
 450         if (!pa)
 451                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 452                         "unable to find array in scop", goto error);
 453
 454         info->type = strdup(pa->element_type);
 455         info->size = pa->element_size;
 456         info->local = pa->declared && !pa->exposed;
 457         info->has_compound_element = pa->element_is_record;
 458         info->read_only_scalar = is_read_only_scalar(info, prog);
 459
 460         extent = compute_extent(pa, array);
 461         for (i = 0; i < n_index; ++i) {
 462                 isl_set *dom;
 463                 isl_local_space *ls;
 464                 isl_aff *one;
 465                 isl_pw_aff *bound;
 466
 467                 bound = isl_set_dim_max(isl_set_copy(extent), i);
 468                 assert(bound);
 469                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 470                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 471                 one = isl_aff_zero_on_domain(ls);
 472                 one = isl_aff_add_constant_si(one, 1);
 473                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 474                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 475
 476                 bounds[i] = bound;
 477                 if (!isl_pw_aff_is_cst(bound))
 478                         info->linearize = 1;
 479         }
 480         info->extent = extent;
 481
 482         collect_references(prog, info);
 483
 484         isl_set_free(array);
 485         return 0;
 486 error:
 487         isl_set_free(array);
 488         return -1;
 489 }
 490
 491 /* Compute a mapping from all outer arrays (of structs) in scop
 492  * to their innermost arrays.
 493  *
 494  * In particular, for each array of a primitive type, the result
 495  * contains the identity mapping on that array.
 496  * For each array involving member accesses, the result
 497  * contains a mapping from the elements of the outer array of structs
 498  * to all corresponding elements of the innermost nested arrays.
 499  */
 500 static __isl_give isl_union_map *compute_to_inner(struct ppcg_scop *scop)
 501 {
 502         int i;
 503         isl_union_map *to_inner;
 504
 505         to_inner = isl_union_map_empty(isl_set_get_space(scop->context));
 506
 507         for (i = 0; i < scop->n_array; ++i) {
 508                 struct pet_array *array = scop->arrays[i];
 509                 isl_set *set;
 510                 isl_map *map;
 511
 512                 if (array->element_is_record)
 513                         continue;
 514
 515                 set = isl_set_copy(array->extent);
 516                 map = isl_set_identity(isl_set_copy(set));
 517
 518                 while (set && isl_set_is_wrapping(set)) {
 519                         isl_id *id;
 520                         isl_map *wrapped;
 521
 522                         id = isl_set_get_tuple_id(set);
 523                         wrapped = isl_set_unwrap(set);
 524                         wrapped = isl_map_domain_map(wrapped);
 525                         wrapped = isl_map_set_tuple_id(wrapped, isl_dim_in, id);
 526                         map = isl_map_apply_domain(map, wrapped);
 527                         set = isl_map_domain(isl_map_copy(map));
 528                 }
 529
 530                 map = isl_map_gist_domain(map, set);
 531
 532                 to_inner = isl_union_map_add_map(to_inner, map);
 533         }
 534
 535         return to_inner;
 536 }
 537
 538 /* Construct a gpu_array_info for each array possibly accessed by "prog" and
 539  * collect them in prog->array.
 540  *
 541  * If there are any member accesses involved, then they are first mapped
 542  * to the outer arrays of structs.
 543  */
 544 static int collect_array_info(struct gpu_prog *prog)
 545 {
 546         int r;
 547         isl_union_set *arrays;
 548
 549         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 550         arrays = isl_union_set_union(arrays,
 551                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 552
 553         arrays = isl_union_set_apply(arrays,
 554                                         isl_union_map_copy(prog->to_outer));
 555
 556         arrays = isl_union_set_coalesce(arrays);
 557
 558         prog->n_array = isl_union_set_n_set(arrays);
 559         prog->array = isl_calloc_array(prog->ctx,
 560                                      struct gpu_array_info, prog->n_array);
 561         assert(prog->array);
 562         prog->n_array = 0;
 563         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 564         isl_union_set_free(arrays);
 565
 566         return r;
 567 }
 568
 569 static void free_array_info(struct gpu_prog *prog)
 570 {
 571         int i, j;
 572
 573         for (i = 0; i < prog->n_array; ++i) {
 574                 int n_index = prog->array[i].n_index;
 575                 free(prog->array[i].type);
 576                 free(prog->array[i].name);
 577                 for (j = 0; j < n_index; ++j)
 578                         isl_pw_aff_free(prog->array[i].bound[j]);
 579                 isl_space_free(prog->array[i].space);
 580                 isl_set_free(prog->array[i].extent);
 581                 free(prog->array[i].bound);
 582                 free(prog->array[i].refs);
 583         }
 584         free(prog->array);
 585 }
 586
 587 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 588  * as an array or through a pointer reference, but as a single data element.
 589  * At the moment, scalars are represented as zero-dimensional arrays.
 590  * A zero-dimensional array containing structures is not considered
 591  * to be a scalar.
 592  */
 593 int gpu_array_is_scalar(struct gpu_array_info *array)
 594 {
 595         return !array->has_compound_element && array->n_index == 0;
 596 }
 597
/* Is "array" a read-only scalar?
 *
 * This simply returns the read_only_scalar field of "array",
 * which is set by extract_array_info from the result of
 * is_read_only_scalar.
 */
int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
{
        return array->read_only_scalar;
}
 604
/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space.
 * "res" is left untouched if no set with the given name is encountered.
 */
struct ppcg_extract_size_data {
        const char *type;
        isl_set *res;
};
 613
 614 /* This function is called for each set in a union_set.
 615  * If the name of the set matches data->type, we store the
 616  * set in data->res.
 617  */
 618 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 619 {
 620         struct ppcg_extract_size_data *data = user;
 621         const char *name;
 622
 623         name = isl_set_get_tuple_name(size);
 624         if (name && !strcmp(name, data->type)) {
 625                 data->res = size;
 626                 return -1;
 627         }
 628
 629         isl_set_free(size);
 630         return 0;
 631 }
 632
 633 /* Given a union map { kernel[i] -> *[...] },
 634  * return the range in the space called "type" for the kernel with
 635  * sequence number "id".
 636  */
 637 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 638         const char *type, int id)
 639 {
 640         isl_space *space;
 641         isl_set *dom;
 642         isl_union_set *local_sizes;
 643         struct ppcg_extract_size_data data = { type, NULL };
 644
 645         if (!sizes)
 646                 return NULL;
 647
 648         space = isl_union_map_get_space(sizes);
 649         space = isl_space_set_from_params(space);
 650         space = isl_space_add_dims(space, isl_dim_set, 1);
 651         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 652         dom = isl_set_universe(space);
 653         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 654
 655         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 656                                         isl_union_map_copy(sizes));
 657         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 658         isl_union_set_free(local_sizes);
 659         return data.res;
 660 }
 661
 662 /* Given a singleton set, extract the first (at most *len) elements
 663  * of the single integer tuple into *sizes and update *len if needed.
 664  */
 665 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 666 {
 667         int i;
 668         int dim;
 669
 670         if (!set)
 671                 return;
 672
 673         dim = isl_set_dim(set, isl_dim_set);
 674         if (dim < *len)
 675                 *len = dim;
 676
 677         for (i = 0; i < *len; ++i) {
 678                 isl_val *v;
 679
 680                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 681                 assert(v);
 682
 683                 sizes[i] = isl_val_get_num_si(v);
 684                 isl_val_free(v);
 685         }
 686
 687         isl_set_free(set);
 688 }
 689
 690 /* Extract user specified "tile" sizes from the "sizes" command line option,
 691  * defaulting to option->tile_size in each dimension.
 692  */
 693 static void read_tile_sizes(struct gpu_gen *gen)
 694 {
 695         int n;
 696         isl_set *size;
 697
 698         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 699         assert(gen->tile_size);
 700         for (n = 0; n < gen->tile_len; ++n)
 701                 gen->tile_size[n] = gen->options->tile_size;
 702
 703         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 704         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 705
 706         if (gen->n_parallel > gen->tile_len)
 707                 gen->n_parallel = gen->tile_len;
 708 }
 709
 710 /* Extract user specified "block" sizes from the "sizes" command line option,
 711  * after filling in some potentially useful defaults.
 712  */
 713 static void read_block_sizes(struct gpu_gen *gen)
 714 {
 715         int n;
 716         isl_set *size;
 717
 718         n = gen->n_parallel;
 719         gen->n_block = (n <= 3) ? n : 3;
 720         switch (gen->n_block) {
 721         case 1:
 722                 gen->block_dim[0] = 512;
 723                 break;
 724         case 2:
 725                 gen->block_dim[0] = 32;
 726                 gen->block_dim[1] = 16;
 727                 break;
 728         default:
 729                 gen->block_dim[0] = 32;
 730                 gen->block_dim[1] = 4;
 731                 gen->block_dim[2] = 4;
 732                 break;
 733         }
 734
 735         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 736         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 737 }
 738
 739 /* Extract user specified "grid" sizes from the "sizes" command line option,
 740  * after filling in some potentially useful defaults.
 741  */
 742 static void read_grid_sizes(struct gpu_gen *gen)
 743 {
 744         int n = gen->n_parallel;
 745         isl_set *size;
 746
 747         gen->n_grid = (n <= 2) ? n : 2;
 748         switch (gen->n_grid) {
 749         case 1:
 750                 gen->grid_dim[0] = 32768;
 751                 break;
 752         default:
 753                 gen->grid_dim[0] = 256;
 754                 gen->grid_dim[1] = 256;
 755                 break;
 756         }
 757
 758         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 759         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 760 }
 761
/* Extract user specified sizes from the "sizes" command line option
 * after filling in some potentially useful defaults.
 *
 * The tile sizes are read first because read_tile_sizes may reduce
 * gen->n_parallel, which in turn determines the default number of
 * block and grid dimensions.
 */
static void read_sizes(struct gpu_gen *gen)
{
        read_tile_sizes(gen);
        read_block_sizes(gen);
        read_grid_sizes(gen);
}
 771
 772 static void *free_stmts(struct gpu_stmt *stmts, int n)
 773 {
 774         int i;
 775
 776         if (!stmts)
 777                 return NULL;
 778
 779         for (i = 0; i < n; ++i) {
 780                 struct gpu_stmt_access *access, *next;
 781
 782                 for (access = stmts[i].accesses; access; access = next) {
 783                         next = access->next;
 784                         isl_id_free(access->ref_id);
 785                         isl_map_free(access->access);
 786                         isl_map_free(access->tagged_access);
 787                         free(access);
 788                 }
 789
 790                 isl_id_free(stmts[i].id);
 791         }
 792         free(stmts);
 793
 794         return NULL;
 795 }
 796
 797 /* Construct a map from a domain of dimensionality "len"
 798  * to a domain of dimensionality "len" + "tile_len" that tiles
 799  * the "tile_len" coordinates starting at "first".
 800  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 801  * "dim" prescribes the parameters.
 802  */
 803 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 804         int first, int tile_len, int *tile_size)
 805 {
 806         int i;
 807         isl_basic_map *bmap;
 808         isl_constraint *c;
 809         isl_local_space *ls;
 810
 811         dim = isl_space_add_dims(dim, isl_dim_in, len);
 812         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 813         bmap = isl_basic_map_universe(isl_space_copy(dim));
 814         ls = isl_local_space_from_space(dim);
 815
 816         for (i = 0; i < len - tile_len; ++i) {
 817                 int j = i < first ? i : i + tile_len;
 818                 int k = i < first ? i : i + 2 * tile_len;
 819
 820                 c = isl_equality_alloc(isl_local_space_copy(ls));
 821                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 822                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 823                 bmap = isl_basic_map_add_constraint(bmap, c);
 824         }
 825
 826         for (i = 0; i < tile_len; ++i) {
 827                 c = isl_equality_alloc(isl_local_space_copy(ls));
 828                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 829                                                 first + i, -1);
 830                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 831                                                 first + i, tile_size[i]);
 832                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 833                                                 first + i + tile_len, 1);
 834                 bmap = isl_basic_map_add_constraint(bmap, c);
 835
 836                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 837                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 838                                                    first + i + tile_len, 1);
 839                 bmap = isl_basic_map_add_constraint(bmap, c);
 840
 841                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 842                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 843                                                    first + i + tile_len, -1);
 844                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 845                 bmap = isl_basic_map_add_constraint(bmap, c);
 846         }
 847
 848         isl_local_space_free(ls);
 849
 850         return isl_map_from_basic_map(bmap);
 851 }
 852
 853 /* Construct a map from a domain of dimensionality "len"
 854  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 855  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 856  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 857  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 858  * that are projected out at the end.
 859  * "dim" prescribes the parameters.
 860  */
 861 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 862         int first, int wrap_len, int *wrap_size)
 863 {
 864         int i;
 865         isl_basic_map *bmap;
 866         isl_constraint *c;
 867         isl_local_space *ls;
 868
 869         dim = isl_space_add_dims(dim, isl_dim_in, len);
 870         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 871         bmap = isl_basic_map_universe(isl_space_copy(dim));
 872         ls = isl_local_space_from_space(dim);
 873
 874         for (i = 0; i < len; ++i) {
 875                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 876
 877                 c = isl_equality_alloc(isl_local_space_copy(ls));
 878                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 879                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 880                 bmap = isl_basic_map_add_constraint(bmap, c);
 881         }
 882
 883         for (i = 0; i < wrap_len; ++i) {
 884                 c = isl_equality_alloc(isl_local_space_copy(ls));
 885                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 886                                                     first + i, -1);
 887                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 888                                                     first + wrap_len + i, 1);
 889                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 890                                     first + 2 * wrap_len + i, wrap_size[i]);
 891                 bmap = isl_basic_map_add_constraint(bmap, c);
 892
 893                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 894                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 895                                                     first + wrap_len + i, 1);
 896                 bmap = isl_basic_map_add_constraint(bmap, c);
 897
 898                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 899                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 900                                                     first + wrap_len + i, -1);
 901                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 902                 bmap = isl_basic_map_add_constraint(bmap, c);
 903         }
 904
 905         isl_local_space_free(ls);
 906
 907         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 908                                 first + 2 * wrap_len, wrap_len);
 909
 910         return isl_map_from_basic_map(bmap);
 911 }
 912
 913 /* Add "n" parameters named prefix%d.
 914  */
 915 static __isl_give isl_set *add_params( __isl_take isl_set *set,
 916         int n, const char *prefix)
 917 {
 918         int i;
 919         unsigned nparam;
 920         char name[20];
 921
 922         nparam = isl_set_dim(set, isl_dim_param);
 923         set = isl_set_add_dims(set, isl_dim_param, n);
 924
 925         for (i = 0; i < n; ++i) {
 926                 snprintf(name, sizeof(name), "%s%d", prefix, i);
 927                 set = isl_set_set_dim_name(set, isl_dim_param,
 928                                             nparam + i, name);
 929         }
 930
 931         return set;
 932 }
 933
 934 /* Equate the "n" dimensions of "set" starting at "first" to
 935  * freshly created parameters named prefix%d.
 936  */
 937 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
 938         int first, int n, const char *prefix)
 939 {
 940         int i;
 941         unsigned nparam;
 942
 943         nparam = isl_set_dim(set, isl_dim_param);
 944
 945         set = add_params(set, n, prefix);
 946
 947         for (i = 0; i < n; ++i)
 948                 set = isl_set_equate(set, isl_dim_param, nparam + i,
 949                                         isl_dim_set, first + i);
 950
 951         return set;
 952 }
 953
 954 /* Given a parameter space "space", create a set of dimension "len"
 955  * of which the "n" dimensions starting at "first" are equated to
 956  * freshly created parameters named prefix%d.
 957  */
 958 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
 959         int len, int first, int n, const char *prefix)
 960 {
 961         isl_set *set;
 962
 963         space = isl_space_set_from_params(space);
 964         space = isl_space_add_dims(space, isl_dim_set, len);
 965         set = isl_set_universe(space);
 966
 967         return parametrize(set, first, n, prefix);
 968 }
 969
 970 /* Tile the B loops over the tile sizes and then tile/wrap
 971  * the T1 loops over the blocks.
 972  */
 973 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 974         __isl_take isl_union_map *sched)
 975 {
 976         isl_space *dim;
 977         isl_map *tiling, *block_tiling;
 978
 979         dim = isl_union_map_get_space(sched);
 980         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 981                       gen->tile_first, gen->tile_len, gen->tile_size);
 982
 983         if (gen->options->wrap)
 984                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
 985                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 986         else
 987                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
 988                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 989
 990         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
 991
 992         tiling = isl_map_apply_range(tiling, block_tiling);
 993
 994         sched = isl_union_map_apply_range(sched,
 995                                              isl_union_map_from_map(tiling));
 996
 997         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 998
 999         return sched;
1000 }
1001
1002 /* Equate the "T1P" iterators in the tiled schedule "sched"
1003  * to the block dimensions.
1004  */
1005 static __isl_give isl_union_map *parametrize_tiled_schedule(
1006         struct gpu_gen *gen, __isl_take isl_union_map *sched)
1007 {
1008         isl_space *dim;
1009         isl_set *par;
1010
1011         dim = isl_union_map_get_space(sched);
1012         par = parametrization(dim, gen->tiled_len,
1013                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1014         sched = isl_union_map_intersect_range(sched,
1015                                                 isl_union_set_from_set(par));
1016
1017         return sched;
1018 }
1019
1020 /* Tile/wrap the P1 loops over the threads.
1021  */
1022 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
1023         __isl_take isl_union_map *sched)
1024 {
1025         isl_space *dim;
1026         isl_map *tiling;
1027         isl_set *par;
1028
1029         dim = isl_union_map_get_space(sched);
1030
1031         if (gen->options->wrap)
1032                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1033                                 gen->shared_len, gen->n_block, gen->block_dim);
1034         else
1035                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1036                                 gen->shared_len, gen->n_block, gen->block_dim);
1037         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1038
1039         sched = isl_union_map_apply_range(sched,
1040                                              isl_union_map_from_map(tiling));
1041
1042         par = parametrization(dim, gen->thread_tiled_len,
1043                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1044                 gen->n_block, "t");
1045         sched = isl_union_map_intersect_range(sched,
1046                                                 isl_union_set_from_set(par));
1047
1048         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1049
1050         return sched;
1051 }
1052
1053 /* If the user asked for it, scale the shared memory tile loops
1054  * (T1T and T2) of "sched" by gen->tile_size[i].
1055  * If we are not performing "wrapping", then additionally scale the T1P
1056  * loops by gen->grid_dim[i].
1057  */
1058 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
1059         __isl_take isl_union_map *sched)
1060 {
1061         int i;
1062         isl_space *dim;
1063         isl_basic_map *scale;
1064         isl_constraint *c;
1065         isl_local_space *ls;
1066
1067         if (!gen->options->scale_tile_loops)
1068                 return sched;
1069
1070         dim = isl_union_map_get_space(sched);
1071         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1072         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1073         scale = isl_basic_map_universe(isl_space_copy(dim));
1074         ls = isl_local_space_from_space(dim);
1075
1076         for (i = 0; i < gen->tiled_len; ++i) {
1077                 int f = 1;
1078
1079                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1080                         f = gen->tile_size[i - gen->tile_first];
1081                         if (!gen->options->wrap)
1082                                 f *= gen->grid_dim[i - gen->tile_first];
1083                 } else if (i >= gen->tile_first + gen->n_grid &&
1084                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1085                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1086                 }
1087
1088                 c = isl_equality_alloc(isl_local_space_copy(ls));
1089                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1090                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1091                 scale = isl_basic_map_add_constraint(scale, c);
1092         }
1093
1094         isl_local_space_free(ls);
1095
1096         sched = isl_union_map_apply_range(sched,
1097                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1098
1099         return sched;
1100 }
1101
1102 /* If we are not performing "wrapping" and if the user asked for it,
1103  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1104  */
1105 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1106         __isl_take isl_union_map *sched)
1107 {
1108         int i;
1109         isl_space *dim;
1110         isl_basic_map *scale;
1111         isl_constraint *c;
1112         isl_local_space *ls;
1113
1114         if (gen->options->wrap)
1115                 return sched;
1116         if (!gen->options->scale_tile_loops)
1117                 return sched;
1118
1119         dim = isl_union_map_get_space(sched);
1120         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1121         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1122         scale = isl_basic_map_universe(isl_space_copy(dim));
1123         ls = isl_local_space_from_space(dim);
1124
1125         for (i = 0; i < gen->thread_tiled_len; ++i) {
1126                 int f = 1;
1127
1128                 if (i >= gen->shared_len &&
1129                     i < gen->shared_len + gen->n_block)
1130                         f = gen->block_dim[i - gen->shared_len];
1131
1132                 c = isl_equality_alloc(isl_local_space_copy(ls));
1133                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1134                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1135                 scale = isl_basic_map_add_constraint(scale, c);
1136         }
1137
1138         isl_local_space_free(ls);
1139
1140         sched = isl_union_map_apply_range(sched,
1141                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1142
1143         return sched;
1144 }
1145
1146 /* If we are not performing "wrapping" and if the user asked for it,
1147  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1148  */
1149 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1150         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1151 {
1152         int i;
1153         isl_space *dim;
1154         isl_basic_map *scale;
1155         isl_constraint *c;
1156         isl_local_space *ls;
1157
1158         if (gen->options->wrap)
1159                 return sched;
1160         if (!gen->options->scale_tile_loops)
1161                 return sched;
1162
1163         dim = isl_union_map_get_space(sched);
1164         dim = isl_space_add_dims(dim, isl_dim_in, len);
1165         dim = isl_space_add_dims(dim, isl_dim_out, len);
1166         scale = isl_basic_map_universe(isl_space_copy(dim));
1167         ls = isl_local_space_from_space(dim);
1168
1169         for (i = 0; i < len; ++i) {
1170                 int f = 1;
1171
1172                 if (i >= first && i < first + n_tile)
1173                         f = gen->kernel->block_dim[i - first];
1174
1175                 c = isl_equality_alloc(isl_local_space_copy(ls));
1176                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1177                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1178                 scale = isl_basic_map_add_constraint(scale, c);
1179         }
1180
1181         isl_local_space_free(ls);
1182
1183         sched = isl_union_map_apply_range(sched,
1184                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1185
1186         return sched;
1187 }
1188
1189 /* Add "len" parameters p[i] called prefix%d,
1190  * with bounds to 0 <= p[i] < size[i].
1191  */
1192 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1193         int len, int *size, const char *prefix)
1194 {
1195         int i;
1196         unsigned nparam;
1197         isl_space *dim;
1198         isl_basic_set *bset;
1199         isl_constraint *c;
1200         isl_local_space *ls;
1201         char name[20];
1202
1203         nparam = isl_set_dim(set, isl_dim_param);
1204         set = isl_set_add_dims(set, isl_dim_param, len);
1205
1206         for (i = 0; i < len; ++i) {
1207                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1208                 set = isl_set_set_dim_name(set, isl_dim_param,
1209                                             nparam + i, name);
1210         }
1211
1212         dim = isl_set_get_space(set);
1213         bset = isl_basic_set_universe(isl_space_copy(dim));
1214         ls = isl_local_space_from_space(dim);
1215
1216         for (i = 0; i < len; ++i) {
1217                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1218                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1219                                                         nparam + i, 1);
1220                 bset = isl_basic_set_add_constraint(bset, c);
1221
1222                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1223                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1224                                                         nparam + i, -1);
1225                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1226                 bset = isl_basic_set_add_constraint(bset, c);
1227         }
1228
1229         isl_local_space_free(ls);
1230
1231         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1232 }
1233
1234 /* Add "len" parameters p[i] called prefix%d,
1235  * with bounds to 0 <= p[i] < size[i].
1236  */
1237 static __isl_give isl_set *add_bounded_parameters_dynamic(
1238         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1239         const char *prefix)
1240 {
1241         int i, len;
1242         unsigned nparam;
1243         isl_space *space;
1244         isl_local_space *ls;
1245         char name[20];
1246
1247         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1248         nparam = isl_set_dim(set, isl_dim_param);
1249         set = isl_set_add_dims(set, isl_dim_param, len);
1250
1251         for (i = 0; i < len; ++i) {
1252                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1253                 set = isl_set_set_dim_name(set, isl_dim_param,
1254                                             nparam + i, name);
1255         }
1256
1257         space = isl_space_params(isl_set_get_space(set));
1258         ls = isl_local_space_from_space(space);
1259         for (i = 0; i < len; ++i) {
1260                 isl_pw_aff *param, *size_i, *zero;
1261                 isl_set *bound;
1262
1263                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1264                                                 isl_dim_param, nparam + i);
1265
1266                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1267                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1268                 set = isl_set_intersect_params(set, bound);
1269
1270                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1271                 bound = isl_pw_aff_ge_set(param, zero);
1272                 set = isl_set_intersect_params(set, bound);
1273         }
1274         isl_local_space_free(ls);
1275
1276         return set;
1277 }
1278
1279 /* Construct a map from an access to group->array to the corresponding
1280  * shared/private memory tile.
1281  * The map is of the form
1282  *
1283  *      { [D[i] -> A[a]] -> T[t] }
1284  *
1285  * where D represents the initial shared_len dimensions
1286  * of the computed schedule.
1287  */
1288 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1289 {
1290         struct gpu_array_tile *tile;
1291         isl_multi_aff *tiling;
1292
1293         tile = group->private_tile;
1294         if (!tile)
1295                 tile = group->shared_tile;
1296
1297         tiling = isl_multi_aff_copy(tile->tiling);
1298
1299         return isl_map_from_multi_aff(tiling);
1300 }
1301
1302 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1303  */
1304 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1305         unsigned pos)
1306 {
1307         isl_val *v;
1308         int fixed;
1309
1310         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1311         if (!v)
1312                 return -1;
1313         fixed = isl_val_is_int(v);
1314         isl_val_free(v);
1315
1316         return fixed;
1317 }
1318
1319 /* Given a schedule that iterates over all elements in a piece of an array,
1320  * perform tiling/wrapping over the threads.
1321  *
1322  * In particular, we tile the final iterators so that the final thread
1323  * dimension runs over the final array dimension.
1324  * However, if those final iterators have only a single iteration,
1325  * we try to tile earlier iterators instead.
1326  */
1327 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1328         __isl_take isl_map *sched)
1329 {
1330         isl_space *dim;
1331         isl_union_map *usched;
1332         isl_map *tiling;
1333         isl_set *par;
1334         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1335         int n_tile;
1336         int first;
1337
1338         n_tile = gen->kernel->n_block;
1339         if (n_tile > nvar) {
1340                 int i;
1341                 sched = isl_map_insert_dims(sched,
1342                                                 isl_dim_out, 0, n_tile - nvar);
1343                 for (i = 0; i < n_tile - nvar; ++i)
1344                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1345                 nvar = n_tile;
1346         }
1347
1348         first = nvar - n_tile;
1349
1350         for (; first > 0; first --)
1351                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1352                         break;
1353
1354         dim = isl_map_get_space(sched);
1355         dim = isl_space_params(dim);
1356         if (gen->options->wrap)
1357                 tiling = wrap(isl_space_copy(dim), nvar, first,
1358                                 n_tile, gen->kernel->block_dim);
1359         else
1360                 tiling = tile(isl_space_copy(dim), nvar, first,
1361                                 n_tile, gen->kernel->block_dim);
1362         sched = isl_map_apply_range(sched, tiling);
1363
1364         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1365         sched = isl_map_intersect_range(sched, par);
1366
1367         usched = isl_union_map_from_map(sched);
1368         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1369                                          first, n_tile);
1370         sched = isl_map_from_union_map(usched);
1371
1372         return sched;
1373 }
1374
1375 /* Return the union of all read (read = 1) and/or write (write = 1)
1376  * access relations in the group.
1377  */
1378 static __isl_give isl_union_map *group_access_relation(
1379         struct gpu_array_ref_group *group, int read, int write)
1380 {
1381         int i;
1382         isl_union_map *access;
1383
1384         access = isl_union_map_empty(isl_map_get_space(group->access));
1385         for (i = 0; i < group->n_ref; ++i) {
1386                 isl_map *map_i;
1387
1388                 if (!((read && group->refs[i]->read) ||
1389                      (write && group->refs[i]->write)))
1390                         continue;
1391                 map_i = isl_map_copy(group->refs[i]->access);
1392                 access = isl_union_map_union(access,
1393                                             isl_union_map_from_map(map_i));
1394         }
1395
1396         return access;
1397 }
1398
1399 /* Return the union of all tagged access relations in the group.
1400  */
1401 static __isl_give isl_union_map *group_tagged_access_relation(
1402         struct gpu_array_ref_group *group)
1403 {
1404         int i;
1405         isl_union_map *access;
1406
1407         access = isl_union_map_empty(isl_map_get_space(group->access));
1408         for (i = 0; i < group->n_ref; ++i) {
1409                 isl_map *map_i;
1410
1411                 map_i = isl_map_copy(group->refs[i]->tagged_access);
1412                 access = isl_union_map_union(access,
1413                                             isl_union_map_from_map(map_i));
1414         }
1415
1416         return access;
1417 }
1418
1419 /* Return the extent of "array", recomputed from the bounds.
1420  * The recomputed extent may be simpler than the original extent.
1421  */
1422 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1423 {
1424         int i;
1425         isl_id *id;
1426         isl_space *space;
1427         isl_local_space *ls;
1428         isl_set *extent;
1429
1430         id = isl_set_get_tuple_id(array->extent);
1431         space = isl_set_get_space(array->extent);
1432         extent = isl_set_universe(isl_space_copy(space));
1433         ls = isl_local_space_from_space(space);
1434         for (i = 0; i < array->n_index; ++i) {
1435                 isl_pw_aff *bound;
1436                 isl_aff *aff;
1437                 isl_pw_aff *index;
1438                 isl_set *lt;
1439
1440                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1441
1442                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1443                                                 isl_dim_set, i);
1444                 index = isl_pw_aff_from_aff(aff);
1445                 bound = isl_pw_aff_copy(array->bound[i]);
1446                 bound = isl_pw_aff_from_range(bound);
1447                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1448                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1449                                                 isl_id_copy(id));
1450                 lt = isl_pw_aff_lt_set(index, bound);
1451                 extent = isl_set_intersect(extent, lt);
1452         }
1453         isl_local_space_free(ls);
1454         isl_id_free(id);
1455
1456         return extent;
1457 }
1458
1459 /* Return a map from the first shared_len dimensions of the computed
1460  * schedule to the array tile in
1461  * global memory that corresponds to the shared memory copy.
1462  *
1463  * In particular, return a map
1464  *
1465  *      { D[i] -> A[a] }
1466  *
1467  * with constraints
1468  *
1469  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1470  *
1471  * and
1472  *
1473  *      0 <= a <= array_size - 1                                        (2)
1474  *
1475  * Note that if some stride has been detected (i.e., when
1476  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1477  * to the shifted and scaled down version.
1478  *
1479  * Constraints (1) are obtained by mapping the size constraints on the
1480  * shared/private memory tile back to the access relation.
1481  * Constraints (2) are obtained from the (recomputed) extent.
1482  */
1483 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1484 {
1485         int i;
1486         int n_index = group->array->n_index;
1487         isl_map *tile;
1488         isl_space *space;
1489         isl_set *local;
1490         isl_set *extent;
1491
1492         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1493         space = isl_space_range(space);
1494         local = isl_set_universe(space);
1495         for (i = 0; i < n_index; ++i) {
1496                 isl_val *bound;
1497
1498                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1499                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1500                 bound = isl_val_sub_ui(bound, 1);
1501                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1502         }
1503         local = isl_set_preimage_multi_aff(local,
1504                                 isl_multi_aff_copy(group->shared_tile->tiling));
1505         tile = isl_set_unwrap(local);
1506         extent = array_extent(group->array);
1507         tile = isl_map_intersect_range(tile, extent);
1508
1509         return tile;
1510 }
1511
1512 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1513  * return the corresponding mapping from the AST schedule to
1514  * to the first shared_len dimensions of the schedule computed by PPCG.
1515  */
1516 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1517         __isl_take isl_pw_multi_aff *iterator_map)
1518 {
1519         isl_union_map *umap;
1520         isl_space *space;
1521         isl_map *map, *sched;;
1522
1523         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1524         space = isl_space_from_domain(space);
1525         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1526
1527         umap = isl_union_map_copy(gen->shared_sched);
1528         umap = isl_union_map_apply_range(umap,
1529                         isl_union_map_copy(gen->shared_proj));
1530         map = isl_union_map_extract_map(umap, space);
1531         isl_union_map_free(umap);
1532
1533         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1534         sched = isl_map_detect_equalities(sched);
1535
1536         return isl_pw_multi_aff_from_map(sched);
1537 }
1538
1539 /* Set unroll[j] if the input dimension j is involved in
1540  * the index expression represented by ma.
1541  */
1542 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1543         void *user)
1544 {
1545         int i, j;
1546         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1547         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1548         int *unroll = user;
1549
1550         for (i = 0; i < n_out; ++i) {
1551                 isl_aff *aff;
1552
1553                 aff = isl_multi_aff_get_aff(ma, i);
1554                 for (j = 0; j < n_in; ++j)
1555                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1556                                 unroll[j] = 1;
1557                 isl_aff_free(aff);
1558         }
1559
1560         isl_set_free(set);
1561         isl_multi_aff_free(ma);
1562         return 0;
1563 }
1564
1565 /* Given an array pos mapping input dimensions to the corresponding
1566  * output dimension, construct the corresponding map.
1567  */
1568 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1569         int *pos, int len)
1570 {
1571         int i;
1572         isl_constraint *c;
1573         isl_basic_map *bmap;
1574         isl_local_space *ls;
1575
1576         dim = isl_space_add_dims(dim, isl_dim_in, len);
1577         dim = isl_space_add_dims(dim, isl_dim_out, len);
1578         bmap = isl_basic_map_universe(isl_space_copy(dim));
1579         ls = isl_local_space_from_space(dim);
1580
1581         for (i = 0; i < len; ++i) {
1582                 c = isl_equality_alloc(isl_local_space_copy(ls));
1583                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1584                                                       -1);
1585                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1586                                                       1);
1587                 bmap = isl_basic_map_add_constraint(bmap, c);
1588         }
1589         isl_local_space_free(ls);
1590
1591         return isl_map_from_basic_map(bmap);
1592 }
1593
1594 /* Find all loops involved in any of the index expressions for any of
1595  * the private accesses, move them innermost and then mark them as
1596  * requiring unrolling by setting gen->first_unroll.
1597  * The loops involved should all be parallel because of the checks
1598  * we performed in check_private_group_access.  Moving them innermost
1599  * is therefore a valid transformation.
1600  *
1601  * Loops up to gen->shared_len are generated before the mapping to
1602  * threads is applied.  They should therefore be ignored.
1603  *
1604  * We compute the hidden equalities of the schedule first
1605  * since we will need them in our calls to isl_pw_multi_aff_from_map
1606  * and because we want to make sure that the same equalities
1607  * are also available to the code generator.
1608  */
1609 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1610         __isl_take isl_union_map *sched)
1611 {
1612         int i, j;
1613         int unroll[gen->thread_tiled_len];
1614         int perm[gen->thread_tiled_len];
1615         isl_space *dim;
1616         isl_map *permute;
1617         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1618
1619         gen->first_unroll = -1;
1620
1621         sched = isl_union_map_detect_equalities(sched);
1622         for (i = 0; i < gen->thread_tiled_len; ++i)
1623                 unroll[i] = 0;
1624         for (i = 0; i < gen->prog->n_array; ++i) {
1625                 struct gpu_array_info *array = &gen->prog->array[i];
1626
1627                 for (j = 0; j < array->n_group; ++j) {
1628                         isl_union_map *access;
1629                         isl_map *acc;
1630                         isl_pw_multi_aff *pma;
1631
1632                         if (!array->groups[j]->private_tile)
1633                                 continue;
1634
1635                         access = group_access_relation(array->groups[j], 1, 1);
1636                         access = isl_union_map_apply_domain(access,
1637                                                 isl_union_map_copy(sched));
1638
1639                         acc = isl_map_from_union_map(access);
1640                         pma = isl_pw_multi_aff_from_map(acc);
1641                         isl_pw_multi_aff_foreach_piece(pma,
1642                                                         &check_unroll, unroll);
1643
1644                         isl_pw_multi_aff_free(pma);
1645                 }
1646         }
1647
1648         for (i = gen->shared_len; i < len; ++i)
1649                 if (unroll[i])
1650                         break;
1651
1652         if (i >= len)
1653                 return sched;
1654
1655         for (i = len; i < gen->thread_tiled_len; ++i)
1656                 if (unroll[i])
1657                         return sched;
1658
1659         j = 0;
1660         for (i = 0; i < gen->shared_len; ++i)
1661                 perm[i] = j++;
1662         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1663                 if (!unroll[i])
1664                         perm[i] = j++;
1665         gen->first_unroll = j - gen->shared_len;
1666         for (i = gen->shared_len; i < len; ++i)
1667                 if (unroll[i])
1668                         perm[i] = j++;
1669
1670         dim = isl_union_map_get_space(sched);
1671         permute = permutation(dim, perm, gen->thread_tiled_len);
1672         sched = isl_union_map_apply_range(sched,
1673                                           isl_union_map_from_map(permute));
1674
1675         return sched;
1676 }
1677
1678 /* Given a constraint
1679  *
1680  *              a(p,i) + j = g f(e)
1681  *
1682  * or -a(p,i) - j = g f(e) if sign < 0,
1683  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1684  * a(p,i) is assumed to be an expression in only the parameters
1685  * and the input dimensions.
1686  */
1687 static void extract_stride(__isl_keep isl_constraint *c,
1688         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1689 {
1690         int i;
1691         isl_val *v;
1692         isl_space *space;
1693         unsigned nparam;
1694         unsigned nvar;
1695         isl_aff *aff;
1696
1697         isl_val_free(bound->stride);
1698         bound->stride = isl_val_copy(stride);
1699
1700         space = isl_constraint_get_space(c);
1701         space = isl_space_domain(space);
1702
1703         nparam = isl_space_dim(space, isl_dim_param);
1704         nvar = isl_space_dim(space, isl_dim_set);
1705
1706         v = isl_constraint_get_constant_val(c);
1707         if (sign < 0)
1708                 v = isl_val_neg(v);
1709         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1710         aff = isl_aff_set_constant_val(aff, v);
1711
1712         for (i = 0; i < nparam; ++i) {
1713                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1714                         continue;
1715                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1716                 if (sign < 0)
1717                         v = isl_val_neg(v);
1718                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1719         }
1720
1721         for (i = 0; i < nvar; ++i) {
1722                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1723                         continue;
1724                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1725                 if (sign < 0)
1726                         v = isl_val_neg(v);
1727                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1728         }
1729
1730         bound->shift = aff;
1731 }
1732
1733 /* Given an equality constraint of a map with a single output dimension j,
1734  * check if the constraint is of the form
1735  *
1736  *              a(p,i) + j = g f(e)
1737  *
1738  * with a(p,i) an expression in the parameters and input dimensions
1739  * and f(e) an expression in the existentially quantified variables.
1740  * If so, and if g is larger than any such g from a previously considered
1741  * constraint, then call extract_stride to record the stride information
1742  * in bound.
1743  */
1744 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1745 {
1746         int i;
1747         isl_ctx *ctx;
1748         isl_val *v;
1749         unsigned n_div;
1750         struct gpu_array_bound *bound = user;
1751
1752         ctx = isl_constraint_get_ctx(c);
1753         n_div = isl_constraint_dim(c, isl_dim_div);
1754         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1755
1756         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1757                 int s = isl_val_sgn(v);
1758                 isl_val *stride = isl_val_zero(ctx);
1759
1760                 isl_val_free(v);
1761                 for (i = 0; i < n_div; ++i) {
1762                         v = isl_constraint_get_coefficient_val(c,
1763                                                                 isl_dim_div, i);
1764                         stride = isl_val_gcd(stride, v);
1765                 }
1766                 if (!isl_val_is_zero(stride) &&
1767                     isl_val_gt(stride, bound->stride))
1768                         extract_stride(c, bound, stride, s);
1769
1770                 isl_val_free(stride);
1771         } else
1772                 isl_val_free(v);
1773
1774         isl_constraint_free(c);
1775         return 0;
1776 }
1777
1778 /* Given contraints on an array index i, check if we can find
1779  * a shift a(p) and a stride g such that
1780  *
1781  *      a(p) + i = 0 mod g
1782  *
1783  * If so, record the information in bound and apply the mapping
1784  * i -> (i + a(p))/g to the array index in bounds and return
1785  * the new constraints.
1786  * If not, simply return the original constraints.
1787  *
1788  * If bounds is a subset of the space
1789  *
1790  *      D -> i
1791  *
1792  * then the bound recorded in bound->shift is of the form
1793  *
1794  *      D -> s(D)
1795  *
1796  * with s(D) equal to a(p) above.
1797  * The mapping recorded in bound->shift_map is of the form
1798  *
1799  *      [D -> i] -> [D -> (i + S(D))/g]
1800  *
1801  * This mapping is computed as follows.
1802  * We first introduce "i" in the domain through precomposition
1803  * with [D -> i] -> D obtaining
1804  *
1805  *      [D -> i] -> s(D)
1806  *
1807  * Adding [D -> i] -> i produces
1808  *
1809  *      [D -> i] -> i + s(D)
1810  *
1811  * and the domain product with [D -> i] -> D yields
1812  *
1813  *      [D -> i] -> [D -> i + s(D)]
1814  *
1815  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1816  */
1817 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1818         __isl_take isl_basic_map *bounds)
1819 {
1820         isl_space *space;
1821         isl_basic_map *hull;
1822         isl_basic_map *shift, *id, *bmap, *scale;
1823         isl_basic_set *bset;
1824         isl_aff *aff;
1825
1826         bound->stride = NULL;
1827
1828         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1829
1830         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1831
1832         isl_basic_map_free(hull);
1833
1834         if (!bound->stride)
1835                 return bounds;
1836
1837         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1838         space = isl_basic_map_get_space(bounds);
1839         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1840         shift = isl_basic_map_apply_range(bmap, shift);
1841         space = isl_basic_map_get_space(bounds);
1842         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1843         shift = isl_basic_map_sum(id, shift);
1844         space = isl_basic_map_get_space(bounds);
1845         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1846         shift = isl_basic_map_range_product(id, shift);
1847
1848         space = isl_space_domain(isl_basic_map_get_space(bounds));
1849         id = isl_basic_map_identity(isl_space_map_from_set(space));
1850         space = isl_space_range(isl_basic_map_get_space(bounds));
1851         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1852         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1853         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1854         scale = isl_basic_map_from_aff(aff);
1855         scale = isl_basic_map_product(id, scale);
1856
1857         bound->shift_map = isl_basic_map_apply_range(shift, scale);
1858         bmap = isl_basic_map_copy(bound->shift_map);
1859         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1860         bounds = isl_basic_set_unwrap(bset);
1861
1862         return bounds;
1863 }
1864
/* Data used in compute_array_dim_size and compute_size_in_direction.
 *
 * bset is the set whose constraints are inspected; compute_array_dim_size
 * derives it from the (wrapped and flattened) bounds on the array index.
 * bound is the gpu_array_bound in which the size and lower bound
 * that are found get stored.
 * pos is the position of the variable representing the array index,
 * i.e., the variable for which we want to compute the size.  This variable
 * is also the last variable in the set.
 */
struct gpu_size_info {
	isl_basic_set *bset;
	struct gpu_array_bound *bound;
	int pos;
};
1876
1877 /* Given a constraint from the basic set describing the bounds on
1878  * an array index, check if it is a lower bound, say m i >= b(x), and,
1879  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1880  * upper bound.  If so, and if this bound is smaller than any bound
1881  * derived from earlier constraints, set the size to this bound on
1882  * the expression and the lower bound to ceil(b(x)/m).
1883  */
1884 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1885 {
1886         struct gpu_size_info *size = user;
1887         unsigned nparam;
1888         unsigned n_div;
1889         isl_val *v;
1890         isl_aff *aff;
1891         isl_aff *lb;
1892
1893         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
1894         n_div = isl_constraint_dim(c, isl_dim_div);
1895
1896         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
1897             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
1898                 isl_constraint_free(c);
1899                 return 0;
1900         }
1901
1902         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
1903         aff = isl_aff_ceil(aff);
1904
1905         lb = isl_aff_copy(aff);
1906
1907         aff = isl_aff_neg(aff);
1908         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
1909
1910         v = isl_basic_set_max_val(size->bset, aff);
1911         isl_aff_free(aff);
1912
1913         if (isl_val_is_int(v)) {
1914                 v = isl_val_add_ui(v, 1);
1915                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
1916                         isl_val_free(size->bound->size);
1917                         size->bound->size = isl_val_copy(v);
1918                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
1919                         isl_aff_free(size->bound->lb);
1920                         size->bound->lb = isl_aff_copy(lb);
1921                 }
1922         }
1923         isl_val_free(v);
1924         isl_aff_free(lb);
1925
1926         isl_constraint_free(c);
1927
1928         return 0;
1929 }
1930
1931 /* Given a basic map "bounds" that maps parameters and input dimensions
1932  * to a single output dimension, look for an expression in the parameters
1933  * and input dimensions such that the range of the output dimension shifted
1934  * by this expression is a constant.
1935  *
1936  * In particular, we currently only consider lower bounds on the output
1937  * dimension as candidate expressions.
1938  */
1939 static int compute_array_dim_size(struct gpu_array_bound *bound,
1940         __isl_take isl_basic_map *bounds)
1941 {
1942         struct gpu_size_info size;
1943
1944         bounds = isl_basic_map_detect_equalities(bounds);
1945         bounds = check_stride(bound, bounds);
1946
1947         bound->size = NULL;
1948         bound->lb = NULL;
1949
1950         size.bound = bound;
1951         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
1952         size.bset = isl_basic_map_wrap(bounds);
1953         size.bset = isl_basic_set_flatten(size.bset);
1954         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
1955         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
1956                                         &size);
1957         isl_basic_set_free(size.bset);
1958
1959         return bound->size ? 0 : -1;
1960 }
1961
1962 /* Check if we can find a memory tile for the given array
1963  * based on the given accesses, and if so, put the results in "tile".
1964  *
1965  * We project the accesses on each index in turn and look for a parametric
1966  * offset such that the size is constant.
1967  */
1968 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
1969 {
1970         int i;
1971
1972         for (i = 0; i < tile->n; ++i) {
1973                 isl_map *access_i;
1974                 isl_basic_map *hull;
1975
1976                 access_i = isl_map_copy(access);
1977                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
1978                 access_i = isl_map_project_out(access_i, isl_dim_out,
1979                                             1, tile->n - (i + 1));
1980                 access_i = isl_map_compute_divs(access_i);
1981                 hull = isl_map_simple_hull(access_i);
1982                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
1983                         return 0;
1984         }
1985
1986         return 1;
1987 }
1988
1989 /* Construct a map with input the shared tile loops and the loops that
1990  * will be wrapped around the threads that relates these later loops
1991  * to the thread indices and then projects them out.
1992  */
1993 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1994 {
1995         isl_map *priv;
1996         isl_map *tiling;
1997         isl_map *proj;
1998         isl_set *par;
1999         isl_space *dim;
2000
2001         dim = isl_union_map_get_space(gen->shared_sched);
2002
2003         if (gen->options->wrap)
2004                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2005                                 gen->shared_len, gen->n_block, gen->block_dim);
2006         else
2007                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2008                                 gen->shared_len, gen->n_block, gen->block_dim);
2009
2010         priv = tiling;
2011
2012         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2013                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2014                 gen->n_block, "t");
2015
2016         priv = isl_map_align_params(priv, isl_set_get_space(par));
2017         priv = isl_map_intersect_range(priv, par);
2018
2019         dim = isl_map_get_space(priv);
2020         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2021         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2022         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2023                           gen->shared_len);
2024
2025         priv = isl_map_apply_range(priv, proj);
2026
2027         return priv;
2028 }
2029
2030 /* Construct a map from domain_dim to domain_dim that increments
2031  * the dimension at position "pos" and leaves all other dimensions
2032  * constant.
2033  */
2034 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2035 {
2036         int i;
2037         int len = isl_space_dim(domain_dim, isl_dim_set);
2038         isl_space *dim;
2039         isl_basic_map *next;
2040         isl_local_space *ls;
2041
2042         dim = isl_space_map_from_set(domain_dim);
2043         next = isl_basic_map_universe(isl_space_copy(dim));
2044         ls = isl_local_space_from_space(dim);
2045
2046         for (i = 0; i < len; ++i) {
2047                 isl_constraint *c;
2048
2049                 c = isl_equality_alloc(isl_local_space_copy(ls));
2050                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2051                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2052                 if (i == pos)
2053                         c = isl_constraint_set_constant_si(c, 1);
2054                 next = isl_basic_map_add_constraint(next, c);
2055         }
2056
2057         isl_local_space_free(ls);
2058
2059         return isl_map_from_basic_map(next);
2060 }
2061
2062 /* Check if the given access is coalesced.
2063  * That is, check whether incrementing the dimension that will get
2064  * wrapped over the last thread index results in incrementing
2065  * the last array index.
2066  *
2067  * This function is only called for access relations without reuse.
2068  */
2069 static int access_is_coalesced(struct gpu_gen *gen,
2070         __isl_keep isl_union_map *access)
2071 {
2072         isl_space *dim;
2073         isl_map *access_map;
2074         isl_map *next_thread_x;
2075         isl_map *next_element;
2076         isl_map *map;
2077         int coalesced;
2078
2079         access = isl_union_map_copy(access);
2080         access = isl_union_map_apply_domain(access,
2081                                 isl_union_map_copy(gen->tiled_sched));
2082         access_map = isl_map_from_union_map(access);
2083
2084         dim = isl_map_get_space(access_map);
2085         dim = isl_space_domain(dim);
2086         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2087
2088         dim = isl_map_get_space(access_map);
2089         dim = isl_space_range(dim);
2090         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
2091
2092         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2093         map = isl_map_apply_range(map, access_map);
2094
2095         coalesced = isl_map_is_subset(map, next_element);
2096
2097         isl_map_free(next_element);
2098         isl_map_free(map);
2099
2100         return coalesced;
2101 }
2102
2103 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2104  * dimensions of the computed schedule, check if it is bijective for
2105  * fixed values of the first gen->shared_len dimensions.
2106  * We perform this check by equating these dimensions to parameters.
2107  */
2108 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2109 {
2110         int res;
2111         isl_set *par;
2112         isl_space *space;
2113
2114         access = isl_map_copy(access);
2115         space = isl_space_params(isl_map_get_space(access));
2116         par = parametrization(space, gen->shared_len + gen->n_block,
2117                                 0, gen->shared_len, "s");
2118         access = isl_map_intersect_domain(access, par);
2119         res = isl_map_is_bijective(access);
2120         isl_map_free(access);
2121
2122         return res;
2123 }
2124
2125 /* Look for the last shared tile loop that affects the offset of "tile"
2126  * and return the result.
2127  * If there is no such loop, then return the index of the loop
2128  * before the first shared tile loop, in particular gen->tile_first - 1.
2129  */
2130 static int compute_tile_last_shared(struct gpu_gen *gen,
2131         struct gpu_array_tile *tile)
2132 {
2133         int i, j;
2134
2135         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2136                 for (i = 0; i < tile->n; ++i) {
2137                         isl_aff *lb;
2138                         isl_aff *shift;
2139
2140                         lb = tile->bound[i].lb;
2141                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2142                                 break;
2143
2144                         shift = tile->bound[i].shift;
2145                         if (!shift)
2146                                 continue;
2147                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2148                                 break;
2149                 }
2150                 if (i < tile->n)
2151                         break;
2152         }
2153
2154         return j;
2155 }
2156
2157 /* Look for the last shared tile loop that affects the offset of the
2158  * shared or private tile and store the result in group->last_shared.
2159  * If there is no such loop, then group->last_shared is set to a value
2160  * before the first shared tile loop, in particular gen->tile_first - 1.
2161  * If there is no tile defined on the array reference group,
2162  * then set group->last_shared to gen->shared_len - 1.
2163  */
2164 static void set_last_shared(struct gpu_gen *gen,
2165         struct gpu_array_ref_group *group)
2166 {
2167         struct gpu_array_tile *tile;
2168
2169         group->last_shared = gen->shared_len - 1;
2170
2171         tile = group->private_tile;
2172         if (!tile)
2173                 tile = group->shared_tile;
2174         if (!tile)
2175                 return;
2176
2177         group->last_shared = compute_tile_last_shared(gen, tile);
2178 }
2179
2180 /* Compute a privatized copy of all access relations from reference groups that
2181  * are mapped to private memory and store the result in gen->privatization.
2182  *
2183  * Read-only scalars and arrays containing structures are not mapped
2184  * to private memory.
2185  */
2186 static void compute_private_access(struct gpu_gen *gen)
2187 {
2188         int i, j;
2189         isl_union_map *private;
2190
2191         if (!gen->options->use_private_memory)
2192                 return;
2193
2194         private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
2195
2196         for (i = 0; i < gen->prog->n_array; ++i) {
2197                 struct gpu_array_info *array = &gen->prog->array[i];
2198
2199                 if (gpu_array_is_read_only_scalar(array))
2200                         continue;
2201                 if (array->has_compound_element)
2202                         continue;
2203
2204                 for (j = 0; j < array->n_group; ++j) {
2205                         if (!array->groups[j]->private_tile)
2206                                 continue;
2207
2208                         private = isl_union_map_union(private,
2209                                 group_access_relation(array->groups[j], 1, 1));
2210                 }
2211         }
2212
2213         if (isl_union_map_is_empty(private))
2214                 isl_union_map_free(private);
2215         else {
2216                 isl_union_map *priv;
2217
2218                 private = isl_union_map_apply_domain(private,
2219                                         isl_union_map_copy(gen->shared_sched));
2220                 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2221                 private = isl_union_map_apply_domain(private, priv);
2222                 gen->private_access = private;
2223         }
2224 }
2225
2226 /* Compute the size of the tile specified by "tile"
2227  * in number of elements and return the result.
2228  */
2229 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2230 {
2231         int i;
2232         isl_val *size;
2233
2234         size = isl_val_one(ctx);
2235
2236         for (i = 0; i < tile->n; ++i)
2237                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2238
2239         return size;
2240 }
2241
2242 /* If max_shared_memory is not set to infinity (-1), then make
2243  * sure that the total amount of shared memory required by the
2244  * array reference groups mapped to shared memory is no larger
2245  * than this maximum.
2246  *
2247  * We apply a greedy approach and discard (keep in global memory)
2248  * those groups that would result in a total memory size that
2249  * is larger than the maximum.
2250  */
2251 static void check_shared_memory_bound(struct gpu_gen *gen)
2252 {
2253         int i, j;
2254         isl_val *left, *size;
2255
2256         if (gen->options->max_shared_memory < 0)
2257                 return;
2258
2259         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2260
2261         for (i = 0; i < gen->prog->n_array; ++i) {
2262                 struct gpu_array_info *array = &gen->prog->array[i];
2263
2264                 for (j = 0; j < array->n_group; ++j) {
2265                         struct gpu_array_ref_group *group;
2266
2267                         group = array->groups[j];
2268                         if (group->private_tile)
2269                                 continue;
2270                         if (!group->shared_tile)
2271                                 continue;
2272
2273                         size = tile_size(gen->ctx, group->shared_tile);
2274                         size = isl_val_mul_ui(size, array->size);
2275
2276                         if (isl_val_le(size, left)) {
2277                                 left = isl_val_sub(left, size);
2278                                 continue;
2279                         }
2280                         isl_val_free(size);
2281
2282                         group->shared_tile = free_tile(group->shared_tile);
2283                 }
2284         }
2285
2286         isl_val_free(left);
2287 }
2288
2289 /* Given a description of an array tile "tile" and the "space"
2290  *
2291  *      { D -> A }
2292  *
2293  * where D represents the first shared_len schedule dimensions
2294  * and A represents the array, construct an isl_multi_aff
2295  *
2296  *      { [D[i] -> A[a]] -> A'[a'] }
2297  *
2298  * with A' a scaled down copy of A according to the shifts and strides
2299  * in "tile".  In particular,
2300  *
2301  *      a' = (a + shift(i))/stride
2302  *
2303  * "insert_array" represents
2304  *
2305  *      { [D -> A] -> D }
2306  *
2307  * and is used to insert A into the domain of functions that only
2308  * reference D.
2309  */
2310 static __isl_give isl_multi_aff *strided_tile(
2311         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2312         __isl_keep isl_multi_aff *insert_array)
2313 {
2314         int i;
2315         isl_ctx *ctx;
2316         isl_multi_aff *shift;
2317         isl_multi_val *stride;
2318         isl_space *space2;
2319         isl_local_space *ls;
2320         isl_multi_aff *tiling;
2321
2322         ctx = isl_space_get_ctx(space);
2323         space2 = isl_space_domain(isl_space_copy(space));
2324         ls = isl_local_space_from_space(space2);
2325         space2 = isl_space_range(isl_space_copy(space));
2326         stride = isl_multi_val_zero(space2);
2327         shift = isl_multi_aff_zero(isl_space_copy(space));
2328
2329         for (i = 0; i < tile->n; ++i) {
2330                 struct gpu_array_bound *bound = &tile->bound[i];
2331                 isl_val *stride_i;
2332                 isl_aff *shift_i;
2333
2334                 if (tile->bound[i].shift) {
2335                         stride_i = isl_val_copy(bound->stride);
2336                         shift_i = isl_aff_copy(bound->shift);
2337                 } else {
2338                         stride_i = isl_val_one(ctx);
2339                         shift_i = isl_aff_zero_on_domain(
2340                                         isl_local_space_copy(ls));
2341                 }
2342
2343                 stride = isl_multi_val_set_val(stride, i, stride_i);
2344                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2345         }
2346         isl_local_space_free(ls);
2347
2348         shift = isl_multi_aff_pullback_multi_aff(shift,
2349                                     isl_multi_aff_copy(insert_array));
2350
2351         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2352         tiling = isl_multi_aff_add(tiling, shift);
2353         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2354
2355         return tiling;
2356 }
2357
2358 /* Compute a tiling for the array reference group "group".
2359  *
2360  * The tiling is of the form
2361  *
2362  *      { [D[i] -> A[a]] -> T[t] }
2363  *
2364  * where D represents the first shared_len schedule dimensions,
2365  * A represents the global array and T represents the shared or
2366  * private memory tile.  The name of T is the name of the local
2367  * array.
2368  *
2369  * If there is any stride in the accesses, then the mapping is
2370  *
2371  *      t = (a + shift(i))/stride - lb(i)
2372  *
2373  * otherwise, it is simply
2374  *
2375  *      t = a - lb(i)
2376  */
2377 static void compute_group_tiling(struct gpu_array_ref_group *group)
2378 {
2379         int i;
2380         struct gpu_array_tile *tile;
2381         struct gpu_array_info *array = group->array;
2382         isl_space *space;
2383         isl_multi_aff *tiling, *lb, *insert_array;
2384         isl_printer *p;
2385         char *local_name;
2386
2387         tile = group->private_tile;
2388         if (!tile)
2389                 tile = group->shared_tile;
2390         if (!tile)
2391                 return;
2392
2393         space = isl_map_get_space(group->access);
2394         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2395
2396         for (i = 0; i < tile->n; ++i)
2397                 if (tile->bound[i].shift)
2398                         break;
2399
2400         if (i < tile->n)
2401                 tiling = strided_tile(tile, space, insert_array);
2402         else
2403                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2404
2405         lb = isl_multi_aff_zero(space);
2406         for (i = 0; i < tile->n; ++i) {
2407                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2408                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2409         }
2410         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2411
2412         tiling = isl_multi_aff_sub(tiling, lb);
2413
2414         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2415         p = print_array_name(p, group);
2416         local_name = isl_printer_get_str(p);
2417         isl_printer_free(p);
2418         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2419         free(local_name);
2420
2421         tile->tiling = tiling;
2422 }
2423
2424 /* Compute a tiling for all the array reference groups.
2425  */
2426 static void compute_group_tilings(struct gpu_gen *gen)
2427 {
2428         int i, j;
2429
2430         for (i = 0; i < gen->prog->n_array; ++i) {
2431                 struct gpu_array_info *array = &gen->prog->array[i];
2432
2433                 for (j = 0; j < array->n_group; ++j)
2434                         compute_group_tiling(array->groups[j]);
2435         }
2436 }
2437
2438 /* Fill up the groups array with singleton groups, i.e., one group
2439  * per reference, initializing the array, access, write, n_ref and refs fields.
2440  * In particular the access field is initialized to the scheduled
2441  * access relation of the array reference.
2442  *
2443  * Return the number of elements initialized, i.e., the number of
2444  * active references in the current kernel.
2445  */
2446 static int populate_array_references(struct gpu_array_info *array,
2447         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2448 {
2449         int i;
2450         int n;
2451         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2452
2453         n = 0;
2454         for (i = 0; i < array->n_ref; ++i) {
2455                 isl_union_map *umap;
2456                 isl_map *map;
2457                 struct gpu_array_ref_group *group;
2458                 struct gpu_stmt_access *access = array->refs[i];
2459
2460                 map = isl_map_copy(access->access);
2461                 umap = isl_union_map_from_map(map);
2462                 umap = isl_union_map_apply_domain(umap,
2463                                 isl_union_map_copy(sched));
2464
2465                 if (isl_union_map_is_empty(umap)) {
2466                         isl_union_map_free(umap);
2467                         continue;
2468                 }
2469
2470                 map = isl_map_from_union_map(umap);
2471                 map = isl_map_detect_equalities(map);
2472
2473                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2474                 assert(group);
2475                 group->array = array;
2476                 group->access = map;
2477                 group->write = access->write;
2478                 group->exact_write = access->exact_write;
2479                 group->refs = &array->refs[i];
2480                 group->n_ref = 1;
2481
2482                 groups[n++] = group;
2483         }
2484
2485         return n;
2486 }
2487
2488 /* If group->n_ref == 1, then group->refs was set by
2489  * populate_array_references to point directly into
2490  * group->array->refs and should not be freed.
2491  * If group->n_ref > 1, then group->refs was set by join_groups
2492  * to point to a newly allocated array.
2493  */
2494 static void free_array_ref_group(struct gpu_array_ref_group *group)
2495 {
2496         if (!group)
2497                 return;
2498         free_tile(group->shared_tile);
2499         free_tile(group->private_tile);
2500         isl_map_free(group->access);
2501         if (group->n_ref > 1)
2502                 free(group->refs);
2503         free(group);
2504 }
2505
2506 /* Given a map where the input dimensions represent the tile loops,
2507  * eliminate the innermost of those that have a fixed value
2508  * until we reach one that does not (obviously) have a fixed value.
2509  */
2510 static __isl_give isl_map *eliminate_fixed_inner_loops(
2511         __isl_take isl_map *access)
2512 {
2513         int i, n;
2514
2515         n = isl_map_dim(access, isl_dim_in);
2516
2517         for (i = n - 1; i >= 0; --i) {
2518                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2519                         break;
2520                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2521         }
2522         return access;
2523 }
2524
2525 /* Check if the access relations of group1 and group2 overlap within
2526  * the innermost loop.  In particular, ignore any inner dimension
2527  * with a fixed value.
2528  * The copying to and from shared memory will be performed within
2529  * the innermost actual loop so we are only allowed to consider
2530  * the dimensions up to that innermost loop while checking whether
2531  * two access relations overlap.
2532  */
2533 static int accesses_overlap(struct gpu_array_ref_group *group1,
2534         struct gpu_array_ref_group *group2)
2535 {
2536         int empty;
2537         isl_map *access1, *access2;
2538
2539         access1 = isl_map_copy(group1->access);
2540         access1 = eliminate_fixed_inner_loops(access1);
2541         access2 = isl_map_copy(group2->access);
2542         access2 = eliminate_fixed_inner_loops(access2);
2543         access1 = isl_map_intersect(access1, access2);
2544         empty = isl_map_is_empty(access1);
2545         isl_map_free(access1);
2546
2547         return !empty;
2548 }
2549
2550 /* Combine the given two groups into a single group, containing
2551  * the references of both groups.
2552  */
2553 static struct gpu_array_ref_group *join_groups(
2554         struct gpu_array_ref_group *group1,
2555         struct gpu_array_ref_group *group2)
2556 {
2557         int i;
2558         isl_ctx *ctx;
2559         struct gpu_array_ref_group *group;
2560
2561         ctx = isl_map_get_ctx(group1->access);
2562         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2563         assert(group);
2564         group->array = group1->array;
2565         group->access = isl_map_union(isl_map_copy(group1->access),
2566                                         isl_map_copy(group2->access));
2567         group->write = group1->write || group2->write;
2568         group->exact_write = group1->exact_write && group2->exact_write;
2569         group->n_ref = group1->n_ref + group2->n_ref;
2570         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2571                                         group->n_ref);
2572         assert(group->refs);
2573         for (i = 0; i < group1->n_ref; ++i)
2574                 group->refs[i] = group1->refs[i];
2575         for (i = 0; i < group2->n_ref; ++i)
2576                 group->refs[group1->n_ref + i] = group2->refs[i];
2577
2578         return group;
2579 }
2580
/* Merge "group1" and "group2" into a single new group,
 * free both input groups and return the merged group.
 */
static struct gpu_array_ref_group *join_groups_and_free(
	struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	struct gpu_array_ref_group *merged = join_groups(group1, group2);

	free_array_ref_group(group1);
	free_array_ref_group(group2);

	return merged;
}
2595
2596 /* Compute the private and/or shared memory tiles for the array
2597  * reference group "group" of array "array".
2598  * Return 0 on success and -1 on error.
2599  *
2600  * If the array is a read-only scalar or if the user requested
2601  * not to use shared or private memory, then we do not need to do anything.
2602  *
2603  * If the array group involves any may writes (that are not must writes),
2604  * then we would have to make sure that we load the data into shared/private
2605  * memory first in case the data is not written by the kernel
2606  * (but still written back out to global memory).
2607  * Since we don't have any such mechanism at the moment, we don't
2608  * compute shared/private tiles for groups involving may writes.
2609  *
2610  * We only try to compute a shared memory tile if there is any reuse
2611  * or if the access is not coalesced.
2612  *
2613  * For computing a private memory tile, we also require that there is
2614  * some reuse.  Moreover, we require that the access is private
2615  * to the thread.  That is, we check that any given array element
2616  * is only accessed by a single thread.
2617  * We compute an access relation that maps the shared tile loop iterators
2618  * and the shared point loop iterators that will be wrapped over the
2619  * threads to the array elements.
2620  * We actually check that those iterators that will be wrapped
2621  * partition the array space.  This check is stricter than necessary
2622  * since several iterations may be mapped onto the same thread
2623  * and then they could be allowed to access the same memory elements,
2624  * but our check does not allow this situation.
2625  *
2626  * We also check that the index expression only depends on parallel
2627  * loops.  That way, we can move those loops innermost and unroll them.
2628  * Again, we use a test that is stricter than necessary.
2629  * We actually check whether the index expression only depends
2630  * on the iterators that are wrapped over the threads.
2631  * These are necessarily parallel, but there may be more parallel loops.
2632  *
2633  * Combining the injectivity of the first test with the single-valuedness
2634  * of the second test, we simply test for bijectivity.
2635  *
2636  * If it turns out we can use registers, we compute the private memory
2637  * tile size using can_tile, after introducing a dependence
2638  * on the thread indices.
2639  */
2640 static int compute_group_bounds_core(struct gpu_gen *gen,
2641         struct gpu_array_ref_group *group)
2642 {
2643         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2644         isl_union_map *access;
2645         int n_index = group->array->n_index;
2646         int no_reuse;
2647         isl_map *acc;
2648         int use_shared = gen->options->use_shared_memory;
2649         int use_private = gen->options->use_private_memory;
2650
2651         if (!use_shared && !use_private)
2652                 return 0;
2653         if (gpu_array_is_read_only_scalar(group->array))
2654                 return 0;
2655         if (!group->exact_write)
2656                 return 0;
2657
2658         access = group_access_relation(group, 1, 1);
2659         no_reuse = isl_union_map_is_injective(access);
2660
2661         if (use_shared && (!no_reuse || !access_is_coalesced(gen, access))) {
2662                 group->shared_tile = create_tile(ctx, group->array->n_index);
2663                 if (!can_tile(group->access, group->shared_tile))
2664                         group->shared_tile = free_tile(group->shared_tile);
2665         }
2666
2667         if (!use_private || no_reuse) {
2668                 isl_union_map_free(access);
2669                 return 0;
2670         }
2671
2672         access = isl_union_map_apply_domain(access,
2673                                         isl_union_map_copy(gen->shared_sched));
2674
2675         acc = isl_map_from_union_map(access);
2676
2677         if (!access_is_bijective(gen, acc)) {
2678                 isl_map_free(acc);
2679                 return 0;
2680         }
2681
2682         group->private_tile = create_tile(gen->ctx, n_index);
2683         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2684         if (!can_tile(acc, group->private_tile))
2685                 group->private_tile = free_tile(group->private_tile);
2686
2687         isl_map_free(acc);
2688
2689         return 0;
2690 }
2691
/* Compute the private and/or shared memory tiles of the array
 * reference group "group" and, if that succeeds, call set_last_shared
 * on the group.
 * Return 0 on success and -1 on error.
 */
static int compute_group_bounds(struct gpu_gen *gen,
	struct gpu_array_ref_group *group)
{
	int status = compute_group_bounds_core(gen, group);

	if (status < 0)
		return -1;

	set_last_shared(gen, group);
	return 0;
}
2705
2706 /* If two groups have overlapping access relations (as determined by
2707  * the "overlap" function) and if one of them involves a write,
2708  * then merge the two groups into one.
2709  * If "compute_bounds" is set, then call compute_group_bounds
2710  * on the merged groups.
2711  *
2712  * Return the updated number of groups.
2713  * Return -1 on error.
2714  */
2715 static int group_writes(struct gpu_gen *gen,
2716         int n, struct gpu_array_ref_group **groups,
2717         int (*overlap)(struct gpu_array_ref_group *group1,
2718                 struct gpu_array_ref_group *group2), int compute_bounds)
2719 {
2720         int i, j;
2721
2722         for (i = 0; i < n; ++i) {
2723                 for (j = n - 1; j > i; --j) {
2724                         if (!groups[i]->write && !groups[j]->write)
2725                                 continue;
2726
2727                         if (!overlap(groups[i], groups[j]))
2728                                 continue;
2729
2730                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2731                         if (compute_bounds &&
2732                             compute_group_bounds(gen, groups[i]) < 0)
2733                                 return -1;
2734                         if (j != n - 1)
2735                                 groups[j] = groups[n - 1];
2736                         groups[n - 1] = NULL;
2737                         n--;
2738                 }
2739         }
2740
2741         return n;
2742 }
2743
2744 /* If two groups have overlapping access relations (within the innermost
2745  * loop) and if one of them involves a write, then merge the two groups
2746  * into one.
2747  *
2748  * Return the updated number of groups.
2749  */
2750 static int group_overlapping_writes(struct gpu_gen *gen,
2751         int n, struct gpu_array_ref_group **groups)
2752 {
2753         return group_writes(gen, n, groups, &accesses_overlap, 0);
2754 }
2755
2756 /* Check if the access relations of group1 and group2 overlap within
2757  * the outermost min(group1->last_shared, group2->last_shared) loops.
2758  */
2759 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2760         struct gpu_array_ref_group *group2)
2761 {
2762         int last_shared;
2763         int dim;
2764         int empty;
2765         isl_map *map_i, *map_j, *map;
2766
2767         last_shared = group1->last_shared;
2768         if (group2->last_shared < last_shared)
2769                 last_shared = group2->last_shared;
2770         map_i = isl_map_copy(group1->access);
2771         dim = isl_map_dim(map_i, isl_dim_in);
2772         map_i = isl_map_eliminate(map_i, isl_dim_in,
2773                                 last_shared + 1, dim - (last_shared + 1));
2774         map_j = isl_map_copy(group2->access);
2775         map_j = isl_map_eliminate(map_j, isl_dim_in,
2776                                 last_shared + 1, dim - (last_shared + 1));
2777         map = isl_map_intersect(map_i, map_j);
2778         empty = isl_map_is_empty(map);
2779         isl_map_free(map);
2780
2781         return !empty;
2782 }
2783
2784 /* If two groups have overlapping access relations (within the outer
2785  * last_shared loops) and if one of them involves a write,
2786  * then merge the two groups into one.
2787  *
2788  * Return the updated number of groups.
2789  */
2790 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2791         struct gpu_array_ref_group **groups)
2792 {
2793         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2794 }
2795
2796 /* Is the size of the tile specified by "tile" smaller than the sum of
2797  * the sizes of the tiles specified by "tile1" and "tile2"?
2798  */
2799 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2800         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2801 {
2802         int smaller;
2803         isl_val *size, *size1, *size2;
2804
2805         size = tile_size(ctx, tile);
2806         size1 = tile_size(ctx, tile1);
2807         size2 = tile_size(ctx, tile2);
2808
2809         size = isl_val_sub(size, size1);
2810         size = isl_val_sub(size, size2);
2811         smaller = isl_val_is_neg(size);
2812
2813         isl_val_free(size);
2814
2815         return smaller;
2816 }
2817
2818 /* Given an initial grouping of array references and shared memory tiles
2819  * for each group that allows for a shared memory tile, merge two groups
2820  * if both have a shared memory tile, the merged group also has
2821  * a shared memory tile and the size of the tile for the merge group
2822  * is smaller than the sum of the tile sizes of the individual groups.
2823  *
2824  * If merging two groups decreases the "last_shared" dimension of
2825  * one or both of the two groups, then we need to check for overlapping
2826  * writes again.
2827  *
2828  * Return the number of groups after merging.
2829  * Return -1 on error.
2830  */
2831 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2832         struct gpu_array_info *array, int n,
2833         struct gpu_array_ref_group **groups)
2834 {
2835         int i, j;
2836         int recompute_overlap = 0;
2837         isl_ctx *ctx = isl_space_get_ctx(array->space);
2838
2839         for (i = 0; i < n; ++i) {
2840                 if (!groups[i]->shared_tile)
2841                         continue;
2842                 for (j = n - 1; j > i; --j) {
2843                         isl_map *map;
2844                         int empty;
2845                         struct gpu_array_ref_group *group;
2846
2847                         if (!groups[j]->shared_tile)
2848                                 continue;
2849
2850                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
2851                                             isl_map_copy(groups[j]->access));
2852                         empty = isl_map_is_empty(map);
2853                         isl_map_free(map);
2854
2855                         if (empty)
2856                                 continue;
2857
2858                         group = join_groups(groups[i], groups[j]);
2859                         if (compute_group_bounds(gen, group) < 0) {
2860                                 free_array_ref_group(group);
2861                                 return -1;
2862                         }
2863                         if (!group->shared_tile ||
2864                             !smaller_tile(ctx, group->shared_tile,
2865                                         groups[i]->shared_tile,
2866                                         groups[j]->shared_tile)) {
2867                                 free_array_ref_group(group);
2868                                 continue;
2869                         }
2870
2871                         if (group->last_shared < groups[i]->last_shared ||
2872                             group->last_shared < groups[j]->last_shared)
2873                                 recompute_overlap = 1;
2874                         free_array_ref_group(groups[i]);
2875                         free_array_ref_group(groups[j]);
2876                         groups[i] = group;
2877                         if (j != n - 1)
2878                                 groups[j] = groups[n - 1];
2879                         n--;
2880                 }
2881         }
2882
2883         if (recompute_overlap)
2884                 n = group_last_shared_overlapping_writes(gen, n, groups);
2885         return n;
2886 }
2887
2888 /* Set array->n_group and array->groups to n and groups.
2889  *
2890  * Additionally, set the "nr" field of each group
2891  * and the "group" field of each reference in each group.
2892  */
2893 static void set_array_groups(struct gpu_array_info *array,
2894         int n, struct gpu_array_ref_group **groups)
2895 {
2896         int i, j;
2897
2898         array->n_group = n;
2899         array->groups = groups;
2900
2901         for (i = 0; i < n; ++i) {
2902                 groups[i]->nr = i;
2903
2904                 for (j = 0; j < groups[i]->n_ref; ++j)
2905                         groups[i]->refs[j]->group = i;
2906         }
2907 }
2908
2909 /* Group array references that should be considered together when
2910  * deciding whether to access them from private, shared or global memory.
2911  * Return -1 on error.
2912  *
2913  * In particular, if two array references overlap and if one of them
2914  * is a write, then the two references are grouped together.
2915  * We first perform an initial grouping based only on the access relation.
2916  * After computing shared and private memory tiles, we check for
2917  * overlapping writes again, but this time taking into account
2918  * the "last_shared" property.
2919  *
2920  * Furthermore, if two groups admit a shared memory tile and if the
2921  * combination of the two also admits a shared memory tile, we merge
2922  * the two groups.
2923  *
2924  * If the array contains structures, then there is no need to compute
2925  * reference groups since we do not map such arrays to private or shared
2926  * memory.
2927  */
2928 static int group_array_references(struct gpu_gen *gen,
2929         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
2930 {
2931         int i;
2932         int n;
2933         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2934         struct gpu_array_ref_group **groups;
2935
2936         if (array->has_compound_element)
2937                 return 0;
2938
2939         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
2940                                         array->n_ref);
2941         if (!groups)
2942                 return -1;
2943
2944         n = populate_array_references(array, sched, groups);
2945
2946         n = group_overlapping_writes(gen, n, groups);
2947
2948         for (i = 0; i < n; ++i)
2949                 if (compute_group_bounds(gen, groups[i]) < 0)
2950                         n = -1;
2951
2952         n = group_last_shared_overlapping_writes(gen, n, groups);
2953
2954         n = group_common_shared_memory_tile(gen, array, n, groups);
2955
2956         set_array_groups(array, n, groups);
2957
2958         if (n >= 0)
2959                 return 0;
2960
2961         for (i = 0; i < array->n_ref; ++i)
2962                 free_array_ref_group(groups[i]);
2963         return -1;
2964 }
2965
2966 /* Take tiled_sched, project it onto the shared tile loops and
2967  * the loops that will be wrapped over the threads and
2968  * store the result in gen->shared_sched.
2969  * Also compute a projection that projects out the loops that will be
2970  * wrapped over the threads and store this projection in gen->shared_proj.
2971  */
2972 static void compute_shared_sched(struct gpu_gen *gen)
2973 {
2974         isl_space *dim;
2975         isl_map *proj;
2976         isl_set *par;
2977         isl_union_map *sched;
2978
2979         sched = isl_union_map_copy(gen->tiled_sched);
2980
2981         dim = isl_union_map_get_space(sched);
2982         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
2983         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2984
2985         dim = isl_union_map_get_space(sched);
2986         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
2987
2988         gen->shared_sched = sched;
2989         gen->shared_proj = isl_union_map_from_map(proj);
2990 }
2991
2992 /* Group references of all arrays in the program.
2993  */
2994 static int group_references(struct gpu_gen *gen)
2995 {
2996         int i;
2997         int r = 0;
2998         isl_union_map *sched;
2999
3000         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
3001                                           isl_union_map_copy(gen->shared_proj));
3002
3003         for (i = 0; i < gen->prog->n_array; ++i) {
3004                 r = group_array_references(gen, &gen->prog->array[i], sched);
3005                 if (r < 0)
3006                         break;
3007         }
3008
3009         isl_union_map_free(sched);
3010
3011         return r;
3012 }
3013
3014 /* Free all array information that is local to the current kernel.
3015  */
3016 static void free_local_array_info(struct gpu_gen *gen)
3017 {
3018         int i, j;
3019
3020         for (i = 0; i < gen->prog->n_array; ++i) {
3021                 struct gpu_array_info *array = &gen->prog->array[i];
3022
3023                 for (j = 0; j < array->n_group; ++j)
3024                         free_array_ref_group(array->groups[j]);
3025                 free(array->groups);
3026         }
3027 }
3028
3029 /* Compute the size of a bounding box around the origin and "set",
3030  * where "set" is assumed to contain only non-negative elements.
3031  * In particular, compute the maximal value of "set" in each direction
3032  * and add one.
3033  */
3034 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
3035         __isl_keep isl_set *context)
3036 {
3037         int i, n;
3038         isl_multi_pw_aff *mpa;
3039
3040         n = isl_set_dim(set, isl_dim_set);
3041         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
3042         for (i = 0; i < n; ++i) {
3043                 isl_space *space;
3044                 isl_aff *one;
3045                 isl_pw_aff *bound;
3046
3047                 bound = isl_set_dim_max(isl_set_copy(set), i);
3048                 bound = isl_pw_aff_coalesce(bound);
3049                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3050
3051                 space = isl_pw_aff_get_domain_space(bound);
3052                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3053                 one = isl_aff_add_constant_si(one, 1);
3054                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3055                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
3056         }
3057         isl_set_free(set);
3058
3059         return mpa;
3060 }
3061
3062 /* Compute the effective grid size as a list of the sizes in each dimension.
3063  *
3064  * The grid size specified by the user or set by default
3065  * in read_grid_sizes() and applied in tile_schedule(),
3066  * may be too large for the given code in the sense that
3067  * it may contain blocks that don't need to execute anything.
3068  * We therefore don't return this grid size, but instead the
3069  * smallest grid size that ensures that all blocks that actually
3070  * execute code are included in the grid.
3071  *
3072  * We first extract a description of the grid, i.e., the possible values
3073  * of the block ids, from gen->tiled_sched.
3074  * The block ids are parameters in gen->tiled_sched.
3075  * We simply need to change them into set dimensions.
3076  *
3077  * Then, for each block dimension, we compute the maximal value of the block id
3078  * and add one.
3079  */
3080 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
3081         struct ppcg_kernel *kernel)
3082 {
3083         int i;
3084         isl_set *grid;
3085
3086         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3087         grid = isl_set_from_params(grid);
3088         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3089         for (i = 0; i < gen->n_grid; ++i) {
3090                 int pos;
3091                 char name[20];
3092
3093                 snprintf(name, sizeof(name), "b%d", i);
3094                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3095                 assert(pos >= 0);
3096                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3097                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3098         }
3099
3100         return extract_size(grid, kernel->context);
3101 }
3102
3103 /* Compute the size of a fixed bounding box around the origin and "set",
3104  * where "set" is assumed to contain only non-negative elements,
3105  * and store the results in "size".
3106  * In particular, compute the maximal value of "set" in each direction
3107  * and add one.
3108  */
3109 static void extract_fixed_size(__isl_take isl_set *set, int *size)
3110 {
3111         int i, n;
3112         isl_local_space *ls;
3113         isl_aff *obj;
3114
3115         n = isl_set_dim(set, isl_dim_set);
3116         ls = isl_local_space_from_space(isl_set_get_space(set));
3117         obj = isl_aff_zero_on_domain(ls);
3118         for (i = 0; i < n; ++i) {
3119                 isl_val *max;
3120
3121                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
3122                 max = isl_set_max_val(set, obj);
3123                 size[i] = isl_val_get_num_si(max) + 1;
3124                 isl_val_free(max);
3125                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
3126         }
3127         isl_aff_free(obj);
3128         isl_set_free(set);
3129 }
3130
3131 /* Compute the effective block size as a list of the sizes in each dimension
3132  * and store the sizes in kernel->block_dim.
3133  *
3134  * The block size specified by the user or set by default
3135  * in read_block_sizes() and applied in thread_tile_schedule(),
3136  * may be too large for the given code in the sense that
3137  * it may contain threads that don't need to execute anything.
3138  * We therefore don't store this block size in kernel->block_dim,
3139  * but instead the smallest block size that ensures that all threads
3140  * that actually execute code are included in the block.
3141  *
3142  * The current implementation eliminates all parameters, ensuring
3143  * that the size is a fixed constant in each dimension.
3144  * In principle we could also compute parametric sizes.
3145  * We would have to make sure to project out all b%d and t%d parameters,
3146  * however.
3147  */
3148 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3149 {
3150         int i;
3151         int nparam;
3152         isl_set *block;
3153         isl_multi_pw_aff *mpa;
3154
3155         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3156         block = isl_set_from_params(block);
3157         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3158         kernel->n_block = gen->n_block;
3159         for (i = 0; i < gen->n_block; ++i) {
3160                 int pos;
3161                 char name[20];
3162
3163                 snprintf(name, sizeof(name), "t%d", i);
3164                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3165                 assert(pos >= 0);
3166                 block = isl_set_equate(block, isl_dim_param, pos,
3167                                         isl_dim_set, i);
3168         }
3169         nparam = isl_set_dim(block, isl_dim_param);
3170         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3171
3172         extract_fixed_size(block, kernel->block_dim);
3173 }
3174
3175 void ppcg_kernel_free(void *user)
3176 {
3177         struct ppcg_kernel *kernel = user;
3178         int i;
3179
3180         if (!kernel)
3181                 return;
3182
3183         isl_multi_pw_aff_free(kernel->grid_size);
3184         isl_set_free(kernel->context);
3185         isl_union_set_free(kernel->arrays);
3186         isl_space_free(kernel->space);
3187         isl_ast_node_free(kernel->tree);
3188
3189         for (i = 0; i < kernel->n_array; ++i)
3190                 isl_pw_aff_list_free(kernel->array[i].bound);
3191         free(kernel->array);
3192
3193         for (i = 0; i < kernel->n_var; ++i) {
3194                 free(kernel->var[i].name);
3195                 isl_vec_free(kernel->var[i].size);
3196         }
3197         free(kernel->var);
3198
3199         free(kernel);
3200 }
3201
3202 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3203         struct ppcg_kernel_var *var)
3204 {
3205         int j;
3206         struct gpu_array_tile *tile;
3207         isl_printer *p;
3208         char *name;
3209
3210         var->array = group->array;
3211
3212         tile = group->private_tile;
3213         var->type = ppcg_access_private;
3214         if (!tile) {
3215                 tile = group->shared_tile;
3216                 var->type = ppcg_access_shared;
3217         }
3218
3219         p = isl_printer_to_str(ctx);
3220         p = print_array_name(p, group);
3221         var->name = isl_printer_get_str(p);
3222         isl_printer_free(p);
3223
3224         var->size = isl_vec_alloc(ctx, group->array->n_index);
3225
3226         for (j = 0; j < group->array->n_index; ++j)
3227                 var->size = isl_vec_set_element_val(var->size, j,
3228                                             isl_val_copy(tile->bound[j].size));
3229 }
3230
3231 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3232 {
3233         int i, j, n;
3234
3235         n = 0;
3236         for (i = 0; i < gen->prog->n_array; ++i) {
3237                 struct gpu_array_info *array = &gen->prog->array[i];
3238
3239                 for (j = 0; j < array->n_group; ++j) {
3240                         struct gpu_array_ref_group *group = array->groups[j];
3241                         if (group->private_tile || group->shared_tile)
3242                                 ++n;
3243                 }
3244         }
3245
3246         kernel->n_var = n;
3247         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3248         assert(kernel->var);
3249
3250         n = 0;
3251         for (i = 0; i < gen->prog->n_array; ++i) {
3252                 struct gpu_array_info *array = &gen->prog->array[i];
3253
3254                 for (j = 0; j < array->n_group; ++j) {
3255                         struct gpu_array_ref_group *group = array->groups[j];
3256                         if (!group->private_tile && !group->shared_tile)
3257                                 continue;
3258                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3259                         ++n;
3260                 }
3261         }
3262 }
3263
3264 /* The sizes of the arrays on the host that have been computed by
3265  * extract_array_info may depend on the parameters.  Use the extra
3266  * constraints on the parameters that are valid at "host_domain"
3267  * to simplify these expressions and store the results in kernel->array.
3268  *
3269  * We only need these localized bounds for arrays that are accessed
3270  * by the current kernel.  If we have found at least one reference group
3271  * then the array is accessed by the kernel.  If the array has compound
3272  * elements then we skipped the construction of array reference groups.
3273  */
3274 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3275         __isl_keep isl_set *host_domain)
3276 {
3277         int i, j;
3278         isl_set *context;
3279
3280         kernel->array = isl_calloc_array(gen->ctx,
3281                             struct gpu_local_array_info, gen->prog->n_array);
3282         assert(kernel->array);
3283         kernel->n_array = gen->prog->n_array;
3284
3285         context = isl_set_copy(host_domain);
3286         context = isl_set_params(context);
3287
3288         for (i = 0; i < gen->prog->n_array; ++i) {
3289                 struct gpu_array_info *array = &gen->prog->array[i];
3290                 isl_pw_aff_list *local;
3291
3292                 if (array->n_group == 0 && !array->has_compound_element)
3293                         continue;
3294
3295                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3296
3297                 for (j = 0; j < array->n_index; ++j) {
3298                         isl_pw_aff *pwaff;
3299
3300                         pwaff = isl_pw_aff_copy(array->bound[j]);
3301                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3302                         local = isl_pw_aff_list_add(local, pwaff);
3303                 }
3304
3305                 kernel->array[i].bound = local;
3306         }
3307         isl_set_free(context);
3308 }
3309
3310 /* Find the element in gen->stmt that has the given "id".
3311  * Return NULL if no such gpu_stmt can be found.
3312  */
3313 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3314 {
3315         int i;
3316
3317         for (i = 0; i < prog->n_stmts; ++i) {
3318                 if (id == prog->stmts[i].id)
3319                         break;
3320         }
3321
3322         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3323 }
3324
3325 /* Set gen->tile_len and gen->n_parallel to those of the statement
3326  * affected by the first map (part of the schedule)
3327  * on which this function is called.
3328  * Because of the way the schedule is constructed, the other statements
3329  * in the list, if any, should have the same values for these properties.
3330  */
3331 static int extract_tile_len(__isl_take isl_map *map, void *user)
3332 {
3333         struct gpu_gen *gen = (struct gpu_gen *) user;
3334         isl_id *id;
3335         struct gpu_stmt *stmt;
3336
3337         id = isl_map_get_tuple_id(map, isl_dim_in);
3338         stmt = find_stmt(gen->prog, id);
3339         isl_id_free(id);
3340
3341         isl_map_free(map);
3342
3343         if (!stmt)
3344                 isl_die(gen->ctx, isl_error_unknown,
3345                         "statement not found", return -1);
3346
3347         gen->tile_len = stmt->tile_len;
3348         gen->n_parallel = stmt->n_parallel;
3349
3350         return -1;
3351 }
3352
3353 void ppcg_kernel_stmt_free(void *user)
3354 {
3355         int i;
3356         struct ppcg_kernel_stmt *stmt = user;
3357
3358         if (!stmt)
3359                 return;
3360
3361         switch (stmt->type) {
3362         case ppcg_kernel_copy:
3363                 isl_ast_expr_free(stmt->u.c.index);
3364                 isl_ast_expr_free(stmt->u.c.local_index);
3365                 break;
3366         case ppcg_kernel_domain:
3367                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3368                 break;
3369         case ppcg_kernel_sync:
3370                 break;
3371         }
3372
3373         free(stmt);
3374 }
3375
3376 /* Set the options of "context" to
3377  *
3378  *      { space -> [x] : x >= first }
3379  */
3380 static __isl_give isl_ast_build *set_unroll(
3381         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3382         int first)
3383 {
3384         isl_ctx *ctx;
3385         isl_map *unroll;
3386         isl_union_map *opt;
3387
3388         ctx = isl_ast_build_get_ctx(build);
3389
3390         space = isl_space_from_domain(space);
3391         space = isl_space_add_dims(space, isl_dim_out, 1);
3392         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3393         unroll = isl_map_universe(space);
3394         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3395         opt = isl_union_map_from_map(unroll);
3396
3397         build = isl_ast_build_set_options(build, opt);
3398
3399         return build;
3400 }
3401
3402 /* Return a list of isl_ids of the form "prefix%d".
3403  */
3404 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3405         int n, const char *prefix)
3406 {
3407         int i;
3408         char name[10];
3409         isl_id_list *names;
3410
3411         names = isl_id_list_alloc(ctx, n);
3412         for (i = 0; i < n; ++i) {
3413                 isl_id *id;
3414
3415                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3416                 id = isl_id_alloc(ctx, name, NULL);
3417                 names = isl_id_list_add(names, id);
3418         }
3419
3420         return names;
3421 }
3422
3423 /* Extend the schedule "schedule" with the part of "extension"
3424  * starting at "first" up to "len".
3425  */
3426 static __isl_give isl_union_map *extend_schedule(
3427         __isl_take isl_union_map *schedule,
3428         __isl_take isl_union_map *extension, int first, int len)
3429 {
3430         isl_space *space;
3431         isl_map *proj;
3432         isl_union_map *umap;
3433         isl_set *set;
3434
3435         space = isl_union_map_get_space(schedule);
3436         space = isl_space_set_from_params(space);
3437         space = isl_space_add_dims(space, isl_dim_set, len);
3438         proj = isl_set_identity(isl_set_universe(space));
3439         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3440         extension = isl_union_map_apply_range(extension,
3441                                                 isl_union_map_from_map(proj));
3442
3443         schedule = isl_union_map_range_product(schedule, extension);
3444
3445         return schedule;
3446 }
3447
3448 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3449  * to "ref_id".
3450  */
3451 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3452         __isl_keep isl_id *ref_id)
3453 {
3454         struct gpu_stmt_access *access;
3455
3456         for (access = accesses; access; access = access->next)
3457                 if (access->ref_id == ref_id)
3458                         return access;
3459
3460         return NULL;
3461 }
3462
3463 /* Return the index of the array called "name" in the list of arrays.
3464  */
3465 static int find_array_index(struct gpu_gen *gen, const char *name)
3466 {
3467         int i;
3468
3469         for (i = 0; i < gen->prog->n_array; ++i)
3470                 if (!strcmp(name, gen->prog->array[i].name))
3471                         return i;
3472
3473         return -1;
3474 }
3475
3476 /* Internal data structure for the index and AST expression transformation
3477  * callbacks for pet_stmt_build_ast_exprs.
3478  *
3479  * "accesses" is the list of gpu_stmt_access in the statement.
3480  * "iterator_map" expresses the statement iterators in terms of
3481  * the AST loop iterators.
3482  * "sched2shared" expresses the first shared_len dimensions of
3483  * the computed schedule in terms of the AST loop iterators.
3484  *
3485  * The following fields are set in transform_index and used in transform_expr.
3486  * "array" is the array that is being accessed.
3487  * "global" is set if the global array is accessed (rather than
3488  * shared/private memory).
3489  * "local_array" refers to information on the array specialized
3490  * to the current kernel.
3491  */
3492 struct ppcg_transform_data {
3493         struct gpu_gen *gen;	/* transform_index looks up the accessed array in gen->prog->array and gen->kernel->array */
3494         struct gpu_stmt_access *accesses;
3495         isl_pw_multi_aff *iterator_map;
3496         isl_pw_multi_aff *sched2shared;
3497
3498         struct gpu_array_info *array;	/* reset to NULL on entry to transform_index; transform_expr does nothing while it is NULL */
3499         int global;
3500         struct gpu_local_array_info *local_array;
3501 };
3502
3503 /* Return the name of the outer array (of structs) accessed by "access".
3504  */
3505 static const char *get_outer_array_name(__isl_keep isl_map *access)
3506 {
3507         isl_space *space;
3508         const char *name;
3509
3510         space = isl_space_range(isl_map_get_space(access));
3511         while (space && isl_space_is_wrapping(space))
3512                 space = isl_space_domain(isl_space_unwrap(space));
3513         name = isl_space_get_tuple_name(space, isl_dim_set);
3514         isl_space_free(space);
3515
3516         return name;
3517 }
3518
3519 /* Index transformation callback for pet_stmt_build_ast_exprs.
3520  *
3521  * "index" expresses the array indices in terms of statement iterators
3522  *
3523  * We first reformulate "index" in terms of the AST loop iterators.
3524  * Then we check if we are accessing the global array or
3525  * a shared/private copy.  In the former case, we simply return
3526  * the updated index.  If "index" is an affine expression rather
3527  * than an array access, then we also return the updated index here.
3528  *
3529  * If no reference groups have been computed for the array,
3530  * then we can only be accessing the global array.
3531  *
3532  * Otherwise, we apply the tiling to the index.
3533  * This tiling is of the form
3534  *
3535  *      [D -> A] -> T
3536  *
3537  * The index is of the form
3538  *
3539  *      L -> A
3540  *
3541  * We update the tiling to refer to the AST loop iteratos
3542  *
3543  *      [L -> A] -> T
3544  *
3545  * and modify index to keep track of those iterators
3546  *
3547  *      L -> [L -> A]
3548  *
3549  * Combining these two yields a tiled index expression in terms
3550  * of the AST loop iterators
3551  *
3552  *      L -> T
3553  */
3554 static __isl_give isl_multi_pw_aff *transform_index(
3555         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3556         void *user)
3557 {
3558         struct ppcg_transform_data *data = user;
3559         struct gpu_stmt_access *access;
3560         struct gpu_array_ref_group *group;
3561         struct gpu_array_tile *tile;
3562         isl_pw_multi_aff *iterator_map;
3563         int i;
3564         const char *name;
3565         isl_space *space;
3566         isl_multi_pw_aff *tiling;
3567         isl_pw_multi_aff *pma;
3568         isl_multi_pw_aff *mpa;
3569
3570         data->array = NULL;
3571
3572         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3573         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3574
3575         access = find_access(data->accesses, ref_id);
3576         if (!access)
3577                 return index;
3578         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3579                 return index;
3580
3581         name = get_outer_array_name(access->access);
3582         i = find_array_index(data->gen, name);
3583         if (i < 0)
3584                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3585                         "cannot find array",
3586                         return isl_multi_pw_aff_free(index));
3587         data->array = &data->gen->prog->array[i];
3588         data->local_array = &data->gen->kernel->array[i];
3589
3590         if (access->group < 0) {
3591                 data->global = 1;
3592                 return index;
3593         }
3594
3595         group = data->array->groups[access->group];
3596         tile = group->private_tile;
3597         if (!tile)
3598                 tile = group->shared_tile;
3599         data->global = !tile;
3600         if (!tile)
3601                 return index;
3602
3603         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3604         space = isl_space_map_from_set(space);
3605         pma = isl_pw_multi_aff_identity(space);
3606         pma = isl_pw_multi_aff_product(
3607                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3608         tiling = isl_multi_pw_aff_from_multi_aff(
3609                                     isl_multi_aff_copy(tile->tiling));
3610         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3611
3612         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3613         space = isl_space_map_from_set(space);
3614         mpa = isl_multi_pw_aff_identity(space);
3615         index = isl_multi_pw_aff_range_product(mpa, index);
3616         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3617
3618         return index;
3619 }
3620
3621 /* Dereference "expr" by adding an index [0].
3622  * The original "expr" is assumed not to have any indices.
3623  *
3624  * If "expr" is a member access, then the dereferencing needs
3625  * to be applied to the structure argument of this member access.
3626  */
3627 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3628 {
3629         isl_ctx *ctx;
3630         isl_ast_expr *res;
3631         isl_ast_expr_list *list;
3632
3633         if (isl_ast_expr_get_op_type(expr) == isl_ast_op_member) {
3634                 isl_ast_expr *arg;
3635
3636                 arg = isl_ast_expr_get_op_arg(expr, 0);
3637                 arg = dereference(arg);
3638                 expr = isl_ast_expr_set_op_arg(expr, 0, arg);
3639
3640                 return expr;
3641         }
3642
3643         ctx = isl_ast_expr_get_ctx(expr);
3644         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3645         list = isl_ast_expr_list_from_ast_expr(res);
3646         res = isl_ast_expr_get_op_arg(expr, 0);
3647         res = isl_ast_expr_access(res, list);
3648         isl_ast_expr_free(expr);
3649
3650         return res;
3651 }
3652
3653 /* Linearize the index expression "expr" based on the array bounds
3654  * of "array".
3655  *
3656  * That is, transform expression
3657  *
3658  *      A[i_0][i_1]...[i_n]
3659  *
3660  * to
3661  *
3662  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3663  *
3664  * where b_0, b_1, ..., b_n are the bounds on the array.
3665  *
3666  * If the base of "expr" is a member access, then the linearization needs
3667  * to be applied to the structure argument of this member access.
3668  */
3669 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3670         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3671 {
3672         int i, n;
3673         isl_ctx *ctx;
3674         isl_set *context;
3675         isl_ast_expr *arg0;
3676         isl_ast_expr *res;
3677         isl_ast_expr_list *list;
3678         isl_ast_build *build;
3679
3680         arg0 = isl_ast_expr_get_op_arg(expr, 0);
3681         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
3682             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
3683                 isl_ast_expr *arg;
3684
3685                 arg = isl_ast_expr_get_op_arg(arg0, 0);
3686                 arg = gpu_local_array_info_linearize_index(array, arg);
3687                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
3688                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
3689
3690                 return expr;
3691         }
3692         isl_ast_expr_free(arg0);
3693
3694         ctx = isl_ast_expr_get_ctx(expr);
3695         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3696         build = isl_ast_build_from_context(context);
3697
3698         n = isl_ast_expr_get_op_n_arg(expr);
3699         res = isl_ast_expr_get_op_arg(expr, 1);
3700         for (i = 2; i < n; ++i) {
3701                 isl_pw_aff *bound_i;
3702                 isl_ast_expr *expr_i;
3703
3704                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i - 1);
3705                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3706                 res = isl_ast_expr_mul(res, expr_i);
3707                 expr_i = isl_ast_expr_get_op_arg(expr, i);
3708                 res = isl_ast_expr_add(res, expr_i);
3709         }
3710
3711         isl_ast_build_free(build);
3712
3713         list = isl_ast_expr_list_from_ast_expr(res);
3714         res = isl_ast_expr_get_op_arg(expr, 0);
3715         res = isl_ast_expr_access(res, list);
3716
3717         isl_ast_expr_free(expr);
3718
3719         return res;
3720 }
3721
3722 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3723  *
3724  * If the AST expression refers to a global scalar that is not
3725  * a read-only scalar, then its address was passed to the kernel and
3726  * we need to dereference it.
3727  *
3728  * If the AST expression refers to an access to a global array,
3729  * then we linearize the access exploiting the bounds in data->local_array.
3730  */
3731 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3732         __isl_keep isl_id *id, void *user)
3733 {
3734         struct ppcg_transform_data *data = user;
3735
3736         if (!data->array)
3737                 return expr;
3738         if (gpu_array_is_read_only_scalar(data->array))
3739                 return expr;
3740         if (!data->global)
3741                 return expr;
3742         if (data->array->n_index == 0)
3743                 return dereference(expr);
3744         if (!data->array->linearize)
3745                 return expr;
3746
3747         return gpu_local_array_info_linearize_index(data->local_array, expr);
3748 }
3749
3750 /* This function is called for each instance of a user statement
3751  * in the kernel.
3752  *
3753  * We attach a struct ppcg_kernel_stmt to the "node", containing
3754  * a computed AST expression for each access.
3755  * These AST expressions are computed from iterator_map,
3756  * which expresses the domain
3757  * elements in terms of the generated loops, and sched2shared,
3758  * which expresses the first shared_len dimensions of the schedule
3759  * computed by PPCG in terms of the generated loops.
3760  */
3761 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3762         __isl_keep isl_ast_build *build, void *user)
3763 {
3764         struct ppcg_transform_data data;
3765         struct gpu_gen *gen = (struct gpu_gen *) user;
3766         struct ppcg_kernel_stmt *stmt;
3767         isl_id *id;
3768         isl_pw_multi_aff *sched2shared;
3769         isl_map *map;
3770         isl_pw_multi_aff *iterator_map;
3771         isl_ast_expr *expr, *arg;
3772         isl_union_map *schedule;
3773         int i, n;
3774         struct gpu_stmt_access *access;
3775
3776         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3777         if (!stmt)
3778                 return isl_ast_node_free(node);
3779
3780         expr = isl_ast_node_user_get_expr(node);
3781         arg = isl_ast_expr_get_op_arg(expr, 0);
3782         id = isl_ast_expr_get_id(arg);
3783
3784         schedule = isl_ast_build_get_schedule(build);
3785         map = isl_map_reverse(isl_map_from_union_map(schedule));
3786         iterator_map = isl_pw_multi_aff_from_map(map);
3787         sched2shared = compute_sched_to_shared(gen,
3788                                         isl_pw_multi_aff_copy(iterator_map));
3789
3790         stmt->type = ppcg_kernel_domain;
3791         stmt->u.d.stmt = find_stmt(gen->prog, id);
3792         if (!stmt->u.d.stmt)
3793                 goto error;
3794
3795         data.gen = gen;
3796         data.accesses = stmt->u.d.stmt->accesses;
3797         data.iterator_map = iterator_map;
3798         data.sched2shared = sched2shared;
3799         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
3800                                             build, &transform_index, &data,
3801                                             &transform_expr, &data);
3802
3803         isl_id_free(id);
3804         isl_pw_multi_aff_free(iterator_map);
3805         isl_pw_multi_aff_free(sched2shared);
3806         isl_ast_expr_free(arg);
3807         isl_ast_expr_free(expr);
3808
3809         id = isl_id_alloc(gen->ctx, NULL, stmt);
3810         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3811         return isl_ast_node_set_annotation(node, id);
3812 error:
3813         isl_id_free(id);
3814         isl_pw_multi_aff_free(iterator_map);
3815         ppcg_kernel_stmt_free(stmt);
3816         isl_pw_multi_aff_free(sched2shared);
3817         return isl_ast_node_free(node);
3818 }
3819
3820 /* This function is called when code has been generated for the shared
3821  * tile loops.  The "schedule" refers only to the original statements.
3822  *
3823  * We extend the schedule with that part of gen->local_sched that hasn't
3824  * been taken into account yet.  This introduces parameters referring
3825  * to thread ids in the schedule, so we add them (with the appropriate
3826  * bounds to the context as well).
3827  * Finally, we set the appropriate unrolling options
3828  * if gen->first_unroll is set.
3829  */
3830 static __isl_give isl_ast_node *create_domain_leaf(
3831         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
3832         void *user)
3833 {
3834         struct gpu_gen *gen = (struct gpu_gen *) user;
3835         isl_space *space;
3836         isl_union_map *sched;
3837         isl_ast_node *tree;
3838         isl_set *set;
3839         isl_id_list *iterators;
3840         int n;
3841
3842         schedule = extend_schedule(schedule,
3843                         isl_union_map_copy(gen->local_sched),
3844                         gen->shared_len, gen->thread_tiled_len);
3845
3846         space = isl_ast_build_get_schedule_space(build);
3847         set = isl_set_universe(space);
3848         set = add_bounded_parameters(set, gen->kernel->n_block,
3849                                         gen->kernel->block_dim, "t");
3850         build = isl_ast_build_restrict(build, set);
3851
3852         n = gen->thread_tiled_len - gen->shared_len;
3853
3854         if (gen->first_unroll >= 0) {
3855                 space = isl_space_set_alloc(gen->ctx, 0, n);
3856                 build = set_unroll(build, space, gen->first_unroll);
3857         }
3858         iterators = generate_names(gen->ctx, n, "c");
3859         build = isl_ast_build_set_iterators(build, iterators);
3860         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
3861         tree = isl_ast_build_ast_from_schedule(build, schedule);
3862         isl_ast_build_free(build);
3863
3864         return tree;
3865 }
3866
3867 /* This function is called for each statement node in the AST of the code
3868  * for copying to or from shared/private memory.
3869  * Attach a pointer to a ppcg_kernel_stmt representing the copy
3870  * statement to the node.
3871  * The statement name is "read" or "write", depending on whether we are
3872  * reading from global memory or writing to global memory.
3873  * The name of the T space is {shared,private}_<array>.
3874  *
3875  * The schedule is of the form
3876  *
3877  *      type[A -> T] -> L
3878  *
3879  * where A refers to a piece of an array and T to the corresponding
3880  * shifted tile.  We split this schedule into mappings L -> A and L -> T
3881  * and store the corresponding expressions in stmt->index and stmt->local_index,
3882  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
3883  */
3884 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
3885         __isl_keep isl_ast_build *build, void *user)
3886 {
3887         struct gpu_gen *gen = (struct gpu_gen *) user;
3888         struct ppcg_kernel_stmt *stmt;
3889         isl_id *id;
3890         isl_ast_expr *expr;
3891         isl_space *space;
3892         isl_map *access, *local_access, *map;
3893         isl_pw_multi_aff *pma;
3894         const char *type;
3895         int array_index;
3896
3897         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3898         if (!stmt)
3899                 return isl_ast_node_free(node);
3900
3901         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
3902         type = isl_map_get_tuple_name(access, isl_dim_in);
3903         stmt->u.c.read = !strcmp(type, "read");
3904         access = isl_map_reverse(access);
3905         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
3906         local_access = isl_map_copy(access);
3907
3908         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
3909         id = isl_map_get_tuple_id(access, isl_dim_out);
3910         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3911         access = isl_map_apply_range(access, map);
3912         pma = isl_pw_multi_aff_from_map(access);
3913         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3914         stmt->u.c.index = expr;
3915
3916         map = isl_map_range_map(isl_map_universe(space));
3917         id = isl_map_get_tuple_id(local_access, isl_dim_out);
3918         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3919         local_access = isl_map_apply_range(local_access, map);
3920         pma = isl_pw_multi_aff_from_map(local_access);
3921         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3922         stmt->u.c.local_index = expr;
3923
3924         stmt->u.c.array = gen->copy_group->array;
3925         array_index = stmt->u.c.array - gen->prog->array;
3926         stmt->u.c.local_array = &gen->kernel->array[array_index];
3927         stmt->type = ppcg_kernel_copy;
3928
3929         id = isl_id_alloc(gen->ctx, NULL, stmt);
3930         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3931         return isl_ast_node_set_annotation(node, id);
3932 }
3933
3934 /* Given a schedule of the form
3935  *
3936  *      [S -> A] -> L
3937  *
3938  * (with S the first shared_len dimensions of the computed schedule,
3939  * A the array and L the schedule correponding to the generated loops),
3940  * indicating where to copy the array elements that need to be copied,
3941  * construct code for performing the copying.
3942  *
3943  * "group" is the array reference group that is being copied
3944  * "type" is either "read" or "write"
3945  * private is set if copying needs to be performed to/from registers
3946  *
3947  * We first construct a mapping to a shifted tile of the array,
3948  *
3949  *      [S -> A] -> T(S,A)                                      (1)
3950  *
3951  * If private is set, then we also use this mapping as a schedule
3952  * (which is already thread-specific and will be completely unrolled).
3953  * Otherwise, we wrap/tile the range over the threads.
3954  * The result is
3955  *
3956  *      [S -> A] -> T'(S,A)
3957  *
3958  * Combined with the given schedule, we have
3959  *
3960  *      [S -> A] -> [L -> T'(S,A)]                              (2)
3961  *
3962  * From the shifted tile mapping, we construct a mapping
3963  *
3964  *      [S -> A] -> [A -> T(S,A)]
3965  *
3966  * and apply it to the schedule (2), obtaining
3967  *
3968  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
3969  *
3970  * Note that we can project out S because it is uniquely defined by L.
3971  */
3972 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
3973         __isl_take isl_map *sched,
3974         const char *type, struct gpu_array_ref_group *group,
3975         __isl_take isl_ast_build *build, int private)
3976 {
3977         isl_space *space;
3978         isl_ast_node *tree;
3979         isl_map *schedule, *shift, *map;
3980         isl_set *set;
3981         isl_id_list *iterators;
3982         int n;
3983
3984         shift = shift_access(group);
3985
3986         schedule = isl_map_copy(shift);
3987         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
3988         if (!private)
3989                 schedule = tile_access_schedule(gen, schedule);
3990
3991         n = isl_map_dim(schedule, isl_dim_out);
3992         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
3993         set = add_bounded_parameters(set, gen->kernel->n_block,
3994                                         gen->kernel->block_dim, "t");
3995
3996         schedule = isl_map_range_product(sched, schedule);
3997
3998         space = isl_space_domain(isl_map_get_space(shift));
3999         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
4000         map = isl_map_range_product(map, shift);
4001
4002         schedule = isl_map_apply_domain(schedule, map);
4003
4004         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
4005
4006         build = isl_ast_build_restrict(build, set);
4007
4008         gen->copy_group = group;
4009
4010         if (private) {
4011                 space = isl_space_range(isl_map_get_space(schedule));
4012                 space = isl_space_range(isl_space_unwrap(space));
4013                 build = set_unroll(build, space, 0);
4014         }
4015         iterators = generate_names(gen->ctx, n, "c");
4016         build = isl_ast_build_set_iterators(build, iterators);
4017         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
4018         tree = isl_ast_build_ast_from_schedule(build,
4019                                             isl_union_map_from_map(schedule));
4020         isl_ast_build_free(build);
4021
4022         return tree;
4023 }
4024
4025 /* Return code for reading into or writing from shared memory
4026  * the given array reference group.
4027  *
4028  * If we are performing a read from global memory to shared memory and
4029  * if the array involved is not a scalar, then we copy
4030  * the entire tile to shared memory.  This may result in some extra
4031  * elements getting copied, but it should lead to simpler code
4032  * (which means that fewer registers may be needed) and less divergence.
4033  *
4034  * Otherwise, we only copy the elements that will be read or have been written
4035  * in the kernel.
4036  *
4037  *
4038  * The input "sched" is of the form.
4039  *
4040  *      type[S -> A] -> L
4041  *
4042  * with S the first shared_len dimensions of the computed schedule,
4043  * A the array and L the schedule correponding to the generated loops.
4044  *
4045  * We first drop "type",
4046  *
4047  *      [S -> A] -> L
4048  *
4049  * If the above conditions are satisfied, we project out A,
4050  * resulting in
4051  *
4052  *      S -> L
4053  *
4054  * and then introduce the group tile [S -> T], resulting in
4055  *
4056  *      [S -> T] -> L
4057  */
4058 static __isl_give isl_ast_node *copy_group_shared_accesses(
4059         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4060         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4061 {
4062         const char *type;
4063         int read;
4064         isl_union_map *access;
4065
4066         type = isl_map_get_tuple_name(sched, isl_dim_in);
4067         read = !strcmp(type, "read");
4068
4069         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4070
4071         if (read && !gpu_array_is_scalar(group->array)) {
4072                 isl_space *space;
4073                 isl_map *map;
4074
4075                 space = isl_space_domain(isl_map_get_space(sched));
4076                 space = isl_space_unwrap(space);
4077                 map = isl_map_domain_map(isl_map_universe(space));
4078                 sched = isl_map_apply_domain(sched, map);
4079
4080                 map = group_tile(group);
4081                 map = isl_map_reverse(isl_map_domain_map(map));
4082                 sched = isl_map_apply_domain(sched, map);
4083         }
4084
4085         return copy_access(gen, sched, type, group, build, 0);
4086 }
4087
4088 /* Return code for reading into or writing from private memory
4089  * the given array reference group.
4090  *
4091  * Let S be the first shared_len dimensions of the computed schedule,
4092  * D the iteration domains, A the array and L the schedule corresponding
4093  * to the generated loops.
4094  * "sched" is of the form
4095  *
4096  *      type[S -> A] -> L
4097  *
4098  * where type is either "read" or "write".
4099  * We apply the privatization D -> S(t), with t the thread ids,
4100  * to the access relation D -> A to obtain the privatized access relation
4101  *
4102  *      S(t) -> A
4103  *
4104  * We drop the type from "sched" and intersect with the privatized access
4105  * relation to obtain
4106  *
4107  *      [S(t) -> A] -> L
4108  */
4109 static __isl_give isl_ast_node *copy_group_private_accesses(
4110         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4111         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4112 {
4113         const char *type;
4114         int read;
4115         isl_union_map *priv;
4116         isl_union_map *access;
4117         isl_map *access_map;
4118
4119         type = isl_map_get_tuple_name(sched, isl_dim_in);
4120         read = !strcmp(type, "read");
4121
4122         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
4123         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
4124                                         priv);
4125
4126         access = group_access_relation(group, read, !read);
4127         access = isl_union_map_apply_domain(access, priv);
4128         access_map = isl_map_from_union_map(access);
4129
4130         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4131         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
4132
4133         return copy_access(gen, sched, type, group, build, 1);
4134 }
4135
4136 /* Return code for reading into or writing from shared or private memory.
4137  *
4138  * "schedule" is of the form
4139  *
4140  *      type[S -> A] -> L
4141  *
4142  * with S the first shared_len dimensions of the computed schedule,
4143  * A the array and L the schedule corresponding to the generated loops.
4144  * The array reference group is attached to "type".
4145  */
4146 static __isl_give isl_ast_node *create_access_leaf(
4147         struct gpu_gen *gen, __isl_take isl_map *schedule,
4148         __isl_take isl_ast_build *build)
4149 {
4150         struct gpu_array_ref_group *group;
4151         isl_id *id;
4152
4153         id = isl_map_get_tuple_id(schedule, isl_dim_in);
4154         group = isl_id_get_user(id);
4155         isl_id_free(id);
4156
4157         if (group->private_tile)
4158                 return copy_group_private_accesses(gen, group, schedule,
4159                                                         build);
4160         else
4161                 return copy_group_shared_accesses(gen, group, schedule,
4162                                                         build);
4163 }
4164
4165 /* Create a domain node representing a synchronization.
4166  */
4167 static __isl_give isl_ast_node *create_sync_leaf(
4168         struct gpu_gen *gen, __isl_take isl_map *schedule,
4169         __isl_take isl_ast_build *build)
4170 {
4171         struct ppcg_kernel_stmt *stmt;
4172         isl_id *id;
4173         isl_space *space;
4174         isl_ast_node *node;
4175         isl_ast_expr *expr;
4176
4177         isl_map_free(schedule);
4178
4179         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4180         if (!stmt)
4181                 return NULL;
4182
4183         stmt->type = ppcg_kernel_sync;
4184
4185         space = isl_ast_build_get_schedule_space(build);
4186         space = isl_space_from_domain(space);
4187         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
4188         expr = isl_ast_build_call_from_pw_multi_aff(build,
4189                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
4190         node = isl_ast_node_alloc_user(expr);
4191         isl_ast_build_free(build);
4192
4193         id = isl_id_alloc(gen->ctx, NULL, stmt);
4194         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4195         return isl_ast_node_set_annotation(node, id);
4196 }
4197
4198 /* This function is called during the code generation at the point
4199  * where the schedule domain element is completely determined by
4200  * the generated code.  The input schedule contains the original
4201  * statements as well as synchronization and copy "statements".
4202  * The latter are scheduled at different points than any of the original
4203  * statements, so they will only arrive here in isolation.
4204  *
4205  * If the current schedule only refers to a single statement,
4206  * we check if it is a copy or synchronization statement and
4207  * call the appropriate functions.
4208  * Otherwise, we assume we are dealing with the original statements
4209  * and we call create_domain_leaf.
4210  */
4211 static __isl_give isl_ast_node *create_kernel_leaf(
4212         __isl_take isl_ast_build *build, void *user)
4213 {
4214         struct gpu_gen *gen = (struct gpu_gen *) user;
4215         isl_map *map;
4216         isl_union_map *schedule;
4217         const char *name;
4218
4219         schedule = isl_ast_build_get_schedule(build);
4220
4221         if (isl_union_map_n_map(schedule) != 1)
4222                 return create_domain_leaf(schedule, build, user);
4223
4224         map = isl_map_from_union_map(schedule);
4225         name = isl_map_get_tuple_name(map, isl_dim_in);
4226         if (!strcmp(name, "read") || !strcmp(name, "write"))
4227                 return create_access_leaf(gen, map, build);
4228         if (!strcmp(name, "sync"))
4229                 return create_sync_leaf(gen, map, build);
4230
4231         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4232 }
4233
4234 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4235  * have value 0) and all even schedule dimensions as "unroll".
4236  *
4237  * That is, the options look as follows
4238  *
4239  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4240  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4241  *
4242  * The even positions are used to be able to schedule copying blocks
4243  * and synchronization before or after each level of the shared memory
4244  * tile loops and we want to make sure that code for these is generated
4245  * separately (within each level).
4246  */
4247 static __isl_give isl_ast_build *set_atomic_and_unroll(
4248         __isl_take isl_ast_build *build,
4249         __isl_take isl_space *space, int sched_len)
4250 {
4251         isl_ctx *ctx;
4252         isl_map *map;
4253         isl_constraint *c;
4254         isl_union_map *opt;
4255         isl_local_space *ls;
4256         int i, n;
4257
4258         ctx = isl_ast_build_get_ctx(build);
4259
4260         space = isl_space_params(space);
4261         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4262         space = isl_space_from_domain(space);
4263         space = isl_space_add_dims(space, isl_dim_out, 2);
4264         map = isl_map_universe(isl_space_copy(space));
4265         for (i = 0; i < sched_len; i += 2)
4266                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4267         ls = isl_local_space_from_space(isl_map_get_space(map));
4268         c = isl_equality_alloc(ls);
4269         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4270         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4271         c = isl_constraint_set_constant_si(c, 1);
4272         map = isl_map_add_constraint(map, c);
4273         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4274         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4275         opt = isl_union_map_from_map(map);
4276
4277         map = isl_map_universe(space);
4278         ls = isl_local_space_from_space(isl_map_get_space(map));
4279         c = isl_equality_alloc(ls);
4280         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4281         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4282         map = isl_map_add_constraint(map, c);
4283         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4284         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4285         opt = isl_union_map_add_map(opt, map);
4286
4287         build = isl_ast_build_set_options(build, opt);
4288
4289         return build;
4290 }
4291
4292 /* Return a map that maps a space of dimension gen->shared_len
4293  * to its last dimensions starting at gen->tile_first.
4294  * The range is of dimension
4295  *
4296  *      2 * (gen->shared_len - gen->tile_first) + 1
4297  *
4298  * The input dimensions are mapped to the odd dimensions in the output,
4299  * while the even dimensions (except 2*pos) are fixed to 0.
4300  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4301  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4302  * are mapped to the output.  The remaining input dimensions are projected
4303  * out and the corresponding output dimensions are fixed to 0.
4304  */
4305 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4306         __isl_take isl_space *space, int pos, int val)
4307 {
4308         int i, n;
4309         isl_map *proj;
4310
4311         space = isl_space_set_from_params(space);
4312         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4313         space = isl_space_map_from_set(space);
4314         proj = isl_map_identity(space);
4315         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4316         n = gen->shared_len - gen->tile_first;
4317         for (i = 0; i <= n; ++i) {
4318                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4319                 if (i == pos)
4320                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4321                 else
4322                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4323         }
4324
4325         if (pos < 0)
4326                 return proj;
4327
4328         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4329                                 gen->shared_len - (gen->tile_first + pos));
4330         for (i = pos; i < n; ++i)
4331                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4332
4333         return proj;
4334 }
4335
4336 /* Given the AST context schedule "schedule" and the mapping from
4337  * domains to the shared tile loops "shared_sched", add a schedule
4338  * for a synchronization operation at position "val" of loop level "pos".
4339  *
4340  * schedule is of the form
4341  *
4342  *      D -> L
4343  *
4344  * (with D the iteration domains and L the already generated loops),
4345  * while shared_sched is of the form
4346  *
4347  *      D -> S
4348  *
4349  * We combine them into
4350  *
4351  *      L -> S
4352  *
4353  * apply a mapping
4354  *
4355  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4356  *
4357  * and use the result as a schedule for "sync".
4358  */
4359 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4360         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4361         __isl_keep isl_union_map *shared_sched, int pos, int val)
4362 {
4363         isl_space *space;
4364         isl_map *proj, *map;
4365
4366         shared_sched = isl_union_map_copy(shared_sched);
4367         schedule = isl_union_map_copy(schedule);
4368
4369         space = isl_union_map_get_space(shared_sched);
4370         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4371         map = isl_map_from_union_map(schedule);
4372
4373         proj = insert_even(gen, space, pos, val);
4374         map = isl_map_apply_range(map, proj);
4375         map = isl_map_from_range(isl_map_wrap(map));
4376         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4377
4378         res = isl_union_map_add_map(res, map);
4379
4380         return res;
4381 }
4382
4383 /* Given a set of wrapped references "ref", return the corresponding
4384  * access relations based on the tagged access relations "tagged".
4385  *
4386  * The elements of "ref" are of the form
4387  *
4388  *      [D -> R]
4389  *
4390  * with D an iteration domain and R a reference.
4391  * The elements of "tagged" are of the form
4392  *
4393  *      [D -> R] -> A
4394  *
4395  * with A an array.
4396  *
4397  * Extend "tagged" to include the iteration domain in the range, i.e.,
4398  *
4399  *      [D -> R] -> [D -> A]
4400  *
4401  * apply the result to "ref" and then unwrap the resulting set
4402  * to obtain relations of the form
4403  *
4404  *      D -> A
4405  */
4406 static __isl_give isl_union_map *wrapped_reference_to_access(
4407         __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
4408 {
4409         isl_union_map *tag2access;
4410
4411         tag2access = isl_union_map_copy(tagged);
4412         tag2access = isl_union_map_universe(tag2access);
4413         tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
4414         tag2access = isl_union_map_domain_map(tag2access);
4415         tag2access = isl_union_map_range_product(tag2access, tagged);
4416
4417         ref = isl_union_set_coalesce(ref);
4418         ref = isl_union_set_apply(ref, tag2access);
4419
4420         return isl_union_set_unwrap(ref);
4421 }
4422
4423 /* Given an access relation "access" from "group", remove those reads
4424  * (if "read" is 1) or writes (if "read" is 0) that are only needed to
4425  * communicate data within the same iteration of the last_shared dimension
4426  * of the group.
4427  *
4428  * If the access is a read then it is necessarily an element of
4429  *
4430  *      live_in union (range flow)
4431  *
4432  * where live_in and flow may be overapproximations.
4433  * If the access is a write then it is necessarily an element of
4434  *
4435  *      live_out union (domain flow)
4436  *
4437  * In both cases, the access relation is also a subset of
4438  * the group access relation.
4439  *
4440  * Essentially, we compute the intersection of "access" with either
4441  *
4442  *      live_in union (range non-local-flow)
4443  *
4444  * or
4445  *
4446  *      live_out union (domain non-local-flow)
4447  *
4448  * We first construct a relation "local"
4449  *
4450  *      [[D -> R] -> [D' -> R']]
4451  *
4452  * of pairs of domain iterations accessing the reference group
4453  * and references in the group that are scheduled to the same iteration
4454  * of the last_shared dimension.
4455  *
4456  * If this relation does not intersect the dataflow dependences,
4457  * then there is nothing we can possibly remove and we simply
4458  * return the input.
4459  *
4460  * Otherwise, we remove the "local" dataflow dependences from
4461  * the set of all dataflow dependences.
4462  * Note that if the potential dataflow dependences are an overapproximation
4463  * of the actual dataflow dependences, then the result remains an
4464  * overapproximation of the non-local dataflow dependences.
4465  * Copying to/from global memory is only needed for the references
4466  * in the domain/range of the result or for accesses that are live out/in
4467  * for the entire scop.
4468  *
4469  * We therefore map the domain/range of the "external" relation
4470  * to the corresponding access relation and take the union with
4471  * the live out/in relation.
4472  */
4473 static __isl_give isl_union_map *remove_local_accesses(struct gpu_gen *gen,
4474         struct gpu_array_ref_group *group, __isl_take isl_union_map *access,
4475         int read)
4476 {
4477         int empty;
4478         isl_union_map *tagger;
4479         isl_union_set *domain;
4480         isl_space *space;
4481         isl_union_map *sched, *local, *tagged, *external;
4482         isl_union_set *tag_set;
4483         isl_map *proj;
4484
4485         if (isl_union_map_is_empty(access))
4486                 return access;
4487
4488         tagged = group_tagged_access_relation(group);
4489
4490         sched = isl_union_map_copy(gen->sched);
4491
4492         space = isl_union_map_get_space(sched);
4493         proj = projection(space, gen->untiled_len, group->last_shared + 1);
4494         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4495
4496         tagger = isl_union_map_copy(gen->prog->scop->tagger);
4497         domain = isl_union_map_domain(isl_union_map_copy(tagged));
4498         tagger = isl_union_map_intersect_range(tagger, domain);
4499         sched = isl_union_map_apply_domain(sched, tagger);
4500
4501         local = isl_union_map_apply_range(sched,
4502                             isl_union_map_reverse(isl_union_map_copy(sched)));
4503         local = isl_union_map_intersect(local,
4504                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow));
4505
4506         empty = isl_union_map_is_empty(local);
4507         if (empty < 0 || empty) {
4508                 isl_union_map_free(tagged);
4509                 isl_union_map_free(local);
4510                 if (empty < 0)
4511                         return isl_union_map_free(access);
4512                 return access;
4513         }
4514
4515         external = isl_union_map_copy(gen->prog->scop->tagged_dep_flow);
4516         external = isl_union_map_intersect_params(external,
4517                                 isl_set_copy(gen->prog->scop->context));
4518         external = isl_union_map_subtract(external, local);
4519
4520         if (read) {
4521                 tag_set = isl_union_map_range(external);
4522                 external = wrapped_reference_to_access(tag_set, tagged);
4523                 external = isl_union_map_union(external,
4524                                 isl_union_map_copy(gen->prog->scop->live_in));
4525         } else {
4526                 tag_set = isl_union_map_domain(external);
4527                 external = wrapped_reference_to_access(tag_set, tagged);
4528                 external = isl_union_map_union(external,
4529                                 isl_union_map_copy(gen->prog->scop->live_out));
4530         }
4531
4532         access = isl_union_map_intersect(access, external);
4533
4534         return access;
4535 }
4536
4537 /* Given the AST context schedule "schedule" and the mapping from
4538  * domains to the shared tile loops "shared_sched", add a schedule
4539  * for copying an array reference group to/from shared/private memory.
4540  * "read" is set if data should be copied from global memory
4541  * to shared/private memory.
4542  * "k" represents the current group
4543  * "s" is the total number of groups
4544  *
4545  * We schedule an operation before or after the innermost loop
4546  * of "shared_sched" that affects the tile of the array reference group.
4547  *
4548  * schedule is of the form
4549  *
4550  *      D -> L
4551  *
4552  * (with D the iteration domains and L the already generated loops),
4553  * while shared_sched is of the form
4554  *
4555  *      D -> S
4556  *
4557  * We first compute the access relation for the reference group
4558  *
4559  *      D -> A
4560  *
4561  * and remove from this access relation those reads or writes
4562  * that are only needed to communicate data within the same iteration
4563  * of the last_shared dimension of the group.
4564  * We then combine what is left with shared_sched into
4565  *
4566  *      D -> [S -> A]
4567  *
4568  * If this results in an empty relation, no copying needs to be performed
4569  * at this point.
4570  * Otherwise, we invert the relation and combine it with "schedule" into
4571  *
4572  *      [S -> A] -> L
4573  *
4574  * The actual additional piece of the schedule is obtained from combining
4575  *
4576  *      [S -> A] -> S
4577  *
4578  * with a mapping
4579  *
4580  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4581  *
4582  * The position of "val" corresponds to the innermost loop that affects
4583  * the tile and the value indicates where the copying is scheduled
4584  * with respect to the actual kernel code (at value 0).
4585  * Reads are scheduled before the code, writes to global memory from
4586  * private memory are scheduled at values 1 to s, writes to global
4587  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4588  *
4589  * If we are scheduling a read from global memory to shared memory,
4590  * we insert a synchronization before the kernel code (at the innermost
4591  * level).
4592  * If we are scheduling a write to global memory, then we add
4593  * a synchronization after all writes (at value 2 * s + 2).
4594  * However, there is no need for a synchronization after the outermost loop.
4595  * A write to global memory from private memory at the innermost level
4596  * does not require a synchronization, because it is covered by
4597  * the synchronization after the kernel inserted by body_schedule.
4598  */
4599 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4600         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4601         __isl_keep isl_union_map *shared_sched,
4602         struct gpu_array_ref_group *group, int read, int k, int s)
4603 {
4604         int n;
4605         int pos, val;
4606         isl_space *space;
4607         isl_union_map *access;
4608         isl_map *map, *proj, *access_map;
4609         isl_id *id;
4610
4611         access = group_access_relation(group, read, !read);
4612         access = remove_local_accesses(gen, group, access, read);
4613         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4614                                                 access);
4615
4616         if (isl_union_map_is_empty(access)) {
4617                 isl_union_map_free(access);
4618                 return res;
4619         }
4620
4621         access = isl_union_map_reverse(access);
4622         access = isl_union_map_apply_range(access,
4623                                             isl_union_map_copy(schedule));
4624         access_map = isl_map_from_union_map(access);
4625
4626         space = isl_space_copy(group->array->space);
4627         space = isl_space_from_range(space);
4628         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4629         map = isl_map_domain_map(isl_map_universe(space));
4630
4631         space = isl_union_map_get_space(schedule);
4632         pos = group->last_shared + 1 - gen->tile_first;
4633         assert(pos >= 0);
4634         if (read)
4635                 val = -2 - k;
4636         else if (group->private_tile)
4637                 val = 1 + k;
4638         else
4639                 val = 1 + s + 1 + k;
4640         proj = insert_even(gen, space, pos, val);
4641         map = isl_map_apply_range(map, proj);
4642
4643         access_map = isl_map_range_product(access_map, map);
4644
4645         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4646         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4647
4648         res = isl_union_map_add_map(res, access_map);
4649
4650         n = gen->shared_len - gen->tile_first;
4651         if (read) {
4652                 if (!group->private_tile)
4653                         res = add_sync_schedule(gen, res, schedule,
4654                                                 shared_sched, n, -1);
4655         } else {
4656                 if (pos == 0)
4657                         return res;
4658                 if (pos == n && group->private_tile)
4659                         return res;
4660                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4661                                         pos, 2 * s + 2);
4662         }
4663
4664         return res;
4665 }
4666
4667 /* Return a schedule for the shared tile loops based on the current
4668  * AST context schedule.
4669  *
4670  * We create a "shared_sched" that maps the domains to the first
4671  * shared_len dimensions of the computed schedule, project out the
4672  * first tile_first dimensions (as these are already covered by
4673  * the host code) and insert "statement-level" dimensions at even
4674  * positions so that we can schedule copy blocks and synchronization
4675  * before/after each level.
4676  *
4677  * In particular, copy blocks are inserted inside the innermost
4678  * level that affects the tile.  For the copying to global memory,
4679  * those from private memory are scheduled before those from shared
4680  * memory such that synchronization can be inserted between the two
4681  * at the innermost level.
4682  * Synchronization is inserted at the innermost level before the
4683  * actual kernel code if there is any copying from global memory
4684  * to shared memory.  It is inserted unconditionally at the innermost
4685  * level after the actual kernel code and the copying to global memory
4686  * from private memory (if any).  Finally, it is inserted after
4687  * any copying to global memory, except at the outermost level
4688  * and at the innermost level if there is no copying from shared
4689  * memory.  The copying from private memory is covered by the unconditional
4690  * synchronization at the innermost level.
4691  */
4692 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4693         __isl_take isl_union_map *schedule)
4694 {
4695         isl_space *space;
4696         isl_union_map *res;
4697         isl_union_map *shared_sched;
4698         isl_union_map *sched;
4699         isl_map *proj, *map;
4700         int i, j, k, s;
4701
4702         shared_sched = isl_union_map_copy(gen->tiled_sched);
4703         proj = projection(isl_union_map_get_space(shared_sched),
4704                                 gen->tiled_len, gen->shared_len);
4705         shared_sched = isl_union_map_apply_range(shared_sched,
4706                                 isl_union_map_from_map(proj));
4707         space = isl_union_map_get_space(shared_sched);
4708         proj = insert_even(gen, space, -1, 0);
4709         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4710                                 isl_union_map_from_map(proj));
4711
4712         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4713
4714         s = 0;
4715         for (i = 0; i < gen->prog->n_array; ++i)
4716                 s += gen->prog->array[i].n_group;
4717
4718         k = 0;
4719         for (i = 0; i < gen->prog->n_array; ++i) {
4720                 struct gpu_array_info *array = &gen->prog->array[i];
4721
4722                 for (j = 0; j < array->n_group; ++j) {
4723                         struct gpu_array_ref_group *group;
4724
4725                         group = array->groups[j];
4726                         if (!group->private_tile && !group->shared_tile)
4727                                 continue;
4728                         res = add_group_schedule(gen, res, schedule,
4729                                                 shared_sched, group, 0, k, s);
4730                         res = add_group_schedule(gen, res, schedule,
4731                                                 shared_sched, group, 1, k, s);
4732                         ++k;
4733                 }
4734         }
4735
4736         res = add_sync_schedule(gen, res, schedule, shared_sched,
4737                             gen->shared_len - gen->tile_first, 1 + s);
4738
4739         isl_union_map_free(shared_sched);
4740         isl_union_map_free(schedule);
4741
4742         return res;
4743 }
4744
4745 /* Generate code for "kernel" in the given "context".
4746  *
4747  * We first generate code for the shared tile loops (T1T, T1P and T2)
4748  * in a context that includes the block ids.
4749  * Within each iteration of these loops an additional code generation
4750  * is performed (within create_kernel_leaf) for the rest of the schedule
4751  * in a context that includes the thread ids.
4752  */
4753 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
4754         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
4755         __isl_keep isl_multi_pw_aff *grid_size)
4756 {
4757         isl_space *space;
4758         isl_set *set;
4759         isl_id_list *iterators;
4760         isl_union_map *schedule;
4761         isl_ast_node *tree;
4762         int sched_len;
4763
4764         schedule = isl_ast_build_get_schedule(build);
4765
4766         build = isl_ast_build_copy(build);
4767         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
4768         space = isl_ast_build_get_schedule_space(build);
4769         set = isl_set_universe(isl_space_copy(space));
4770         set = add_bounded_parameters_dynamic(set, grid_size, "b");
4771         build = isl_ast_build_restrict(build, set);
4772
4773         schedule = body_schedule(gen, schedule);
4774
4775         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
4776
4777         build = set_atomic_and_unroll(build, space, sched_len);
4778         iterators = generate_names(gen->ctx, sched_len, "g");
4779         build = isl_ast_build_set_iterators(build, iterators);
4780         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
4781         tree = isl_ast_build_ast_from_schedule(build, schedule);
4782         isl_ast_build_free(build);
4783
4784         return tree;
4785 }
4786
4787 /* Attach "id" to the given node.
4788  */
4789 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
4790         __isl_keep isl_ast_build *build, void *user)
4791 {
4792         isl_id *id = user;
4793
4794         node = isl_ast_node_set_annotation(node, id);
4795
4796         return node;
4797 }
4798
4799 /* Construct an AST node for performing a kernel launch and attach
4800  * the information about the kernel to that node.
4801  *
4802  * The kernel AST has been constructed in the context of the range
4803  * of "schedule".  In particular, the grid size has been computed
4804  * in the context.  We therefore still need to make sure that these
4805  * constraints are expressed in the code.  We do this by creating a schedule
4806  *
4807  *      kernel[] -> [S -> []]
4808  *
4809  * where S is the schedule domain, i.e., the range of "schedule".
4810  * The AST generation will then create a single call surrounded by
4811  * all the conditions in "S" that have not been expressed yet.
4812  *
4813  * The kernel information is attached to this node in attach_id.
4814  */
4815 static __isl_give isl_ast_node *construct_launch(
4816         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
4817         __isl_take struct ppcg_kernel *kernel)
4818 {
4819         isl_id *id;
4820         isl_ctx *ctx;
4821         isl_union_set *domain;
4822         isl_set *set;
4823         isl_map *map;
4824         isl_ast_node *node;
4825
4826         ctx = isl_ast_build_get_ctx(build);
4827
4828         id = isl_id_alloc(ctx, NULL, kernel);
4829         id = isl_id_set_free_user(id, &ppcg_kernel_free);
4830
4831         domain = isl_union_map_range(schedule);
4832         set = isl_set_from_union_set(domain);
4833         map = isl_map_from_domain(set);
4834         map = isl_map_from_range(isl_map_wrap(map));
4835         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
4836         schedule = isl_union_map_from_map(map);
4837
4838         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
4839         node = isl_ast_build_ast_from_schedule(build, schedule);
4840         isl_ast_build_free(build);
4841
4842         return node;
4843 }
4844
4845 /* This function is called for each leaf in the AST of the host code.
4846  * We first specialize the schedule to the site of the leaf, compute
4847  * the size of shared memory and then construct the body of the host code
4848  * and the associated kernel.
4849  *
4850  * The necessary information for printing the kernel launch is
4851  * stored in a struct ppcg_kernel and attached to the leaf node
4852  * created to represent the launch.
4853  */
4854 static __isl_give isl_ast_node *create_host_leaf(
4855         __isl_take isl_ast_build *build, void *user)
4856 {
4857         struct gpu_gen *gen = (struct gpu_gen *) user;
4858         isl_id *id;
4859         isl_ast_node *node;
4860         struct ppcg_kernel *kernel;
4861         isl_set *host_domain;
4862         isl_union_map *schedule;
4863         isl_union_map *local_sched;
4864         isl_union_map *access;
4865         isl_union_set *domain;
4866         int i;
4867
4868         schedule = isl_ast_build_get_schedule(build);
4869
4870         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
4871         read_sizes(gen);
4872
4873         domain = isl_union_map_domain(isl_union_map_copy(schedule));
4874
4875         local_sched = isl_union_map_copy(gen->sched);
4876         local_sched = isl_union_map_intersect_domain(local_sched, domain);
4877         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
4878                                      isl_union_map_copy(gen->prog->may_write));
4879         access = isl_union_map_apply_domain(access,
4880                                             isl_union_map_copy(local_sched));
4881
4882         gen->tiled_sched = tile_schedule(gen, local_sched);
4883         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
4884         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
4885
4886         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
4887         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
4888         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
4889
4890         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
4891         if (!kernel)
4892                 goto error;
4893
4894         kernel->id = gen->kernel_id++;
4895         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
4896         kernel->grid_size = extract_grid_size(gen, kernel);
4897         extract_block_size(gen, kernel);
4898         kernel->arrays = isl_union_map_range(access);
4899         kernel->arrays = isl_union_set_apply(kernel->arrays,
4900                                 isl_union_map_copy(gen->prog->to_outer));
4901         kernel->space = isl_ast_build_get_schedule_space(build);
4902
4903         gen->private_access = NULL;
4904         compute_shared_sched(gen);
4905         gen->privatization = compute_privatization(gen);
4906         if (group_references(gen) < 0)
4907                 schedule = isl_union_map_free(schedule);
4908         compute_private_access(gen);
4909         check_shared_memory_bound(gen);
4910         compute_group_tilings(gen);
4911         host_domain = isl_set_from_union_set(isl_union_map_range(
4912                                                 isl_union_map_copy(schedule)));
4913         localize_bounds(gen, kernel, host_domain);
4914
4915         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
4916
4917         kernel->tree = generate_kernel(gen, build, host_domain,
4918                                         kernel->grid_size);
4919         create_kernel_vars(gen, kernel);
4920
4921         free_local_array_info(gen);
4922         isl_map_free(gen->privatization);
4923         isl_union_map_free(gen->private_access);
4924         isl_union_map_free(gen->local_sched);
4925         isl_union_map_free(gen->tiled_sched);
4926         isl_union_map_free(gen->shared_sched);
4927         isl_union_map_free(gen->shared_proj);
4928         isl_set_free(host_domain);
4929         free(gen->tile_size);
4930
4931         node = construct_launch(build, schedule, kernel);
4932
4933         return node;
4934 error:
4935         isl_union_map_free(schedule);
4936         return NULL;
4937 }
4938
4939 /* Use isl to generate code for the outer gen->tile_first loops
4940  * of the global schedule in gen->sched, resulting in the host code.
4941  * Within each iteration of this partial schedule, i.e., for each kernel
4942  * launch, create_host_leaf takes care of generating the kernel code.
4943  */
4944 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
4945 {
4946         isl_ast_build *build;
4947         isl_ast_node *tree;
4948         isl_union_map *sched;
4949         isl_map *proj;
4950         isl_id_list *iterators;
4951
4952         sched = isl_union_map_copy(gen->sched);
4953         proj = projection(isl_union_map_get_space(sched),
4954                             gen->untiled_len, gen->tile_first);
4955         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4956
4957         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
4958         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
4959         iterators = generate_names(gen->ctx, gen->tile_first, "h");
4960         build = isl_ast_build_set_iterators(build, iterators);
4961         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
4962         tree = isl_ast_build_ast_from_schedule(build, sched);
4963         isl_ast_build_free(build);
4964
4965         return tree;
4966 }
4967
4968 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4969 {
4970         if (!str)
4971                 return NULL;
4972         return isl_union_map_read_from_str(ctx, str);
4973 }
4974
4975 /* Information about the outermost tilable bands in the forest of bands.
4976  *
4977  * tile_len and n_parallel are only sets on band_info structures
4978  * that correspond to outermost bands.  For other bands (in particular,
4979  * ancestors of the outermost bands), n_parallal is set to 0.
4980  *
4981  * prefix is the (padded) schedule leading up to the outermost tilable bands.
4982  *
4983  * tile_first is the number of schedule dimensions in prefix.
4984  *
4985  * suffix is the schedule of the outermost tilable bands and their descendants.
4986  */
4987 struct band_info {
4988         struct gpu_gen *gen;
4989         int tile_first;
4990         int tile_len;
4991         int n_parallel;
4992         isl_union_map *prefix;
4993         isl_union_map *suffix;
4994 };
4995
4996 /* Set tile_len and n_parallel of the statement to that of
4997  * their outermost band, recorded in the band_info.
4998  */
4999 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
5000 {
5001         struct band_info *info = user;
5002         struct gpu_stmt *stmt;
5003         isl_id *id;
5004
5005         id = isl_map_get_tuple_id(map, isl_dim_in);
5006         stmt = find_stmt(info->gen->prog, id);
5007         isl_id_free(id);
5008
5009         stmt->tile_len = info->tile_len;
5010         stmt->n_parallel = info->n_parallel;
5011
5012         isl_map_free(map);
5013
5014         return 0;
5015 }
5016
5017 static void list_select_outer_band(struct gpu_gen *gen,
5018         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
5019
5020 /* Check if this band has any parallel loops.  If so, take it as
5021  * the outermost tilable band.  If not, continue looking for the
5022  * outermost tilable band in the children of the current band.
5023  */
5024 static void band_select_outer_band(struct gpu_gen *gen,
5025         __isl_take isl_band *band, int pos, struct band_info *info)
5026 {
5027         int n = isl_band_n_member(band);
5028         int n_parallel;
5029
5030         for (n_parallel = 0; n_parallel < n; ++n_parallel)
5031                 if (!isl_band_member_is_coincident(band, n_parallel))
5032                         break;
5033
5034         info->n_parallel = n_parallel;
5035         if (n_parallel) {
5036                 gen->any_parallelism = 1;
5037                 info->gen = gen;
5038                 info->tile_first = pos;
5039                 info->tile_len = n;
5040                 info->prefix = isl_band_get_prefix_schedule(band);
5041                 info->suffix = isl_union_map_flat_range_product(
5042                                 isl_band_get_partial_schedule(band),
5043                                 isl_band_get_suffix_schedule(band));
5044                 isl_union_map_foreach_map(info->prefix,
5045                                             &set_stmt_tile_len, info);
5046         } else if (isl_band_has_children(band)) {
5047                 isl_band_list *children;
5048                 children = isl_band_get_children(band);
5049                 list_select_outer_band(gen, children, pos + n, info);
5050         } else {
5051                 info->gen = gen;
5052                 info->tile_first = pos + n;
5053                 info->tile_len = 0;
5054                 info->prefix = isl_union_map_flat_range_product(
5055                                 isl_band_get_prefix_schedule(band),
5056                                 isl_band_get_partial_schedule(band));
5057                 info->suffix = isl_band_get_suffix_schedule(band);
5058                 isl_union_map_foreach_map(info->prefix,
5059                                             &set_stmt_tile_len, info);
5060         }
5061
5062         isl_band_free(band);
5063 }
5064
5065 /* Comparison function that returns a non-zero value for band_infos
5066  * with different tile_len fields or different n_parallel fields.
5067  */
5068 static int cmp_band(const void *p1, const void *p2)
5069 {
5070         const struct band_info *info1 = p1;
5071         const struct band_info *info2 = p2;
5072
5073         if (info1->tile_len != info2->tile_len)
5074                 return info1->tile_len - info2->tile_len;
5075
5076         return info1->n_parallel - info2->n_parallel;
5077 }
5078
5079 /* Extend "umap" with coordinates with fixed value "val"
5080  * to a total length of "dst_len", assuming the original dimension is "src_len".
5081  */
5082 static __isl_give isl_union_map *extend_range(
5083         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
5084 {
5085         isl_space *dim;
5086         isl_map *map;
5087         int i;
5088
5089         dim = isl_union_map_get_space(umap);
5090         map = isl_map_reverse(projection(dim, dst_len, src_len));
5091         for (i = src_len; i < dst_len; ++i)
5092                 map = isl_map_fix_si(map, isl_dim_out, i, val);
5093
5094         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
5095
5096         return umap;
5097 }
5098
5099 /* Group bands with the same values for tile_len and n_parallel.
5100  * The prefix schedule is then extended with a fixed coordinate that
5101  * is different for each such group.
5102  * Note that the actual values for this coordinate are not important.
5103  * The bands have already been effectively separated at a higher level
5104  * or they are independent and may be executed in parallel.
5105  * The list of band_info has been sorted before this functions is called.
5106  */
5107 static void separate_bands(struct band_info *info, int n)
5108 {
5109         int i;
5110         int j = 0;
5111
5112         for (i = 0; i < n; ++i) {
5113                 int l = info[i].tile_first;
5114
5115                 if (i &&
5116                     (info[i].tile_len != info[i - 1].tile_len ||
5117                      info[i].n_parallel != info[i - 1].n_parallel))
5118                         j++;
5119
5120                 info[i].prefix = extend_range(info[i].prefix,
5121                                                 l, l + 1, j);
5122                 info[i].tile_first = l + 1;
5123         }
5124 }
5125
5126 /* Select the outermost bands in the elements of the list, align
5127  * their prefix schedules, separate bands with different values
5128  * for tile_len and/or n_parallel and then combine the resulting
5129  * prefix and suffix schedules into a single pair of prefix and
5130  * suffix schedules for the entire list.
5131  */
5132 static void list_select_outer_band(struct gpu_gen *gen,
5133         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
5134 {
5135         isl_band *band;
5136         int i;
5137         int n = isl_band_list_n_band(list);
5138         isl_ctx *ctx = isl_band_list_get_ctx(list);
5139         struct band_info *info;
5140         int max_tile_first;
5141         isl_union_map *prefix;
5142         isl_union_map *suffix;
5143
5144         assert(n >= 1);
5145         info = isl_calloc_array(ctx, struct band_info, n);
5146         assert(info);
5147
5148         max_tile_first = 0;
5149         for (i = 0; i < n; ++i) {
5150                 band = isl_band_list_get_band(list, i);
5151                 band_select_outer_band(gen, band, pos, &info[i]);
5152                 if (info[i].tile_first > max_tile_first)
5153                         max_tile_first = info[i].tile_first;
5154         }
5155
5156         for (i = 0; i < n; ++i) {
5157                 if (info[i].tile_first == max_tile_first)
5158                         continue;
5159                 info[i].prefix = extend_range(info[i].prefix,
5160                                         info[i].tile_first, max_tile_first, 0);
5161                 info[i].tile_first = max_tile_first;
5162         }
5163
5164         qsort(info, n, sizeof(struct band_info), &cmp_band);
5165
5166         for (i = 0; i < n - 1; ++i)
5167                 if (info[i].tile_len != info[i + 1].tile_len ||
5168                     info[i].n_parallel != info[i + 1].n_parallel)
5169                         break;
5170
5171         if (i < n -1)
5172                 separate_bands(info, n);
5173
5174         prefix = info[0].prefix;
5175         suffix = info[0].suffix;
5176
5177         for (i = 1; i < n; ++i) {
5178                 prefix = isl_union_map_union(prefix, info[i].prefix);
5179                 suffix = isl_union_map_union(suffix, info[i].suffix);
5180         }
5181
5182         list_info->tile_first = info[0].tile_first;
5183         list_info->tile_len = -1;
5184         list_info->prefix = prefix;
5185         list_info->suffix = suffix;
5186
5187         isl_band_list_free(list);
5188         free(info);
5189 }
5190
5191 /* Select the outermost tilable band that (by construction)
5192  * has at least one parallel loop.
5193  * The starting position of the aligned band is stored in the pair
5194  * gen->tile_first.
5195  * The sizes and number of parallel loops may be different in different
5196  * parts of the band forest and are therefore stored in the gpu_stmts.
5197  *
5198  * Return the complete schedule, with the tilable bands aligned
5199  * at gen->tile_first and padded with zero, if needed.
5200  */
5201 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
5202         __isl_keep isl_schedule *schedule)
5203 {
5204         isl_band_list *list;
5205         struct band_info info;
5206
5207         gen->n_parallel = 0;
5208         gen->tile_len = -1;
5209
5210         list = isl_schedule_get_band_forest(schedule);
5211
5212         if (isl_band_list_n_band(list) == 0) {
5213                 isl_band_list_free(list);
5214                 return isl_schedule_get_map(schedule);
5215         }
5216
5217         list_select_outer_band(gen, list, 0, &info);
5218
5219         gen->tile_first = info.tile_first;
5220         info.suffix = align_range(info.suffix);
5221
5222         return isl_union_map_flat_range_product(info.prefix, info.suffix);
5223 }
5224
5225 /* Set gen->untiled_len to the number of scheduling dimensions
5226  * for the schedule of the first domain.
5227  * We assume here that this number is the same for all domains.
5228  */
5229 static int set_untiled_len(__isl_take isl_map *map, void *user)
5230 {
5231         unsigned *untiled_len = user;
5232
5233         *untiled_len = isl_map_dim(map, isl_dim_out);
5234
5235         isl_map_free(map);
5236         return -1;
5237 }
5238
5239 /* Compute an appropriate schedule based on the accesses in
5240  * gen->read and gen->write.
5241  *
5242  * We use the dependences in gen->prog->scop to compute
5243  * a schedule that has a parallel loop in each tilable band.
5244  * Finally, we select the outermost tilable band.
5245  */
5246 static void compute_schedule(struct gpu_gen *gen)
5247 {
5248         isl_union_set *domain;
5249         isl_union_map *dep_raw, *dep;
5250         isl_union_map *sched;
5251         isl_schedule_constraints *sc;
5252         isl_schedule *schedule;
5253
5254         dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
5255
5256         dep = isl_union_map_copy(gen->prog->scop->dep_false);
5257         dep = isl_union_map_union(dep, dep_raw);
5258         dep = isl_union_map_coalesce(dep);
5259
5260         domain = isl_union_set_copy(gen->prog->scop->domain);
5261         domain = isl_union_set_intersect_params(domain,
5262                                 isl_set_copy(gen->prog->scop->context));
5263         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
5264         sc = isl_schedule_constraints_set_validity(sc, isl_union_map_copy(dep));
5265         sc = isl_schedule_constraints_set_coincidence(sc,
5266                                                     isl_union_map_copy(dep));
5267         sc = isl_schedule_constraints_set_proximity(sc, dep);
5268
5269         if (gen->options->debug->dump_schedule_constraints)
5270                 isl_schedule_constraints_dump(sc);
5271         schedule = isl_schedule_constraints_compute_schedule(sc);
5272         if (gen->options->debug->dump_schedule)
5273                 isl_schedule_dump(schedule);
5274
5275         sched = select_outer_tilable_band(gen, schedule);
5276
5277         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
5278         sched = isl_union_map_intersect_domain(sched, domain);
5279         gen->sched = sched;
5280
5281         isl_schedule_free(schedule);
5282 }
5283
5284 /* Compute the sets of outer array elements that need to be copied in and out.
5285  *
5286  * In particular, for each array that is possibly written anywhere in
5287  * gen->prog and that is visible outside the corresponding scop,
5288  * we copy out its entire extent.
5289  *
5290  * Any array elements that is read without first being written needs
5291  * to be copied in. Furthermore, if there are any array elements that
5292  * are copied out, but that may not be written inside gen->prog, then
5293  * they also need to be copied in to ensure that the value after execution
5294  * is the same as the value before execution.
5295  * In case the array elements are structures, we need to take into
5296  * account that all members of the structures need to be written
5297  * by gen->prog before we can avoid copying the data structure in.
5298  *
5299  * While computing the set of array elements that are copied out but
5300  * not necessarily written, we intersect both sets with the context.
5301  * This helps in those cases where the arrays are declared with a fixed size,
5302  * while the accesses are parametric and the context assigns a fixed value
5303  * to the parameters.
5304  *
5305  * If an element from a local array is read without first being written,
5306  * then there is no point in copying it in since it cannot have been
5307  * written prior to the scop.  Warn about the uninitialized read instead.
5308  */
5309 static void compute_copy_in_and_out(struct gpu_gen *gen)
5310 {
5311         int i;
5312         isl_union_set *local;
5313         isl_union_set *may_write, *must_write;
5314         isl_union_set *copy_in, *copy_out;
5315         isl_union_set *not_written;
5316         isl_union_map *uninitialized;
5317         isl_union_map *local_uninitialized;
5318
5319         must_write = isl_union_map_range(
5320                                 isl_union_map_copy(gen->prog->must_write));
5321         must_write = isl_union_set_intersect_params(must_write,
5322                                             isl_set_copy(gen->prog->context));
5323         may_write = isl_union_map_range(
5324                                 isl_union_map_copy(gen->prog->may_write));
5325         may_write = isl_union_set_intersect_params(may_write,
5326                                             isl_set_copy(gen->prog->context));
5327         may_write = isl_union_set_universe(may_write);
5328         may_write = isl_union_set_apply(may_write,
5329                                     isl_union_map_copy(gen->prog->to_outer));
5330         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
5331         local = isl_union_set_copy(copy_out);
5332
5333         for (i = 0; i < gen->prog->n_array; ++i) {
5334                 isl_space *space;
5335                 isl_set *write_i;
5336                 int empty;
5337
5338                 space = isl_space_copy(gen->prog->array[i].space);
5339
5340                 if (gen->prog->array[i].local) {
5341                         isl_set *set;
5342
5343                         set = isl_set_universe(space);
5344                         local = isl_union_set_add_set(local, set);
5345                         continue;
5346                 }
5347
5348                 write_i = isl_union_set_extract_set(may_write, space);
5349                 empty = isl_set_fast_is_empty(write_i);
5350                 isl_set_free(write_i);
5351                 if (empty)
5352                         continue;
5353
5354                 write_i = isl_set_copy(gen->prog->array[i].extent);
5355                 copy_out = isl_union_set_add_set(copy_out, write_i);
5356         }
5357         isl_union_set_free(may_write);
5358
5359         copy_out = isl_union_set_intersect_params(copy_out,
5360                                             isl_set_copy(gen->prog->context));
5361
5362         gen->prog->copy_out = isl_union_set_copy(copy_out);
5363
5364         copy_out = isl_union_set_apply(copy_out,
5365                                     isl_union_map_copy(gen->prog->to_inner));
5366         not_written = isl_union_set_subtract(copy_out, must_write);
5367
5368         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
5369         local_uninitialized = isl_union_map_copy(uninitialized);
5370
5371         local = isl_union_set_apply(local,
5372                                     isl_union_map_copy(gen->prog->to_inner));
5373         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
5374                                                             local);
5375         if (!isl_union_map_is_empty(local_uninitialized)) {
5376                 fprintf(stderr,
5377                         "possibly uninitialized reads (not copied in):\n");
5378                 isl_union_map_dump(local_uninitialized);
5379         }
5380         uninitialized = isl_union_map_subtract(uninitialized,
5381                                                 local_uninitialized);
5382         copy_in = isl_union_map_range(uninitialized);
5383         copy_in = isl_union_set_union(copy_in, not_written);
5384         copy_in = isl_union_set_apply(copy_in,
5385                                     isl_union_map_copy(gen->prog->to_outer));
5386
5387         gen->prog->copy_in = copy_in;
5388 }
5389
5390 /* Extract a gpu_stmt_access from "expr", append it to the list
5391  * that ends in *next_access and return the updated end of the list.
5392  */
5393 static struct gpu_stmt_access **expr_extract_access(struct pet_expr *expr,
5394         struct gpu_stmt_access **next_access)
5395 {
5396         struct gpu_stmt_access *access;
5397         isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
5398
5399         access = isl_alloc_type(ctx, struct gpu_stmt_access);
5400         assert(access);
5401         access->next = NULL;
5402         access->read = expr->acc.read;
5403         access->write = expr->acc.write;
5404         access->access = pet_expr_access_get_may_access(expr);
5405         access->tagged_access = pet_expr_access_get_tagged_may_access(expr);
5406         access->exact_write = !expr->acc.write ||
5407                 isl_map_is_equal(expr->acc.access, access->access);
5408         access->ref_id = isl_id_copy(expr->acc.ref_id);
5409         access->group = -1;
5410
5411         *next_access = access;
5412         next_access = &(*next_access)->next;
5413         return next_access;
5414 }
5415
5416 static struct gpu_stmt_access **expr_extract_accesses(struct pet_expr *expr,
5417         struct gpu_stmt_access **next_access)
5418 {
5419         int i;
5420
5421         for (i = 0; i < expr->n_arg; ++i)
5422                 next_access = expr_extract_accesses(expr->args[i],
5423                                                         next_access);
5424
5425         if (expr->type == pet_expr_access)
5426                 next_access = expr_extract_access(expr, next_access);
5427
5428         return next_access;
5429 }
5430
5431 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt)
5432 {
5433         struct gpu_stmt_access **next_access = &stmt->accesses;
5434
5435         stmt->accesses = NULL;
5436         expr_extract_accesses(stmt->stmt->body, next_access);
5437 }
5438
5439 /* Return an array of gpu_stmt representing the statements in "scop".
5440  */
5441 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5442         __isl_keep isl_set *context)
5443 {
5444         int i;
5445         struct gpu_stmt *stmts;
5446
5447         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->n_stmt);
5448         if (!stmts)
5449                 return NULL;
5450
5451         for (i = 0; i < scop->n_stmt; ++i) {
5452                 struct gpu_stmt *s = &stmts[i];
5453
5454                 s->id = isl_set_get_tuple_id(scop->stmts[i]->domain);
5455                 s->stmt = scop->stmts[i];
5456                 pet_stmt_extract_accesses(s);
5457         }
5458
5459         return stmts;
5460 }
5461
5462 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5463  */
5464 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5465 {
5466         struct gpu_gen *gen = user;
5467
5468         return gen->print(p, gen->prog, gen->tree, &gen->types,
5469                             gen->print_user);
5470 }
5471
5472 /* Generate CUDA code for "scop" and print it to "p".
5473  * After generating an AST for the transformed scop as explained below,
5474  * we call "gen->print" to print the AST in the desired output format
5475  * to "p".
5476  *
5477  * If it turns out that it does not make sense to generate GPU code,
5478  * then we generate CPU code instead.
5479  *
5480  * The GPU code is generated in a context where at least one
5481  * statement instance is executed.  The corresponding guard (if any) is printed
5482  * around the entire generated GPU code, except for the declaration
5483  * of the arrays that are visible outside of the scop and that therefore
5484  * cannot be declared inside the body of any possible guard.
5485  *
5486  * We first compute a schedule that respects the dependences
5487  * of the original program and select the outermost band
5488  * of tilable dimensions that has at least one parallel loop.
5489  * We then have three blocks of dimensions
5490  *
5491  *      H               B                       G
5492  *
5493  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5494  * in
5495  *
5496  *      H       T               P               G
5497  *
5498  * For each iteration of the T loop and for each array, we compute
5499  * the array elements accessed by that iteration, construct a rectangular
5500  * box around it and shift it to the origin.  The result is used
5501  * as shared memory for the array.
5502  *
5503  * We then split off at most 2 parallel loops from the T loops and
5504  * at most 3 parallel loops from the P loops
5505  *
5506  *      H       T1      T2      P1      P2      G
5507  *
5508  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5509  * according to "grid"/"block" sizes.
5510  *
5511  *      H       T1T T1P T2      P1T P1P P2      G
5512  *
5513  * Finally, the T1P and P1P iterators are equated to the block and
5514  * thread dimensions respectively and so are effectively removed.
5515  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5516  * are run on the GPU.
5517  *
5518  * Code is generated in three stages.  We first generate code for the
5519  * host (the H loops), with iterators h%d.  Then, for each leaf node
5520  * of the resulting AST, we generate code for the shared loops (up to
5521  * and including T2), with iterators g%d and after equating the H loops
5522  * to h%d parameters and the T1P loops to the block dimensions.
5523  * Finally, we generate code for the remaining loops in a similar fashion.
5524  */
5525 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5526         struct gpu_gen *gen, struct ppcg_scop *scop,
5527         struct ppcg_options *options)
5528 {
5529         struct gpu_prog *prog;
5530         isl_ctx *ctx;
5531         isl_set *context, *guard;
5532
5533         if (!scop)
5534                 return isl_printer_free(p);
5535
5536         ctx = isl_printer_get_ctx(p);
5537         prog = gpu_prog_alloc(ctx, scop);
5538         if (!prog)
5539                 return isl_printer_free(p);
5540
5541         context = isl_set_copy(prog->context);
5542         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5543         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5544
5545         gen->prog = prog;
5546         gen->any_parallelism = 0;
5547         compute_schedule(gen);
5548
5549         if (!gen->any_parallelism) {
5550                 isl_set_free(context);
5551                 isl_set_free(guard);
5552                 p = print_cpu(p, scop, options);
5553         } else {
5554                 compute_copy_in_and_out(gen);
5555                 gen->tree = generate_host_code(gen);
5556                 p = ppcg_print_exposed_declarations(p, prog->scop);
5557                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5558                 isl_ast_node_free(gen->tree);
5559         }
5560
5561         isl_union_map_free(gen->sched);
5562
5563         gpu_prog_free(prog);
5564
5565         return p;
5566 }
5567
5568 /* Wrapper around generate for use as a ppcg_transform callback.
5569  */
5570 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5571         struct ppcg_scop *scop, void *user)
5572 {
5573         struct gpu_gen *gen = user;
5574
5575         return generate(p, gen, scop, gen->options);
5576 }
5577
5578 /* Transform the code in the file called "input" by replacing
5579  * all scops by corresponding GPU code and write the results to "out".
5580  */
5581 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5582         struct ppcg_options *options,
5583         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5584                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5585                 struct gpu_types *types, void *user), void *user)
5586 {
5587         struct gpu_gen gen;
5588         int r;
5589         int i;
5590
5591         gen.ctx = ctx;
5592         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5593         gen.options = options;
5594         gen.kernel_id = 0;
5595         gen.print = print;
5596         gen.print_user = user;
5597         gen.types.n = 0;
5598         gen.types.name = NULL;
5599
5600         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5601
5602         isl_union_map_free(gen.sizes);
5603         for (i = 0; i < gen.types.n; ++i)
5604                 free(gen.types.name[i]);
5605         free(gen.types.name);
5606
5607         return r;
5608 }
5609
5610 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5611 {
5612         struct gpu_prog *prog;
5613
5614         if (!scop)
5615                 return NULL;
5616
5617         prog = isl_calloc_type(ctx, struct gpu_prog);
5618         assert(prog);
5619
5620         prog->ctx = ctx;
5621         prog->scop = scop;
5622         prog->context = isl_set_copy(scop->context);
5623         prog->n_stmts = scop->n_stmt;
5624         prog->stmts = extract_stmts(ctx, scop, prog->context);
5625         prog->read = isl_union_map_copy(scop->reads);
5626         prog->may_write = isl_union_map_copy(scop->may_writes);
5627         prog->must_write = isl_union_map_copy(scop->must_writes);
5628         prog->to_inner = compute_to_inner(scop);
5629         prog->to_outer = isl_union_map_copy(prog->to_inner);
5630         prog->to_outer = isl_union_map_reverse(prog->to_outer);
5631
5632         if (!prog->stmts)
5633                 return gpu_prog_free(prog);
5634
5635         if (collect_array_info(prog) < 0)
5636                 return gpu_prog_free(prog);
5637
5638         return prog;
5639 }
5640
5641 void *gpu_prog_free(struct gpu_prog *prog)
5642 {
5643         if (!prog)
5644                 return NULL;
5645         free_array_info(prog);
5646         free_stmts(prog->stmts, prog->n_stmts);
5647         isl_union_map_free(prog->to_outer);
5648         isl_union_map_free(prog->to_inner);
5649         isl_union_set_free(prog->copy_in);
5650         isl_union_set_free(prog->copy_out);
5651         isl_union_map_free(prog->read);
5652         isl_union_map_free(prog->may_write);
5653         isl_union_map_free(prog->must_write);
5654         isl_set_free(prog->context);
5655         free(prog);
5656         return NULL;
5657 }