gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
  33 /* The fields stride, shift and shift_map only contain valid information
  34  * if shift != NULL.
  35  * If so, they express that current index is such that if you add shift,
  36  * then the result is always a multiple of stride.
  37  * shift_map contains the mapping
  38  *
  39  *      i -> (i + shift)/stride
  40  *
  41  * Let D represent the initial shared_len dimensions of the computed schedule.
  42  * The spaces of "lb" and "shift" are of the form
  43  *
  44  *      D -> [b]
  45  *
  46  * "shift_map" is of the form
  47  *
  48  *      [D -> i] -> [D -> (i + shift(D))/stride]
  49  */
  50 struct gpu_array_bound {
  51         isl_val *size;
  52         isl_aff *lb;
  53
  54         isl_val *stride;
  55         isl_aff *shift;
  56         isl_basic_map *shift_map;
  57 };
  58
  59 /* A tile of an array.
  60  *
  61  * n is the dimension of the array.
  62  * bound is an array of size "n" representing the lower bound
  63  *      and size for each index.
  64  *
  65  * tiling maps a tile in the global array to the corresponding
  66  * shared/private memory tile and is of the form
  67  *
  68  *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
  69  *
  70  * where D represents the initial shared_len dimensions
  71  * of the computed schedule.
  72  */
  73 struct gpu_array_tile {
  74         int n;
  75         struct gpu_array_bound *bound;
  76         isl_multi_aff *tiling;
  77 };
  78
  79 struct gpu_array_info;
  80
  81 /* A group of array references in a kernel that should be handled together.
  82  * If private_tile is not NULL, then it is mapped to registers.
  83  * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
  84  * Otherwise, it is accessed from global memory.
  85  */
  86 struct gpu_array_ref_group {
  87         /* The references in this group access this array. */
  88         struct gpu_array_info *array;
  89         /* Position of this group in the list of reference groups of array. */
  90         int nr;
  91
  92         /* The following fields are use during the construction of the groups.
  93          * access is the combined access relation relative to the shared
  94          * memory tiling.  In particular, the domain of the map corresponds
  95          * to the first shared_len dimensions of the computed schedule.
  96          * write is set if any access in the group is a write.
  97          * exact_write is set if all writes are definite writes.
  98          */
  99         isl_map *access;
 100         int write;
 101         int exact_write;
 102
 103         /* The shared memory tile, NULL if none. */
 104         struct gpu_array_tile *shared_tile;
 105
 106         /* The private memory tile, NULL if none. */
 107         struct gpu_array_tile *private_tile;
 108
 109         /* References in this group; point to elements of a linked list. */
 110         int n_ref;
 111         struct gpu_stmt_access **refs;
 112
 113         /* Last shared memory tile dimension that affects tile of this group. */
 114         int last_shared;
 115 };
 116
 117 struct gpu_gen {
 118         isl_ctx *ctx;
 119         struct ppcg_options *options;
 120
 121         /* Callback for printing of AST in appropriate format. */
 122         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
 123                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 124                 struct gpu_types *types, void *user);
 125         void *print_user;
 126
 127         struct gpu_prog *prog;
 128         /* The generated AST. */
 129         isl_ast_node *tree;
 130
 131         /* The sequence of types for which a definition has been printed. */
 132         struct gpu_types types;
 133
 134         /* tile, grid and block sizes for each kernel */
 135         isl_union_map *sizes;
 136
 137         /* Identifier of current kernel. */
 138         int kernel_id;
 139         /* Pointer to the current kernel. */
 140         struct ppcg_kernel *kernel;
 141         /* Does the computed schedule exhibit any parallelism? */
 142         int any_parallelism;
 143
 144         /* First tile dimension. */
 145         int tile_first;
 146         /* Number of tile dimensions. */
 147         int tile_len;
 148         /* Number of initial parallel loops among tile dimensions. */
 149         int n_parallel;
 150
 151         /* Number of dimensions determining shared memory. */
 152         int shared_len;
 153
 154         /* Number of rows in the untiled schedule. */
 155         int untiled_len;
 156         /* Number of rows in the tiled schedule. */
 157         int tiled_len;
 158         /* Number of rows in schedule after tiling/wrapping over threads. */
 159         int thread_tiled_len;
 160
 161         /* Global untiled schedule. */
 162         isl_union_map *sched;
 163         /* Local (per kernel launch) tiled schedule. */
 164         isl_union_map *tiled_sched;
 165         /* Local schedule per shared memory tile loop iteration. */
 166         isl_union_map *local_sched;
 167
 168         /* Local tiled schedule projected onto the shared tile loops and
 169          * the loops that will be wrapped over the threads,
 170          * with all shared tile loops parametrized.
 171          */
 172         isl_union_map *shared_sched;
 173         /* Projects out the loops that will be wrapped over the threads
 174          * from shared_sched.
 175          */
 176         isl_union_map *shared_proj;
 177
 178         /* A map that takes the range of shared_sched as input,
 179          * wraps the appropriate loops over the threads and then projects
 180          * out these loops.
 181          */
 182         isl_map *privatization;
 183
 184         /* A map from the shared memory tile loops and the thread indices
 185          * (as parameters) to the set of accessed memory elements that
 186          * will be accessed through private copies.
 187          */
 188         isl_union_map *private_access;
 189
 190         /* The schedule for the current private/shared access
 191          * (within print_private_access or print_shared_access).
 192          */
 193         isl_map *copy_sched;
 194         /* The array reference group corresponding to copy_sched. */
 195         struct gpu_array_ref_group *copy_group;
 196
 197         /* First loop to unroll (or -1 if none) in the current part of the
 198          * schedule.
 199          */
 200         int first_unroll;
 201
 202         int n_grid;
 203         int n_block;
 204         /* Note: in the input file, the sizes of the grid and the blocks
 205          * are specified in the order x, y, z, but internally, the sizes
 206          * are stored in reverse order, so that the last element always
 207          * refers to the x dimension.
 208          */
 209         int grid_dim[2];
 210         int block_dim[3];
 211         int *tile_size;
 212 };
 213
 214 /* Print the name of the local copy of a given group of array references.
 215  */
 216 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 217         struct gpu_array_ref_group *group)
 218 {
 219         int global = 0;
 220
 221         if (group->private_tile)
 222                 p = isl_printer_print_str(p, "private_");
 223         else if (group->shared_tile)
 224                 p = isl_printer_print_str(p, "shared_");
 225         else
 226                 global = 1;
 227         p = isl_printer_print_str(p, group->array->name);
 228         if (!global && group->array->n_group > 1) {
 229                 p = isl_printer_print_str(p, "_");
 230                 p = isl_printer_print_int(p, group->nr);
 231         }
 232
 233         return p;
 234 }
 235
 236 /* Collect all references to the given array and store pointers to them
 237  * in array->refs.
 238  *
 239  * If the array contains structures, then there is no need to collect
 240  * the references since we will not be computing any reference groups.
 241  */
 242 static void collect_references(struct gpu_prog *prog,
 243         struct gpu_array_info *array)
 244 {
 245         int i;
 246         int n;
 247
 248         if (array->has_compound_element)
 249                 return;
 250
 251         n = 0;
 252         for (i = 0; i < prog->n_stmts; ++i) {
 253                 struct gpu_stmt *stmt = &prog->stmts[i];
 254                 struct gpu_stmt_access *access;
 255
 256                 for (access = stmt->accesses; access; access = access->next) {
 257                         const char *name;
 258                         name = isl_map_get_tuple_name(access->access,
 259                                                       isl_dim_out);
 260                         if (name && !strcmp(array->name, name))
 261                                 n++;
 262                 }
 263         }
 264
 265         array->n_ref = n;
 266         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 267         assert(array->refs);
 268
 269         n = 0;
 270         for (i = 0; i < prog->n_stmts; ++i) {
 271                 struct gpu_stmt *stmt = &prog->stmts[i];
 272                 struct gpu_stmt_access *access;
 273
 274                 for (access = stmt->accesses; access; access = access->next) {
 275                         const char *name;
 276                         name = isl_map_get_tuple_name(access->access,
 277                                                       isl_dim_out);
 278                         if (!name || strcmp(array->name, name))
 279                                 continue;
 280
 281                         array->refs[n++] = access;
 282                 }
 283         }
 284 }
 285
 286 /* Create a gpu_array_tile for an array of dimension "n_index".
 287  */
 288 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 289 {
 290         int i;
 291         struct gpu_array_tile *tile;
 292
 293         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 294         assert(tile);
 295
 296         tile->n = n_index;
 297
 298         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 299         assert(tile->bound);
 300
 301         for (i = 0; i < n_index; ++i) {
 302                 tile->bound[i].size = NULL;
 303                 tile->bound[i].lb = NULL;
 304                 tile->bound[i].stride = NULL;
 305                 tile->bound[i].shift = NULL;
 306                 tile->bound[i].shift_map = NULL;
 307         }
 308
 309         return tile;
 310 }
 311
 312 static void *free_tile(struct gpu_array_tile *tile)
 313 {
 314         int j;
 315
 316         if (!tile)
 317                 return NULL;
 318
 319         for (j = 0; j < tile->n; ++j) {
 320                 isl_val_free(tile->bound[j].size);
 321                 isl_val_free(tile->bound[j].stride);
 322                 isl_aff_free(tile->bound[j].lb);
 323                 isl_aff_free(tile->bound[j].shift);
 324                 isl_basic_map_free(tile->bound[j].shift_map);
 325         }
 326         free(tile->bound);
 327         isl_multi_aff_free(tile->tiling);
 328         free(tile);
 329
 330         return NULL;
 331 }
 332
 333 static struct pet_array *find_array(struct ppcg_scop *scop,
 334         __isl_keep isl_set *accessed)
 335 {
 336         int i;
 337         isl_id *id;
 338
 339         id = isl_set_get_tuple_id(accessed);
 340
 341         for (i = 0; i < scop->n_array; ++i) {
 342                 isl_id *id_i;
 343
 344                 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
 345                 isl_id_free(id_i);
 346                 if (id == id_i)
 347                         break;
 348         }
 349         isl_id_free(id);
 350
 351         return i < scop->n_array ? scop->arrays[i] : NULL;
 352 }
 353
 354 /* Compute and return the extent of "array", taking into account the set of
 355  * accessed elements.
 356  *
 357  * In particular, the extent in the outer dimension is taken
 358  * from "accessed", while then extent in the remaing dimensions
 359  * are taken from array->extent.
 360  *
 361  * The extent in the outer dimension cannot be taken from array->extent
 362  * because that may be unbounded.  Furthermore, even if it is bounded,
 363  * it may be larger than the piece of the array that is being accessed.
 364  */
 365 static __isl_give isl_set *compute_extent(struct pet_array *array,
 366         __isl_keep isl_set *accessed)
 367 {
 368         int n_index;
 369         isl_id *id;
 370         isl_set *outer;
 371         isl_set *extent;
 372
 373         extent = isl_set_copy(array->extent);
 374
 375         n_index = isl_set_dim(accessed, isl_dim_set);
 376         if (n_index == 0)
 377                 return extent;
 378
 379         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 380         outer = isl_set_copy(accessed);
 381         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 382         extent = isl_set_flat_product(outer, extent);
 383         id = isl_set_get_tuple_id(accessed);
 384         extent = isl_set_set_tuple_id(extent, id);
 385
 386         return extent;
 387 }
 388
 389 /* Is the array "array" being extracted a read-only scalar?
 390  *
 391  * That is, is "array" a scalar that is never possibly written to.
 392  * An array containing structures is never considered to be a scalar.
 393  */
 394 static int is_read_only_scalar(struct gpu_array_info *array,
 395         struct gpu_prog *prog)
 396 {
 397         isl_set *space;
 398         isl_union_map *write;
 399         int empty;
 400
 401         if (array->has_compound_element)
 402                 return 0;
 403         if (array->n_index != 0)
 404                 return 0;
 405
 406         write = isl_union_map_copy(prog->may_write);
 407         space = isl_set_universe(isl_space_copy(array->space));
 408         write = isl_union_map_intersect_range(write,
 409                                                 isl_union_set_from_set(space));
 410         empty = isl_union_map_is_empty(write);
 411         isl_union_map_free(write);
 412
 413         return empty;
 414 }
 415
 416 /* Compute bounds on the host arrays based on the accessed elements
 417  * and collect all references to the array.
 418  *
 419  * If the array is zero-dimensional and does not contain structures,
 420  * i.e., if the array is a scalar, we check whether it is read-only.
 421  */
 422 static int extract_array_info(__isl_take isl_set *array, void *user)
 423 {
 424         int i;
 425         struct gpu_prog *prog = (struct gpu_prog *)user;
 426         const char *name;
 427         int n_index;
 428         isl_pw_aff **bounds;
 429         struct pet_array *pa;
 430         struct gpu_array_info *info;
 431         isl_set *extent;
 432
 433         info = &prog->array[prog->n_array];
 434         prog->n_array++;
 435
 436         n_index = isl_set_dim(array, isl_dim_set);
 437         name = isl_set_get_tuple_name(array);
 438         bounds = isl_alloc_array(isl_set_get_ctx(array),
 439                                  isl_pw_aff *, n_index);
 440         if (!bounds)
 441                 goto error;
 442
 443         info->space = isl_set_get_space(array);
 444         info->name = strdup(name);
 445         info->n_index = n_index;
 446         info->bound = bounds;
 447         info->linearize = prog->scop->options->linearize_device_arrays;
 448
 449         pa = find_array(prog->scop, array);
 450         if (!pa)
 451                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 452                         "unable to find array in scop", goto error);
 453
 454         info->type = strdup(pa->element_type);
 455         info->size = pa->element_size;
 456         info->local = pa->declared && !pa->exposed;
 457         info->has_compound_element = pa->element_is_record;
 458         info->read_only_scalar = is_read_only_scalar(info, prog);
 459
 460         extent = compute_extent(pa, array);
 461         for (i = 0; i < n_index; ++i) {
 462                 isl_set *dom;
 463                 isl_local_space *ls;
 464                 isl_aff *one;
 465                 isl_pw_aff *bound;
 466
 467                 bound = isl_set_dim_max(isl_set_copy(extent), i);
 468                 assert(bound);
 469                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 470                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 471                 one = isl_aff_zero_on_domain(ls);
 472                 one = isl_aff_add_constant_si(one, 1);
 473                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 474                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 475
 476                 bounds[i] = bound;
 477                 if (!isl_pw_aff_is_cst(bound))
 478                         info->linearize = 1;
 479         }
 480         info->extent = extent;
 481
 482         collect_references(prog, info);
 483
 484         isl_set_free(array);
 485         return 0;
 486 error:
 487         isl_set_free(array);
 488         return -1;
 489 }
 490
 491 /* Compute a mapping from all outer arrays (of structs) in scop
 492  * to their innermost arrays.
 493  *
 494  * In particular, for each array of a primitive type, the result
 495  * contains the identity mapping on that array.
 496  * For each array involving member accesses, the result
 497  * contains a mapping from the elements of the outer array of structs
 498  * to all corresponding elements of the innermost nested arrays.
 499  */
 500 static __isl_give isl_union_map *compute_to_inner(struct ppcg_scop *scop)
 501 {
 502         int i;
 503         isl_union_map *to_inner;
 504
 505         to_inner = isl_union_map_empty(isl_set_get_space(scop->context));
 506
 507         for (i = 0; i < scop->n_array; ++i) {
 508                 struct pet_array *array = scop->arrays[i];
 509                 isl_set *set;
 510                 isl_map *map;
 511
 512                 if (array->element_is_record)
 513                         continue;
 514
 515                 set = isl_set_copy(array->extent);
 516                 map = isl_set_identity(isl_set_copy(set));
 517
 518                 while (set && isl_set_is_wrapping(set)) {
 519                         isl_id *id;
 520                         isl_map *wrapped;
 521
 522                         id = isl_set_get_tuple_id(set);
 523                         wrapped = isl_set_unwrap(set);
 524                         wrapped = isl_map_domain_map(wrapped);
 525                         wrapped = isl_map_set_tuple_id(wrapped, isl_dim_in, id);
 526                         map = isl_map_apply_domain(map, wrapped);
 527                         set = isl_map_domain(isl_map_copy(map));
 528                 }
 529
 530                 map = isl_map_gist_domain(map, set);
 531
 532                 to_inner = isl_union_map_add_map(to_inner, map);
 533         }
 534
 535         return to_inner;
 536 }
 537
 538 /* Construct a gpu_array_info for each array possibly accessed by "prog" and
 539  * collect them in prog->array.
 540  *
 541  * If there are any member accesses involved, then they are first mapped
 542  * to the outer arrays of structs.
 543  */
 544 static int collect_array_info(struct gpu_prog *prog)
 545 {
 546         int r;
 547         isl_union_set *arrays;
 548
 549         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 550         arrays = isl_union_set_union(arrays,
 551                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 552
 553         arrays = isl_union_set_apply(arrays,
 554                                         isl_union_map_copy(prog->to_outer));
 555
 556         arrays = isl_union_set_coalesce(arrays);
 557
 558         prog->n_array = isl_union_set_n_set(arrays);
 559         prog->array = isl_calloc_array(prog->ctx,
 560                                      struct gpu_array_info, prog->n_array);
 561         assert(prog->array);
 562         prog->n_array = 0;
 563         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 564         isl_union_set_free(arrays);
 565
 566         return r;
 567 }
 568
 569 static void free_array_info(struct gpu_prog *prog)
 570 {
 571         int i, j;
 572
 573         for (i = 0; i < prog->n_array; ++i) {
 574                 int n_index = prog->array[i].n_index;
 575                 free(prog->array[i].type);
 576                 free(prog->array[i].name);
 577                 for (j = 0; j < n_index; ++j)
 578                         isl_pw_aff_free(prog->array[i].bound[j]);
 579                 isl_space_free(prog->array[i].space);
 580                 isl_set_free(prog->array[i].extent);
 581                 free(prog->array[i].bound);
 582                 free(prog->array[i].refs);
 583         }
 584         free(prog->array);
 585 }
 586
 587 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 588  * as an array or through a pointer reference, but as a single data element.
 589  * At the moment, scalars are represented as zero-dimensional arrays.
 590  * A zero-dimensional array containing structures is not considered
 591  * to be a scalar.
 592  */
 593 int gpu_array_is_scalar(struct gpu_array_info *array)
 594 {
 595         return !array->has_compound_element && array->n_index == 0;
 596 }
 597
 598 /* Is "array" a read-only scalar?
 599  */
 600 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 601 {
 602         return array->read_only_scalar;
 603 }
 604
 605 /* Internal data structure for extract_size_of_type.
 606  * "type" specifies the name of the space that we want to extract.
 607  * "res" is used to store the subset of that space.
 608  */
 609 struct ppcg_extract_size_data {
 610         const char *type;
 611         isl_set *res;
 612 };
 613
 614 /* This function is called for each set in a union_set.
 615  * If the name of the set matches data->type, we store the
 616  * set in data->res.
 617  */
 618 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 619 {
 620         struct ppcg_extract_size_data *data = user;
 621         const char *name;
 622
 623         name = isl_set_get_tuple_name(size);
 624         if (name && !strcmp(name, data->type)) {
 625                 data->res = size;
 626                 return -1;
 627         }
 628
 629         isl_set_free(size);
 630         return 0;
 631 }
 632
 633 /* Given a union map { kernel[i] -> *[...] },
 634  * return the range in the space called "type" for the kernel with
 635  * sequence number "id".
 636  */
 637 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 638         const char *type, int id)
 639 {
 640         isl_space *space;
 641         isl_set *dom;
 642         isl_union_set *local_sizes;
 643         struct ppcg_extract_size_data data = { type, NULL };
 644
 645         if (!sizes)
 646                 return NULL;
 647
 648         space = isl_union_map_get_space(sizes);
 649         space = isl_space_set_from_params(space);
 650         space = isl_space_add_dims(space, isl_dim_set, 1);
 651         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 652         dom = isl_set_universe(space);
 653         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 654
 655         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 656                                         isl_union_map_copy(sizes));
 657         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 658         isl_union_set_free(local_sizes);
 659         return data.res;
 660 }
 661
 662 /* Given a singleton set, extract the first (at most *len) elements
 663  * of the single integer tuple into *sizes and update *len if needed.
 664  */
 665 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 666 {
 667         int i;
 668         int dim;
 669
 670         if (!set)
 671                 return;
 672
 673         dim = isl_set_dim(set, isl_dim_set);
 674         if (dim < *len)
 675                 *len = dim;
 676
 677         for (i = 0; i < *len; ++i) {
 678                 isl_val *v;
 679
 680                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 681                 assert(v);
 682
 683                 sizes[i] = isl_val_get_num_si(v);
 684                 isl_val_free(v);
 685         }
 686
 687         isl_set_free(set);
 688 }
 689
 690 /* Extract user specified "tile" sizes from the "sizes" command line option,
 691  * defaulting to option->tile_size in each dimension.
 692  */
 693 static void read_tile_sizes(struct gpu_gen *gen)
 694 {
 695         int n;
 696         isl_set *size;
 697
 698         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 699         assert(gen->tile_size);
 700         for (n = 0; n < gen->tile_len; ++n)
 701                 gen->tile_size[n] = gen->options->tile_size;
 702
 703         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 704         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 705
 706         if (gen->n_parallel > gen->tile_len)
 707                 gen->n_parallel = gen->tile_len;
 708 }
 709
 710 /* Extract user specified "block" sizes from the "sizes" command line option,
 711  * after filling in some potentially useful defaults.
 712  */
 713 static void read_block_sizes(struct gpu_gen *gen)
 714 {
 715         int n;
 716         isl_set *size;
 717
 718         n = gen->n_parallel;
 719         gen->n_block = (n <= 3) ? n : 3;
 720         switch (gen->n_block) {
 721         case 1:
 722                 gen->block_dim[0] = 512;
 723                 break;
 724         case 2:
 725                 gen->block_dim[0] = 32;
 726                 gen->block_dim[1] = 16;
 727                 break;
 728         default:
 729                 gen->block_dim[0] = 32;
 730                 gen->block_dim[1] = 4;
 731                 gen->block_dim[2] = 4;
 732                 break;
 733         }
 734
 735         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 736         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 737 }
 738
 739 /* Extract user specified "grid" sizes from the "sizes" command line option,
 740  * after filling in some potentially useful defaults.
 741  */
 742 static void read_grid_sizes(struct gpu_gen *gen)
 743 {
 744         int n = gen->n_parallel;
 745         isl_set *size;
 746
 747         gen->n_grid = (n <= 2) ? n : 2;
 748         switch (gen->n_grid) {
 749         case 1:
 750                 gen->grid_dim[0] = 32768;
 751                 break;
 752         default:
 753                 gen->grid_dim[0] = 256;
 754                 gen->grid_dim[1] = 256;
 755                 break;
 756         }
 757
 758         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 759         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 760 }
 761
 762 /* Extract user specified sizes from the "sizes" command line option
 763  * after filling in some potentially useful defaults.
 764  */
 765 static void read_sizes(struct gpu_gen *gen)
 766 {
 767         read_tile_sizes(gen);
 768         read_block_sizes(gen);
 769         read_grid_sizes(gen);
 770 }
 771
 772 static void *free_stmts(struct gpu_stmt *stmts, int n)
 773 {
 774         int i;
 775
 776         if (!stmts)
 777                 return NULL;
 778
 779         for (i = 0; i < n; ++i) {
 780                 struct gpu_stmt_access *access, *next;
 781
 782                 for (access = stmts[i].accesses; access; access = next) {
 783                         next = access->next;
 784                         isl_id_free(access->ref_id);
 785                         isl_map_free(access->access);
 786                         isl_map_free(access->tagged_access);
 787                         free(access);
 788                 }
 789
 790                 isl_id_free(stmts[i].id);
 791         }
 792         free(stmts);
 793
 794         return NULL;
 795 }
 796
 797 /* Construct a map from a domain of dimensionality "len"
 798  * to a domain of dimensionality "len" + "tile_len" that tiles
 799  * the "tile_len" coordinates starting at "first".
 800  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 801  * "dim" prescribes the parameters.
 802  */
 803 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 804         int first, int tile_len, int *tile_size)
 805 {
 806         int i;
 807         isl_basic_map *bmap;
 808         isl_constraint *c;
 809         isl_local_space *ls;
 810
 811         dim = isl_space_add_dims(dim, isl_dim_in, len);
 812         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 813         bmap = isl_basic_map_universe(isl_space_copy(dim));
 814         ls = isl_local_space_from_space(dim);
 815
 816         for (i = 0; i < len - tile_len; ++i) {
 817                 int j = i < first ? i : i + tile_len;
 818                 int k = i < first ? i : i + 2 * tile_len;
 819
 820                 c = isl_equality_alloc(isl_local_space_copy(ls));
 821                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 822                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 823                 bmap = isl_basic_map_add_constraint(bmap, c);
 824         }
 825
 826         for (i = 0; i < tile_len; ++i) {
 827                 c = isl_equality_alloc(isl_local_space_copy(ls));
 828                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 829                                                 first + i, -1);
 830                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 831                                                 first + i, tile_size[i]);
 832                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 833                                                 first + i + tile_len, 1);
 834                 bmap = isl_basic_map_add_constraint(bmap, c);
 835
 836                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 837                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 838                                                    first + i + tile_len, 1);
 839                 bmap = isl_basic_map_add_constraint(bmap, c);
 840
 841                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 842                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 843                                                    first + i + tile_len, -1);
 844                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 845                 bmap = isl_basic_map_add_constraint(bmap, c);
 846         }
 847
 848         isl_local_space_free(ls);
 849
 850         return isl_map_from_basic_map(bmap);
 851 }
 852
 853 /* Construct a map from a domain of dimensionality "len"
 854  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 855  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 856  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 857  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 858  * that are projected out at the end.
 859  * "dim" prescribes the parameters.
 860  */
 861 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 862         int first, int wrap_len, int *wrap_size)
 863 {
 864         int i;
 865         isl_basic_map *bmap;
 866         isl_constraint *c;
 867         isl_local_space *ls;
 868
 869         dim = isl_space_add_dims(dim, isl_dim_in, len);
 870         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 871         bmap = isl_basic_map_universe(isl_space_copy(dim));
 872         ls = isl_local_space_from_space(dim);
 873
 874         for (i = 0; i < len; ++i) {
 875                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 876
 877                 c = isl_equality_alloc(isl_local_space_copy(ls));
 878                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 879                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 880                 bmap = isl_basic_map_add_constraint(bmap, c);
 881         }
 882
 883         for (i = 0; i < wrap_len; ++i) {
 884                 c = isl_equality_alloc(isl_local_space_copy(ls));
 885                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 886                                                     first + i, -1);
 887                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 888                                                     first + wrap_len + i, 1);
 889                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 890                                     first + 2 * wrap_len + i, wrap_size[i]);
 891                 bmap = isl_basic_map_add_constraint(bmap, c);
 892
 893                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 894                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 895                                                     first + wrap_len + i, 1);
 896                 bmap = isl_basic_map_add_constraint(bmap, c);
 897
 898                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 899                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 900                                                     first + wrap_len + i, -1);
 901                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 902                 bmap = isl_basic_map_add_constraint(bmap, c);
 903         }
 904
 905         isl_local_space_free(ls);
 906
 907         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 908                                 first + 2 * wrap_len, wrap_len);
 909
 910         return isl_map_from_basic_map(bmap);
 911 }
 912
 913 /* Add "n" parameters named prefix%d.
 914  */
 915 static __isl_give isl_set *add_params( __isl_take isl_set *set,
 916         int n, const char *prefix)
 917 {
 918         int i;
 919         unsigned nparam;
 920         char name[20];
 921
 922         nparam = isl_set_dim(set, isl_dim_param);
 923         set = isl_set_add_dims(set, isl_dim_param, n);
 924
 925         for (i = 0; i < n; ++i) {
 926                 snprintf(name, sizeof(name), "%s%d", prefix, i);
 927                 set = isl_set_set_dim_name(set, isl_dim_param,
 928                                             nparam + i, name);
 929         }
 930
 931         return set;
 932 }
 933
 934 /* Equate the "n" dimensions of "set" starting at "first" to
 935  * freshly created parameters named prefix%d.
 936  */
 937 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
 938         int first, int n, const char *prefix)
 939 {
 940         int i;
 941         unsigned nparam;
 942
 943         nparam = isl_set_dim(set, isl_dim_param);
 944
 945         set = add_params(set, n, prefix);
 946
 947         for (i = 0; i < n; ++i)
 948                 set = isl_set_equate(set, isl_dim_param, nparam + i,
 949                                         isl_dim_set, first + i);
 950
 951         return set;
 952 }
 953
 954 /* Given a parameter space "space", create a set of dimension "len"
 955  * of which the "n" dimensions starting at "first" are equated to
 956  * freshly created parameters named prefix%d.
 957  */
 958 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
 959         int len, int first, int n, const char *prefix)
 960 {
 961         isl_set *set;
 962
 963         space = isl_space_set_from_params(space);
 964         space = isl_space_add_dims(space, isl_dim_set, len);
 965         set = isl_set_universe(space);
 966
 967         return parametrize(set, first, n, prefix);
 968 }
 969
 970 /* Tile the B loops over the tile sizes and then tile/wrap
 971  * the T1 loops over the blocks.
 972  */
 973 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 974         __isl_take isl_union_map *sched)
 975 {
 976         isl_space *dim;
 977         isl_map *tiling, *block_tiling;
 978
 979         dim = isl_union_map_get_space(sched);
 980         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 981                       gen->tile_first, gen->tile_len, gen->tile_size);
 982
 983         if (gen->options->wrap)
 984                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
 985                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 986         else
 987                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
 988                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 989
 990         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
 991
 992         tiling = isl_map_apply_range(tiling, block_tiling);
 993
 994         sched = isl_union_map_apply_range(sched,
 995                                              isl_union_map_from_map(tiling));
 996
 997         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 998
 999         return sched;
1000 }
1001
1002 /* Equate the "T1P" iterators in the tiled schedule "sched"
1003  * to the block dimensions.
1004  */
1005 static __isl_give isl_union_map *parametrize_tiled_schedule(
1006         struct gpu_gen *gen, __isl_take isl_union_map *sched)
1007 {
1008         isl_space *dim;
1009         isl_set *par;
1010
1011         dim = isl_union_map_get_space(sched);
1012         par = parametrization(dim, gen->tiled_len,
1013                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1014         sched = isl_union_map_intersect_range(sched,
1015                                                 isl_union_set_from_set(par));
1016
1017         return sched;
1018 }
1019
1020 /* Tile/wrap the P1 loops over the threads.
1021  */
1022 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
1023         __isl_take isl_union_map *sched)
1024 {
1025         isl_space *dim;
1026         isl_map *tiling;
1027         isl_set *par;
1028
1029         dim = isl_union_map_get_space(sched);
1030
1031         if (gen->options->wrap)
1032                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1033                                 gen->shared_len, gen->n_block, gen->block_dim);
1034         else
1035                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1036                                 gen->shared_len, gen->n_block, gen->block_dim);
1037         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1038
1039         sched = isl_union_map_apply_range(sched,
1040                                              isl_union_map_from_map(tiling));
1041
1042         par = parametrization(dim, gen->thread_tiled_len,
1043                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1044                 gen->n_block, "t");
1045         sched = isl_union_map_intersect_range(sched,
1046                                                 isl_union_set_from_set(par));
1047
1048         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1049
1050         return sched;
1051 }
1052
1053 /* If the user asked for it, scale the shared memory tile loops
1054  * (T1T and T2) of "sched" by gen->tile_size[i].
1055  * If we are not performing "wrapping", then additionally scale the T1P
1056  * loops by gen->grid_dim[i].
1057  */
1058 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
1059         __isl_take isl_union_map *sched)
1060 {
1061         int i;
1062         isl_space *dim;
1063         isl_basic_map *scale;
1064         isl_constraint *c;
1065         isl_local_space *ls;
1066
1067         if (!gen->options->scale_tile_loops)
1068                 return sched;
1069
1070         dim = isl_union_map_get_space(sched);
1071         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1072         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1073         scale = isl_basic_map_universe(isl_space_copy(dim));
1074         ls = isl_local_space_from_space(dim);
1075
1076         for (i = 0; i < gen->tiled_len; ++i) {
1077                 int f = 1;
1078
1079                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1080                         f = gen->tile_size[i - gen->tile_first];
1081                         if (!gen->options->wrap)
1082                                 f *= gen->grid_dim[i - gen->tile_first];
1083                 } else if (i >= gen->tile_first + gen->n_grid &&
1084                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1085                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1086                 }
1087
1088                 c = isl_equality_alloc(isl_local_space_copy(ls));
1089                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1090                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1091                 scale = isl_basic_map_add_constraint(scale, c);
1092         }
1093
1094         isl_local_space_free(ls);
1095
1096         sched = isl_union_map_apply_range(sched,
1097                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1098
1099         return sched;
1100 }
1101
1102 /* If we are not performing "wrapping" and if the user asked for it,
1103  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1104  */
1105 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1106         __isl_take isl_union_map *sched)
1107 {
1108         int i;
1109         isl_space *dim;
1110         isl_basic_map *scale;
1111         isl_constraint *c;
1112         isl_local_space *ls;
1113
1114         if (gen->options->wrap)
1115                 return sched;
1116         if (!gen->options->scale_tile_loops)
1117                 return sched;
1118
1119         dim = isl_union_map_get_space(sched);
1120         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1121         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1122         scale = isl_basic_map_universe(isl_space_copy(dim));
1123         ls = isl_local_space_from_space(dim);
1124
1125         for (i = 0; i < gen->thread_tiled_len; ++i) {
1126                 int f = 1;
1127
1128                 if (i >= gen->shared_len &&
1129                     i < gen->shared_len + gen->n_block)
1130                         f = gen->block_dim[i - gen->shared_len];
1131
1132                 c = isl_equality_alloc(isl_local_space_copy(ls));
1133                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1134                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1135                 scale = isl_basic_map_add_constraint(scale, c);
1136         }
1137
1138         isl_local_space_free(ls);
1139
1140         sched = isl_union_map_apply_range(sched,
1141                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1142
1143         return sched;
1144 }
1145
1146 /* If we are not performing "wrapping" and if the user asked for it,
1147  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1148  */
1149 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1150         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1151 {
1152         int i;
1153         isl_space *dim;
1154         isl_basic_map *scale;
1155         isl_constraint *c;
1156         isl_local_space *ls;
1157
1158         if (gen->options->wrap)
1159                 return sched;
1160         if (!gen->options->scale_tile_loops)
1161                 return sched;
1162
1163         dim = isl_union_map_get_space(sched);
1164         dim = isl_space_add_dims(dim, isl_dim_in, len);
1165         dim = isl_space_add_dims(dim, isl_dim_out, len);
1166         scale = isl_basic_map_universe(isl_space_copy(dim));
1167         ls = isl_local_space_from_space(dim);
1168
1169         for (i = 0; i < len; ++i) {
1170                 int f = 1;
1171
1172                 if (i >= first && i < first + n_tile)
1173                         f = gen->kernel->block_dim[i - first];
1174
1175                 c = isl_equality_alloc(isl_local_space_copy(ls));
1176                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1177                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1178                 scale = isl_basic_map_add_constraint(scale, c);
1179         }
1180
1181         isl_local_space_free(ls);
1182
1183         sched = isl_union_map_apply_range(sched,
1184                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1185
1186         return sched;
1187 }
1188
1189 /* Add "len" parameters p[i] called prefix%d,
1190  * with bounds to 0 <= p[i] < size[i].
1191  */
1192 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1193         int len, int *size, const char *prefix)
1194 {
1195         int i;
1196         unsigned nparam;
1197         isl_space *dim;
1198         isl_basic_set *bset;
1199         isl_constraint *c;
1200         isl_local_space *ls;
1201         char name[20];
1202
1203         nparam = isl_set_dim(set, isl_dim_param);
1204         set = isl_set_add_dims(set, isl_dim_param, len);
1205
1206         for (i = 0; i < len; ++i) {
1207                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1208                 set = isl_set_set_dim_name(set, isl_dim_param,
1209                                             nparam + i, name);
1210         }
1211
1212         dim = isl_set_get_space(set);
1213         bset = isl_basic_set_universe(isl_space_copy(dim));
1214         ls = isl_local_space_from_space(dim);
1215
1216         for (i = 0; i < len; ++i) {
1217                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1218                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1219                                                         nparam + i, 1);
1220                 bset = isl_basic_set_add_constraint(bset, c);
1221
1222                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1223                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1224                                                         nparam + i, -1);
1225                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1226                 bset = isl_basic_set_add_constraint(bset, c);
1227         }
1228
1229         isl_local_space_free(ls);
1230
1231         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1232 }
1233
1234 /* Add "len" parameters p[i] called prefix%d,
1235  * with bounds to 0 <= p[i] < size[i].
1236  */
1237 static __isl_give isl_set *add_bounded_parameters_dynamic(
1238         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1239         const char *prefix)
1240 {
1241         int i, len;
1242         unsigned nparam;
1243         isl_space *space;
1244         isl_local_space *ls;
1245         char name[20];
1246
1247         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1248         nparam = isl_set_dim(set, isl_dim_param);
1249         set = isl_set_add_dims(set, isl_dim_param, len);
1250
1251         for (i = 0; i < len; ++i) {
1252                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1253                 set = isl_set_set_dim_name(set, isl_dim_param,
1254                                             nparam + i, name);
1255         }
1256
1257         space = isl_space_params(isl_set_get_space(set));
1258         ls = isl_local_space_from_space(space);
1259         for (i = 0; i < len; ++i) {
1260                 isl_pw_aff *param, *size_i, *zero;
1261                 isl_set *bound;
1262
1263                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1264                                                 isl_dim_param, nparam + i);
1265
1266                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1267                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1268                 set = isl_set_intersect_params(set, bound);
1269
1270                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1271                 bound = isl_pw_aff_ge_set(param, zero);
1272                 set = isl_set_intersect_params(set, bound);
1273         }
1274         isl_local_space_free(ls);
1275
1276         return set;
1277 }
1278
1279 /* Construct a map from an access to group->array to the corresponding
1280  * shared/private memory tile.
1281  * The map is of the form
1282  *
1283  *      { [D[i] -> A[a]] -> T[t] }
1284  *
1285  * where D represents the initial shared_len dimensions
1286  * of the computed schedule.
1287  */
1288 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1289 {
1290         struct gpu_array_tile *tile;
1291         isl_multi_aff *tiling;
1292
1293         tile = group->private_tile;
1294         if (!tile)
1295                 tile = group->shared_tile;
1296
1297         tiling = isl_multi_aff_copy(tile->tiling);
1298
1299         return isl_map_from_multi_aff(tiling);
1300 }
1301
1302 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1303  */
1304 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1305         unsigned pos)
1306 {
1307         isl_val *v;
1308         int fixed;
1309
1310         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1311         if (!v)
1312                 return -1;
1313         fixed = isl_val_is_int(v);
1314         isl_val_free(v);
1315
1316         return fixed;
1317 }
1318
1319 /* Given a schedule that iterates over all elements in a piece of an array,
1320  * perform tiling/wrapping over the threads.
1321  *
1322  * In particular, we tile the final iterators so that the final thread
1323  * dimension runs over the final array dimension.
1324  * However, if those final iterators have only a single iteration,
1325  * we try to tile earlier iterators instead.
1326  */
1327 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1328         __isl_take isl_map *sched)
1329 {
1330         isl_space *dim;
1331         isl_union_map *usched;
1332         isl_map *tiling;
1333         isl_set *par;
1334         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1335         int n_tile;
1336         int first;
1337
1338         n_tile = gen->kernel->n_block;
1339         if (n_tile > nvar) {
1340                 int i;
1341                 sched = isl_map_insert_dims(sched,
1342                                                 isl_dim_out, 0, n_tile - nvar);
1343                 for (i = 0; i < n_tile - nvar; ++i)
1344                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1345                 nvar = n_tile;
1346         }
1347
1348         first = nvar - n_tile;
1349
1350         for (; first > 0; first --)
1351                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1352                         break;
1353
1354         dim = isl_map_get_space(sched);
1355         dim = isl_space_params(dim);
1356         if (gen->options->wrap)
1357                 tiling = wrap(isl_space_copy(dim), nvar, first,
1358                                 n_tile, gen->kernel->block_dim);
1359         else
1360                 tiling = tile(isl_space_copy(dim), nvar, first,
1361                                 n_tile, gen->kernel->block_dim);
1362         sched = isl_map_apply_range(sched, tiling);
1363
1364         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1365         sched = isl_map_intersect_range(sched, par);
1366
1367         usched = isl_union_map_from_map(sched);
1368         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1369                                          first, n_tile);
1370         sched = isl_map_from_union_map(usched);
1371
1372         return sched;
1373 }
1374
1375 /* Return the union of all read (read = 1) and/or write (write = 1)
1376  * access relations in the group.
1377  */
1378 static __isl_give isl_union_map *group_access_relation(
1379         struct gpu_array_ref_group *group, int read, int write)
1380 {
1381         int i;
1382         isl_union_map *access;
1383
1384         access = isl_union_map_empty(isl_map_get_space(group->access));
1385         for (i = 0; i < group->n_ref; ++i) {
1386                 isl_map *map_i;
1387
1388                 if (!((read && group->refs[i]->read) ||
1389                      (write && group->refs[i]->write)))
1390                         continue;
1391                 map_i = isl_map_copy(group->refs[i]->access);
1392                 access = isl_union_map_union(access,
1393                                             isl_union_map_from_map(map_i));
1394         }
1395
1396         return access;
1397 }
1398
1399 /* Return the union of all tagged access relations in the group.
1400  */
1401 static __isl_give isl_union_map *group_tagged_access_relation(
1402         struct gpu_array_ref_group *group)
1403 {
1404         int i;
1405         isl_union_map *access;
1406
1407         access = isl_union_map_empty(isl_map_get_space(group->access));
1408         for (i = 0; i < group->n_ref; ++i) {
1409                 isl_map *map_i;
1410
1411                 map_i = isl_map_copy(group->refs[i]->tagged_access);
1412                 access = isl_union_map_union(access,
1413                                             isl_union_map_from_map(map_i));
1414         }
1415
1416         return access;
1417 }
1418
1419 /* Return the extent of "array", recomputed from the bounds.
1420  * The recomputed extent may be simpler than the original extent.
1421  */
1422 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1423 {
1424         int i;
1425         isl_id *id;
1426         isl_space *space;
1427         isl_local_space *ls;
1428         isl_set *extent;
1429
1430         id = isl_set_get_tuple_id(array->extent);
1431         space = isl_set_get_space(array->extent);
1432         extent = isl_set_universe(isl_space_copy(space));
1433         ls = isl_local_space_from_space(space);
1434         for (i = 0; i < array->n_index; ++i) {
1435                 isl_pw_aff *bound;
1436                 isl_aff *aff;
1437                 isl_pw_aff *index;
1438                 isl_set *lt;
1439
1440                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1441
1442                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1443                                                 isl_dim_set, i);
1444                 index = isl_pw_aff_from_aff(aff);
1445                 bound = isl_pw_aff_copy(array->bound[i]);
1446                 bound = isl_pw_aff_from_range(bound);
1447                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1448                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1449                                                 isl_id_copy(id));
1450                 lt = isl_pw_aff_lt_set(index, bound);
1451                 extent = isl_set_intersect(extent, lt);
1452         }
1453         isl_local_space_free(ls);
1454         isl_id_free(id);
1455
1456         return extent;
1457 }
1458
1459 /* Return a map from the first shared_len dimensions of the computed
1460  * schedule to the array tile in
1461  * global memory that corresponds to the shared memory copy.
1462  *
1463  * In particular, return a map
1464  *
1465  *      { D[i] -> A[a] }
1466  *
1467  * with constraints
1468  *
1469  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1470  *
1471  * and
1472  *
1473  *      0 <= a <= array_size - 1                                        (2)
1474  *
1475  * Note that if some stride has been detected (i.e., when
1476  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1477  * to the shifted and scaled down version.
1478  *
1479  * Constraints (1) are obtained by mapping the size constraints on the
1480  * shared/private memory tile back to the access relation.
1481  * Constraints (2) are obtained from the (recomputed) extent.
1482  */
1483 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1484 {
1485         int i;
1486         int n_index = group->array->n_index;
1487         isl_map *tile;
1488         isl_space *space;
1489         isl_set *local;
1490         isl_set *extent;
1491
1492         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1493         space = isl_space_range(space);
1494         local = isl_set_universe(space);
1495         for (i = 0; i < n_index; ++i) {
1496                 isl_val *bound;
1497
1498                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1499                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1500                 bound = isl_val_sub_ui(bound, 1);
1501                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1502         }
1503         local = isl_set_preimage_multi_aff(local,
1504                                 isl_multi_aff_copy(group->shared_tile->tiling));
1505         tile = isl_set_unwrap(local);
1506         extent = array_extent(group->array);
1507         tile = isl_map_intersect_range(tile, extent);
1508
1509         return tile;
1510 }
1511
1512 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1513  * return the corresponding mapping from the AST schedule to
1514  * to the first shared_len dimensions of the schedule computed by PPCG.
1515  */
1516 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1517         __isl_take isl_pw_multi_aff *iterator_map)
1518 {
1519         isl_union_map *umap;
1520         isl_space *space;
1521         isl_map *map, *sched;;
1522
1523         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1524         space = isl_space_from_domain(space);
1525         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1526
1527         umap = isl_union_map_copy(gen->shared_sched);
1528         umap = isl_union_map_apply_range(umap,
1529                         isl_union_map_copy(gen->shared_proj));
1530         map = isl_union_map_extract_map(umap, space);
1531         isl_union_map_free(umap);
1532
1533         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1534         sched = isl_map_detect_equalities(sched);
1535
1536         return isl_pw_multi_aff_from_map(sched);
1537 }
1538
1539 /* Set unroll[j] if the input dimension j is involved in
1540  * the index expression represented by ma.
1541  */
1542 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1543         void *user)
1544 {
1545         int i, j;
1546         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1547         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1548         int *unroll = user;
1549
1550         for (i = 0; i < n_out; ++i) {
1551                 isl_aff *aff;
1552
1553                 aff = isl_multi_aff_get_aff(ma, i);
1554                 for (j = 0; j < n_in; ++j)
1555                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1556                                 unroll[j] = 1;
1557                 isl_aff_free(aff);
1558         }
1559
1560         isl_set_free(set);
1561         isl_multi_aff_free(ma);
1562         return 0;
1563 }
1564
1565 /* Given an array pos mapping input dimensions to the corresponding
1566  * output dimension, construct the corresponding map.
1567  */
1568 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1569         int *pos, int len)
1570 {
1571         int i;
1572         isl_constraint *c;
1573         isl_basic_map *bmap;
1574         isl_local_space *ls;
1575
1576         dim = isl_space_add_dims(dim, isl_dim_in, len);
1577         dim = isl_space_add_dims(dim, isl_dim_out, len);
1578         bmap = isl_basic_map_universe(isl_space_copy(dim));
1579         ls = isl_local_space_from_space(dim);
1580
1581         for (i = 0; i < len; ++i) {
1582                 c = isl_equality_alloc(isl_local_space_copy(ls));
1583                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1584                                                       -1);
1585                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1586                                                       1);
1587                 bmap = isl_basic_map_add_constraint(bmap, c);
1588         }
1589         isl_local_space_free(ls);
1590
1591         return isl_map_from_basic_map(bmap);
1592 }
1593
1594 /* Find all loops involved in any of the index expressions for any of
1595  * the private accesses, move them innermost and then mark them as
1596  * requiring unrolling by setting gen->first_unroll.
1597  * The loops involved should all be parallel because of the checks
1598  * we performed in check_private_group_access.  Moving them innermost
1599  * is therefore a valid transformation.
1600  *
1601  * Loops up to gen->shared_len are generated before the mapping to
1602  * threads is applied.  They should therefore be ignored.
1603  *
1604  * We compute the hidden equalities of the schedule first
1605  * since we will need them in our calls to isl_pw_multi_aff_from_map
1606  * and because we want to make sure that the same equalities
1607  * are also available to the code generator.
1608  */
1609 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1610         __isl_take isl_union_map *sched)
1611 {
1612         int i, j;
1613         int unroll[gen->thread_tiled_len];
1614         int perm[gen->thread_tiled_len];
1615         isl_space *dim;
1616         isl_map *permute;
1617         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1618
1619         gen->first_unroll = -1;
1620
1621         sched = isl_union_map_detect_equalities(sched);
1622         for (i = 0; i < gen->thread_tiled_len; ++i)
1623                 unroll[i] = 0;
1624         for (i = 0; i < gen->prog->n_array; ++i) {
1625                 struct gpu_array_info *array = &gen->prog->array[i];
1626
1627                 for (j = 0; j < array->n_group; ++j) {
1628                         isl_union_map *access;
1629                         isl_map *acc;
1630                         isl_pw_multi_aff *pma;
1631
1632                         if (!array->groups[j]->private_tile)
1633                                 continue;
1634
1635                         access = group_access_relation(array->groups[j], 1, 1);
1636                         access = isl_union_map_apply_domain(access,
1637                                                 isl_union_map_copy(sched));
1638
1639                         acc = isl_map_from_union_map(access);
1640                         pma = isl_pw_multi_aff_from_map(acc);
1641                         isl_pw_multi_aff_foreach_piece(pma,
1642                                                         &check_unroll, unroll);
1643
1644                         isl_pw_multi_aff_free(pma);
1645                 }
1646         }
1647
1648         for (i = gen->shared_len; i < len; ++i)
1649                 if (unroll[i])
1650                         break;
1651
1652         if (i >= len)
1653                 return sched;
1654
1655         for (i = len; i < gen->thread_tiled_len; ++i)
1656                 if (unroll[i])
1657                         return sched;
1658
1659         j = 0;
1660         for (i = 0; i < gen->shared_len; ++i)
1661                 perm[i] = j++;
1662         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1663                 if (!unroll[i])
1664                         perm[i] = j++;
1665         gen->first_unroll = j - gen->shared_len;
1666         for (i = gen->shared_len; i < len; ++i)
1667                 if (unroll[i])
1668                         perm[i] = j++;
1669
1670         dim = isl_union_map_get_space(sched);
1671         permute = permutation(dim, perm, gen->thread_tiled_len);
1672         sched = isl_union_map_apply_range(sched,
1673                                           isl_union_map_from_map(permute));
1674
1675         return sched;
1676 }
1677
1678 /* Given a constraint
1679  *
1680  *              a(p,i) + j = g f(e)
1681  *
1682  * or -a(p,i) - j = g f(e) if sign < 0,
1683  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1684  * a(p,i) is assumed to be an expression in only the parameters
1685  * and the input dimensions.
1686  */
1687 static void extract_stride(__isl_keep isl_constraint *c,
1688         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1689 {
1690         int i;
1691         isl_val *v;
1692         isl_space *space;
1693         unsigned nparam;
1694         unsigned nvar;
1695         isl_aff *aff;
1696
1697         isl_val_free(bound->stride);
1698         bound->stride = isl_val_copy(stride);
1699
1700         space = isl_constraint_get_space(c);
1701         space = isl_space_domain(space);
1702
1703         nparam = isl_space_dim(space, isl_dim_param);
1704         nvar = isl_space_dim(space, isl_dim_set);
1705
1706         v = isl_constraint_get_constant_val(c);
1707         if (sign < 0)
1708                 v = isl_val_neg(v);
1709         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1710         aff = isl_aff_set_constant_val(aff, v);
1711
1712         for (i = 0; i < nparam; ++i) {
1713                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1714                         continue;
1715                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1716                 if (sign < 0)
1717                         v = isl_val_neg(v);
1718                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1719         }
1720
1721         for (i = 0; i < nvar; ++i) {
1722                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1723                         continue;
1724                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1725                 if (sign < 0)
1726                         v = isl_val_neg(v);
1727                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1728         }
1729
1730         bound->shift = aff;
1731 }
1732
1733 /* Given an equality constraint of a map with a single output dimension j,
1734  * check if the constraint is of the form
1735  *
1736  *              a(p,i) + j = g f(e)
1737  *
1738  * with a(p,i) an expression in the parameters and input dimensions
1739  * and f(e) an expression in the existentially quantified variables.
1740  * If so, and if g is larger than any such g from a previously considered
1741  * constraint, then call extract_stride to record the stride information
1742  * in bound.
1743  */
1744 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1745 {
1746         int i;
1747         isl_ctx *ctx;
1748         isl_val *v;
1749         unsigned n_div;
1750         struct gpu_array_bound *bound = user;
1751
1752         ctx = isl_constraint_get_ctx(c);
1753         n_div = isl_constraint_dim(c, isl_dim_div);
1754         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1755
1756         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1757                 int s = isl_val_sgn(v);
1758                 isl_val *stride = isl_val_zero(ctx);
1759
1760                 isl_val_free(v);
1761                 for (i = 0; i < n_div; ++i) {
1762                         v = isl_constraint_get_coefficient_val(c,
1763                                                                 isl_dim_div, i);
1764                         stride = isl_val_gcd(stride, v);
1765                 }
1766                 if (!isl_val_is_zero(stride) &&
1767                     isl_val_gt(stride, bound->stride))
1768                         extract_stride(c, bound, stride, s);
1769
1770                 isl_val_free(stride);
1771         } else
1772                 isl_val_free(v);
1773
1774         isl_constraint_free(c);
1775         return 0;
1776 }
1777
1778 /* Given contraints on an array index i, check if we can find
1779  * a shift a(p) and a stride g such that
1780  *
1781  *      a(p) + i = 0 mod g
1782  *
1783  * If so, record the information in bound and apply the mapping
1784  * i -> (i + a(p))/g to the array index in bounds and return
1785  * the new constraints.
1786  * If not, simply return the original constraints.
1787  *
1788  * If bounds is a subset of the space
1789  *
1790  *      D -> i
1791  *
1792  * then the bound recorded in bound->shift is of the form
1793  *
1794  *      D -> s(D)
1795  *
1796  * with s(D) equal to a(p) above.
1797  * The mapping recorded in bound->shift_map is of the form
1798  *
1799  *      [D -> i] -> [D -> (i + S(D))/g]
1800  *
1801  * This mapping is computed as follows.
1802  * We first introduce "i" in the domain through precomposition
1803  * with [D -> i] -> D obtaining
1804  *
1805  *      [D -> i] -> s(D)
1806  *
1807  * Adding [D -> i] -> i produces
1808  *
1809  *      [D -> i] -> i + s(D)
1810  *
1811  * and the domain product with [D -> i] -> D yields
1812  *
1813  *      [D -> i] -> [D -> i + s(D)]
1814  *
1815  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1816  */
1817 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1818         __isl_take isl_basic_map *bounds)
1819 {
1820         isl_space *space;
1821         isl_basic_map *hull;
1822         isl_basic_map *shift, *id, *bmap, *scale;
1823         isl_basic_set *bset;
1824         isl_aff *aff;
1825
1826         bound->stride = NULL;
1827
1828         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1829
1830         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1831
1832         isl_basic_map_free(hull);
1833
1834         if (!bound->stride)
1835                 return bounds;
1836
1837         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1838         space = isl_basic_map_get_space(bounds);
1839         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1840         shift = isl_basic_map_apply_range(bmap, shift);
1841         space = isl_basic_map_get_space(bounds);
1842         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1843         shift = isl_basic_map_sum(id, shift);
1844         space = isl_basic_map_get_space(bounds);
1845         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1846         shift = isl_basic_map_range_product(id, shift);
1847
1848         space = isl_space_domain(isl_basic_map_get_space(bounds));
1849         id = isl_basic_map_identity(isl_space_map_from_set(space));
1850         space = isl_space_range(isl_basic_map_get_space(bounds));
1851         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1852         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1853         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1854         scale = isl_basic_map_from_aff(aff);
1855         scale = isl_basic_map_product(id, scale);
1856
1857         bound->shift_map = isl_basic_map_apply_range(shift, scale);
1858         bmap = isl_basic_map_copy(bound->shift_map);
1859         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1860         bounds = isl_basic_set_unwrap(bset);
1861
1862         return bounds;
1863 }
1864
1865 /* Data used in compute_array_dim_size and compute_size_in_direction.
1866  *
1867  * pos is the position of the variable representing the array index,
1868  * i.e., the variable for which want to compute the size.  This variable
1869  * is also the last variable in the set.
1870  */
1871 struct gpu_size_info {
1872         isl_basic_set *bset;
1873         struct gpu_array_bound *bound;
1874         int pos;
1875 };
1876
1877 /* Given a constraint from the basic set describing the bounds on
1878  * an array index, check if it is a lower bound, say m i >= b(x), and,
1879  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1880  * upper bound.  If so, and if this bound is smaller than any bound
1881  * derived from earlier constraints, set the size to this bound on
1882  * the expression and the lower bound to ceil(b(x)/m).
1883  */
1884 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1885 {
1886         struct gpu_size_info *size = user;
1887         unsigned nparam;
1888         unsigned n_div;
1889         isl_val *v;
1890         isl_aff *aff;
1891         isl_aff *lb;
1892
1893         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
1894         n_div = isl_constraint_dim(c, isl_dim_div);
1895
1896         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
1897             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
1898                 isl_constraint_free(c);
1899                 return 0;
1900         }
1901
1902         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
1903         aff = isl_aff_ceil(aff);
1904
1905         lb = isl_aff_copy(aff);
1906
1907         aff = isl_aff_neg(aff);
1908         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
1909
1910         v = isl_basic_set_max_val(size->bset, aff);
1911         isl_aff_free(aff);
1912
1913         if (isl_val_is_int(v)) {
1914                 v = isl_val_add_ui(v, 1);
1915                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
1916                         isl_val_free(size->bound->size);
1917                         size->bound->size = isl_val_copy(v);
1918                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
1919                         isl_aff_free(size->bound->lb);
1920                         size->bound->lb = isl_aff_copy(lb);
1921                 }
1922         }
1923         isl_val_free(v);
1924         isl_aff_free(lb);
1925
1926         isl_constraint_free(c);
1927
1928         return 0;
1929 }
1930
1931 /* Given a basic map "bounds" that maps parameters and input dimensions
1932  * to a single output dimension, look for an expression in the parameters
1933  * and input dimensions such that the range of the output dimension shifted
1934  * by this expression is a constant.
1935  *
1936  * In particular, we currently only consider lower bounds on the output
1937  * dimension as candidate expressions.
1938  */
1939 static int compute_array_dim_size(struct gpu_array_bound *bound,
1940         __isl_take isl_basic_map *bounds)
1941 {
1942         struct gpu_size_info size;
1943
1944         bounds = isl_basic_map_detect_equalities(bounds);
1945         bounds = check_stride(bound, bounds);
1946
1947         bound->size = NULL;
1948         bound->lb = NULL;
1949
1950         size.bound = bound;
1951         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
1952         size.bset = isl_basic_map_wrap(bounds);
1953         size.bset = isl_basic_set_flatten(size.bset);
1954         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
1955         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
1956                                         &size);
1957         isl_basic_set_free(size.bset);
1958
1959         return bound->size ? 0 : -1;
1960 }
1961
1962 /* Check if we can find a memory tile for the given array
1963  * based on the given accesses, and if so, put the results in "tile".
1964  *
1965  * We project the accesses on each index in turn and look for a parametric
1966  * offset such that the size is constant.
1967  */
1968 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
1969 {
1970         int i;
1971
1972         for (i = 0; i < tile->n; ++i) {
1973                 isl_map *access_i;
1974                 isl_basic_map *hull;
1975
1976                 access_i = isl_map_copy(access);
1977                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
1978                 access_i = isl_map_project_out(access_i, isl_dim_out,
1979                                             1, tile->n - (i + 1));
1980                 access_i = isl_map_compute_divs(access_i);
1981                 hull = isl_map_simple_hull(access_i);
1982                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
1983                         return 0;
1984         }
1985
1986         return 1;
1987 }
1988
1989 /* Construct a map with input the shared tile loops and the loops that
1990  * will be wrapped around the threads that relates these later loops
1991  * to the thread indices and then projects them out.
1992  */
1993 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1994 {
1995         isl_map *priv;
1996         isl_map *tiling;
1997         isl_map *proj;
1998         isl_set *par;
1999         isl_space *dim;
2000
2001         dim = isl_union_map_get_space(gen->shared_sched);
2002
2003         if (gen->options->wrap)
2004                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
2005                                 gen->shared_len, gen->n_block, gen->block_dim);
2006         else
2007                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
2008                                 gen->shared_len, gen->n_block, gen->block_dim);
2009
2010         priv = tiling;
2011
2012         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
2013                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
2014                 gen->n_block, "t");
2015
2016         priv = isl_map_align_params(priv, isl_set_get_space(par));
2017         priv = isl_map_intersect_range(priv, par);
2018
2019         dim = isl_map_get_space(priv);
2020         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2021         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2022         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2023                           gen->shared_len);
2024
2025         priv = isl_map_apply_range(priv, proj);
2026
2027         return priv;
2028 }
2029
2030 /* Construct a map from domain_dim to domain_dim that increments
2031  * the dimension at position "pos" and leaves all other dimensions
2032  * constant.
2033  */
2034 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2035 {
2036         int i;
2037         int len = isl_space_dim(domain_dim, isl_dim_set);
2038         isl_space *dim;
2039         isl_basic_map *next;
2040         isl_local_space *ls;
2041
2042         dim = isl_space_map_from_set(domain_dim);
2043         next = isl_basic_map_universe(isl_space_copy(dim));
2044         ls = isl_local_space_from_space(dim);
2045
2046         for (i = 0; i < len; ++i) {
2047                 isl_constraint *c;
2048
2049                 c = isl_equality_alloc(isl_local_space_copy(ls));
2050                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2051                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2052                 if (i == pos)
2053                         c = isl_constraint_set_constant_si(c, 1);
2054                 next = isl_basic_map_add_constraint(next, c);
2055         }
2056
2057         isl_local_space_free(ls);
2058
2059         return isl_map_from_basic_map(next);
2060 }
2061
2062 /* Check if the given access is coalesced.
2063  * That is, check whether incrementing the dimension that will get
2064  * wrapped over the last thread index results in incrementing
2065  * the last array index.
2066  *
2067  * This function is only called for access relations without reuse.
2068  */
2069 static int access_is_coalesced(struct gpu_gen *gen,
2070         __isl_keep isl_union_map *access)
2071 {
2072         isl_space *dim;
2073         isl_map *access_map;
2074         isl_map *next_thread_x;
2075         isl_map *next_element;
2076         isl_map *map;
2077         int coalesced;
2078
2079         access = isl_union_map_copy(access);
2080         access = isl_union_map_apply_domain(access,
2081                                 isl_union_map_copy(gen->tiled_sched));
2082         access_map = isl_map_from_union_map(access);
2083
2084         dim = isl_map_get_space(access_map);
2085         dim = isl_space_domain(dim);
2086         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2087
2088         dim = isl_map_get_space(access_map);
2089         dim = isl_space_range(dim);
2090         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
2091
2092         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2093         map = isl_map_apply_range(map, access_map);
2094
2095         coalesced = isl_map_is_subset(map, next_element);
2096
2097         isl_map_free(next_element);
2098         isl_map_free(map);
2099
2100         return coalesced;
2101 }
2102
2103 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2104  * dimensions of the computed schedule, check if it is bijective for
2105  * fixed values of the first gen->shared_len dimensions.
2106  * We perform this check by equating these dimensions to parameters.
2107  */
2108 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2109 {
2110         int res;
2111         isl_set *par;
2112         isl_space *space;
2113
2114         access = isl_map_copy(access);
2115         space = isl_space_params(isl_map_get_space(access));
2116         par = parametrization(space, gen->shared_len + gen->n_block,
2117                                 0, gen->shared_len, "s");
2118         access = isl_map_intersect_domain(access, par);
2119         res = isl_map_is_bijective(access);
2120         isl_map_free(access);
2121
2122         return res;
2123 }
2124
2125 /* Look for the last shared tile loop that affects the offset of "tile"
2126  * and return the result.
2127  * If there is no such loop, then return the index of the loop
2128  * before the first shared tile loop, in particular gen->tile_first - 1.
2129  */
2130 static int compute_tile_last_shared(struct gpu_gen *gen,
2131         struct gpu_array_tile *tile)
2132 {
2133         int i, j;
2134
2135         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2136                 for (i = 0; i < tile->n; ++i) {
2137                         isl_aff *lb;
2138                         isl_aff *shift;
2139
2140                         lb = tile->bound[i].lb;
2141                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2142                                 break;
2143
2144                         shift = tile->bound[i].shift;
2145                         if (!shift)
2146                                 continue;
2147                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2148                                 break;
2149                 }
2150                 if (i < tile->n)
2151                         break;
2152         }
2153
2154         return j;
2155 }
2156
2157 /* Look for the last shared tile loop that affects the offset of the
2158  * shared or private tile and store the result in group->last_shared.
2159  * If there is no such loop, then group->last_shared is set to a value
2160  * before the first shared tile loop, in particular gen->tile_first - 1.
2161  * If there is no tile defined on the array reference group,
2162  * then set group->last_shared to gen->shared_len - 1.
2163  */
2164 static void set_last_shared(struct gpu_gen *gen,
2165         struct gpu_array_ref_group *group)
2166 {
2167         struct gpu_array_tile *tile;
2168
2169         group->last_shared = gen->shared_len - 1;
2170
2171         tile = group->private_tile;
2172         if (!tile)
2173                 tile = group->shared_tile;
2174         if (!tile)
2175                 return;
2176
2177         group->last_shared = compute_tile_last_shared(gen, tile);
2178 }
2179
2180 /* Compute a privatized copy of all access relations from reference groups that
2181  * are mapped to private memory and store the result in gen->privatization.
2182  *
2183  * Read-only scalars and arrays containing structures are not mapped
2184  * to private memory.
2185  */
2186 static void compute_private_access(struct gpu_gen *gen)
2187 {
2188         int i, j;
2189         isl_union_map *private;
2190
2191         if (!gen->options->use_private_memory)
2192                 return;
2193
2194         private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
2195
2196         for (i = 0; i < gen->prog->n_array; ++i) {
2197                 struct gpu_array_info *array = &gen->prog->array[i];
2198
2199                 if (gpu_array_is_read_only_scalar(array))
2200                         continue;
2201                 if (array->has_compound_element)
2202                         continue;
2203
2204                 for (j = 0; j < array->n_group; ++j) {
2205                         if (!array->groups[j]->private_tile)
2206                                 continue;
2207
2208                         private = isl_union_map_union(private,
2209                                 group_access_relation(array->groups[j], 1, 1));
2210                 }
2211         }
2212
2213         if (isl_union_map_is_empty(private))
2214                 isl_union_map_free(private);
2215         else {
2216                 isl_union_map *priv;
2217
2218                 private = isl_union_map_apply_domain(private,
2219                                         isl_union_map_copy(gen->shared_sched));
2220                 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2221                 private = isl_union_map_apply_domain(private, priv);
2222                 gen->private_access = private;
2223         }
2224 }
2225
2226 /* Compute the size of the tile specified by "tile"
2227  * in number of elements and return the result.
2228  */
2229 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2230 {
2231         int i;
2232         isl_val *size;
2233
2234         size = isl_val_one(ctx);
2235
2236         for (i = 0; i < tile->n; ++i)
2237                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2238
2239         return size;
2240 }
2241
2242 /* If max_shared_memory is not set to infinity (-1), then make
2243  * sure that the total amount of shared memory required by the
2244  * array reference groups mapped to shared memory is no larger
2245  * than this maximum.
2246  *
2247  * We apply a greedy approach and discard (keep in global memory)
2248  * those groups that would result in a total memory size that
2249  * is larger than the maximum.
2250  */
2251 static void check_shared_memory_bound(struct gpu_gen *gen)
2252 {
2253         int i, j;
2254         isl_val *left, *size;
2255
2256         if (gen->options->max_shared_memory < 0)
2257                 return;
2258
2259         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2260
2261         for (i = 0; i < gen->prog->n_array; ++i) {
2262                 struct gpu_array_info *array = &gen->prog->array[i];
2263
2264                 for (j = 0; j < array->n_group; ++j) {
2265                         struct gpu_array_ref_group *group;
2266
2267                         group = array->groups[j];
2268                         if (group->private_tile)
2269                                 continue;
2270                         if (!group->shared_tile)
2271                                 continue;
2272
2273                         size = tile_size(gen->ctx, group->shared_tile);
2274                         size = isl_val_mul_ui(size, array->size);
2275
2276                         if (isl_val_le(size, left)) {
2277                                 left = isl_val_sub(left, size);
2278                                 continue;
2279                         }
2280                         isl_val_free(size);
2281
2282                         group->shared_tile = free_tile(group->shared_tile);
2283                 }
2284         }
2285
2286         isl_val_free(left);
2287 }
2288
2289 /* Given a description of an array tile "tile" and the "space"
2290  *
2291  *      { D -> A }
2292  *
2293  * where D represents the first shared_len schedule dimensions
2294  * and A represents the array, construct an isl_multi_aff
2295  *
2296  *      { [D[i] -> A[a]] -> A'[a'] }
2297  *
2298  * with A' a scaled down copy of A according to the shifts and strides
2299  * in "tile".  In particular,
2300  *
2301  *      a' = (a + shift(i))/stride
2302  *
2303  * "insert_array" represents
2304  *
2305  *      { [D -> A] -> D }
2306  *
2307  * and is used to insert A into the domain of functions that only
2308  * reference D.
2309  */
2310 static __isl_give isl_multi_aff *strided_tile(
2311         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2312         __isl_keep isl_multi_aff *insert_array)
2313 {
2314         int i;
2315         isl_ctx *ctx;
2316         isl_multi_aff *shift;
2317         isl_multi_val *stride;
2318         isl_space *space2;
2319         isl_local_space *ls;
2320         isl_multi_aff *tiling;
2321
2322         ctx = isl_space_get_ctx(space);
2323         space2 = isl_space_domain(isl_space_copy(space));
2324         ls = isl_local_space_from_space(space2);
2325         space2 = isl_space_range(isl_space_copy(space));
2326         stride = isl_multi_val_zero(space2);
2327         shift = isl_multi_aff_zero(isl_space_copy(space));
2328
2329         for (i = 0; i < tile->n; ++i) {
2330                 struct gpu_array_bound *bound = &tile->bound[i];
2331                 isl_val *stride_i;
2332                 isl_aff *shift_i;
2333
2334                 if (tile->bound[i].shift) {
2335                         stride_i = isl_val_copy(bound->stride);
2336                         shift_i = isl_aff_copy(bound->shift);
2337                 } else {
2338                         stride_i = isl_val_one(ctx);
2339                         shift_i = isl_aff_zero_on_domain(
2340                                         isl_local_space_copy(ls));
2341                 }
2342
2343                 stride = isl_multi_val_set_val(stride, i, stride_i);
2344                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2345         }
2346         isl_local_space_free(ls);
2347
2348         shift = isl_multi_aff_pullback_multi_aff(shift,
2349                                     isl_multi_aff_copy(insert_array));
2350
2351         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2352         tiling = isl_multi_aff_add(tiling, shift);
2353         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2354
2355         return tiling;
2356 }
2357
2358 /* Compute a tiling for the array reference group "group".
2359  *
2360  * The tiling is of the form
2361  *
2362  *      { [D[i] -> A[a]] -> T[t] }
2363  *
2364  * where D represents the first shared_len schedule dimensions,
2365  * A represents the global array and T represents the shared or
2366  * private memory tile.  The name of T is the name of the local
2367  * array.
2368  *
2369  * If there is any stride in the accesses, then the mapping is
2370  *
2371  *      t = (a + shift(i))/stride - lb(i)
2372  *
2373  * otherwise, it is simply
2374  *
2375  *      t = a - lb(i)
2376  */
2377 static void compute_group_tiling(struct gpu_array_ref_group *group)
2378 {
2379         int i;
2380         struct gpu_array_tile *tile;
2381         struct gpu_array_info *array = group->array;
2382         isl_space *space;
2383         isl_multi_aff *tiling, *lb, *insert_array;
2384         isl_printer *p;
2385         char *local_name;
2386
2387         tile = group->private_tile;
2388         if (!tile)
2389                 tile = group->shared_tile;
2390         if (!tile)
2391                 return;
2392
2393         space = isl_map_get_space(group->access);
2394         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2395
2396         for (i = 0; i < tile->n; ++i)
2397                 if (tile->bound[i].shift)
2398                         break;
2399
2400         if (i < tile->n)
2401                 tiling = strided_tile(tile, space, insert_array);
2402         else
2403                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2404
2405         lb = isl_multi_aff_zero(space);
2406         for (i = 0; i < tile->n; ++i) {
2407                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2408                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2409         }
2410         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2411
2412         tiling = isl_multi_aff_sub(tiling, lb);
2413
2414         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2415         p = print_array_name(p, group);
2416         local_name = isl_printer_get_str(p);
2417         isl_printer_free(p);
2418         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2419         free(local_name);
2420
2421         tile->tiling = tiling;
2422 }
2423
2424 /* Compute a tiling for all the array reference groups.
2425  */
2426 static void compute_group_tilings(struct gpu_gen *gen)
2427 {
2428         int i, j;
2429
2430         for (i = 0; i < gen->prog->n_array; ++i) {
2431                 struct gpu_array_info *array = &gen->prog->array[i];
2432
2433                 for (j = 0; j < array->n_group; ++j)
2434                         compute_group_tiling(array->groups[j]);
2435         }
2436 }
2437
2438 /* Fill up the groups array with singleton groups, i.e., one group
2439  * per reference, initializing the array, access, write, n_ref and refs fields.
2440  * In particular the access field is initialized to the scheduled
2441  * access relation of the array reference.
2442  *
2443  * Return the number of elements initialized, i.e., the number of
2444  * active references in the current kernel.
2445  */
2446 static int populate_array_references(struct gpu_array_info *array,
2447         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2448 {
2449         int i;
2450         int n;
2451         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2452
2453         n = 0;
2454         for (i = 0; i < array->n_ref; ++i) {
2455                 isl_union_map *umap;
2456                 isl_map *map;
2457                 struct gpu_array_ref_group *group;
2458                 struct gpu_stmt_access *access = array->refs[i];
2459
2460                 map = isl_map_copy(access->access);
2461                 umap = isl_union_map_from_map(map);
2462                 umap = isl_union_map_apply_domain(umap,
2463                                 isl_union_map_copy(sched));
2464
2465                 if (isl_union_map_is_empty(umap)) {
2466                         isl_union_map_free(umap);
2467                         continue;
2468                 }
2469
2470                 map = isl_map_from_union_map(umap);
2471                 map = isl_map_detect_equalities(map);
2472
2473                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2474                 assert(group);
2475                 group->array = array;
2476                 group->access = map;
2477                 group->write = access->write;
2478                 group->exact_write = access->exact_write;
2479                 group->refs = &array->refs[i];
2480                 group->n_ref = 1;
2481
2482                 groups[n++] = group;
2483         }
2484
2485         return n;
2486 }
2487
2488 /* If group->n_ref == 1, then group->refs was set by
2489  * populate_array_references to point directly into
2490  * group->array->refs and should not be freed.
2491  * If group->n_ref > 1, then group->refs was set by join_groups
2492  * to point to a newly allocated array.
2493  */
2494 static void free_array_ref_group(struct gpu_array_ref_group *group)
2495 {
2496         if (!group)
2497                 return;
2498         free_tile(group->shared_tile);
2499         free_tile(group->private_tile);
2500         isl_map_free(group->access);
2501         if (group->n_ref > 1)
2502                 free(group->refs);
2503         free(group);
2504 }
2505
2506 /* Given a map where the input dimensions represent the tile loops,
2507  * eliminate the innermost of those that have a fixed value
2508  * until we reach one that does not (obviously) have a fixed value.
2509  */
2510 static __isl_give isl_map *eliminate_fixed_inner_loops(
2511         __isl_take isl_map *access)
2512 {
2513         int i, n;
2514
2515         n = isl_map_dim(access, isl_dim_in);
2516
2517         for (i = n - 1; i >= 0; --i) {
2518                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2519                         break;
2520                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2521         }
2522         return access;
2523 }
2524
2525 /* Check if the access relations of group1 and group2 overlap within
2526  * the innermost loop.  In particular, ignore any inner dimension
2527  * with a fixed value.
2528  * The copying to and from shared memory will be performed within
2529  * the innermost actual loop so we are only allowed to consider
2530  * the dimensions up to that innermost loop while checking whether
2531  * two access relations overlap.
2532  */
2533 static int accesses_overlap(struct gpu_array_ref_group *group1,
2534         struct gpu_array_ref_group *group2)
2535 {
2536         int empty;
2537         isl_map *access1, *access2;
2538
2539         access1 = isl_map_copy(group1->access);
2540         access1 = eliminate_fixed_inner_loops(access1);
2541         access2 = isl_map_copy(group2->access);
2542         access2 = eliminate_fixed_inner_loops(access2);
2543         access1 = isl_map_intersect(access1, access2);
2544         empty = isl_map_is_empty(access1);
2545         isl_map_free(access1);
2546
2547         return !empty;
2548 }
2549
2550 /* Combine the given two groups into a single group, containing
2551  * the references of both groups.
2552  */
2553 static struct gpu_array_ref_group *join_groups(
2554         struct gpu_array_ref_group *group1,
2555         struct gpu_array_ref_group *group2)
2556 {
2557         int i;
2558         isl_ctx *ctx;
2559         struct gpu_array_ref_group *group;
2560
2561         ctx = isl_map_get_ctx(group1->access);
2562         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2563         assert(group);
2564         group->array = group1->array;
2565         group->access = isl_map_union(isl_map_copy(group1->access),
2566                                         isl_map_copy(group2->access));
2567         group->write = group1->write || group2->write;
2568         group->exact_write = group1->exact_write && group2->exact_write;
2569         group->n_ref = group1->n_ref + group2->n_ref;
2570         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2571                                         group->n_ref);
2572         assert(group->refs);
2573         for (i = 0; i < group1->n_ref; ++i)
2574                 group->refs[i] = group1->refs[i];
2575         for (i = 0; i < group2->n_ref; ++i)
2576                 group->refs[group1->n_ref + i] = group2->refs[i];
2577
2578         return group;
2579 }
2580
2581 /* Combine the given two groups into a single group and free
2582  * the original two groups.
2583  */
2584 static struct gpu_array_ref_group *join_groups_and_free(
2585         struct gpu_array_ref_group *group1,
2586         struct gpu_array_ref_group *group2)
2587 {
2588         struct gpu_array_ref_group *group;
2589
2590         group = join_groups(group1, group2);
2591         free_array_ref_group(group1);
2592         free_array_ref_group(group2);
2593         return group;
2594 }
2595
2596 /* Compute the private and/or shared memory tiles for the array
2597  * reference group "group" of array "array".
2598  *
2599  * If the array is a read-only scalar or if the user requested
2600  * not to use shared or private memory, then we do not need to do anything.
2601  *
2602  * If the array group involves any may writes (that are not must writes),
2603  * then we would have to make sure that we load the data into shared/private
2604  * memory first in case the data is not written by the kernel
2605  * (but still written back out to global memory).
2606  * Since we don't have any such mechanism at the moment, we don't
2607  * compute shared/private tiles for groups involving may writes.
2608  *
2609  * We only try to compute a shared memory tile if there is any reuse
2610  * or if the access is not coalesced.
2611  *
2612  * For computing a private memory tile, we also require that there is
2613  * some reuse.  Moreover, we require that the access is private
2614  * to the thread.  That is, we check that any given array element
2615  * is only accessed by a single thread.
2616  * We compute an access relation that maps the shared tile loop iterators
2617  * and the shared point loop iterators that will be wrapped over the
2618  * threads to the array elements.
2619  * We actually check that those iterators that will be wrapped
2620  * partition the array space.  This check is stricter than necessary
2621  * since several iterations may be mapped onto the same thread
2622  * and then they could be allowed to access the same memory elements,
2623  * but our check does not allow this situation.
2624  *
2625  * We also check that the index expression only depends on parallel
2626  * loops.  That way, we can move those loops innermost and unroll them.
2627  * Again, we use a test that is stricter than necessary.
2628  * We actually check whether the index expression only depends
2629  * on the iterators that are wrapped over the threads.
2630  * These are necessarily parallel, but there may be more parallel loops.
2631  *
2632  * Combining the injectivity of the first test with the single-valuedness
2633  * of the second test, we simply test for bijectivity.
2634  *
2635  * If it turns out we can use registers, we compute the private memory
2636  * tile size using can_tile, after introducing a dependence
2637  * on the thread indices.
2638  */
2639 static void compute_group_bounds_core(struct gpu_gen *gen,
2640         struct gpu_array_ref_group *group)
2641 {
2642         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2643         isl_union_map *access;
2644         int n_index = group->array->n_index;
2645         int no_reuse;
2646         isl_map *acc;
2647         int use_shared = gen->options->use_shared_memory;
2648         int use_private = gen->options->use_private_memory;
2649
2650         if (!use_shared && !use_private)
2651                 return;
2652         if (gpu_array_is_read_only_scalar(group->array))
2653                 return;
2654         if (!group->exact_write)
2655                 return;
2656
2657         access = group_access_relation(group, 1, 1);
2658         no_reuse = isl_union_map_is_injective(access);
2659
2660         if (use_shared && (!no_reuse || !access_is_coalesced(gen, access))) {
2661                 group->shared_tile = create_tile(ctx, group->array->n_index);
2662                 if (!can_tile(group->access, group->shared_tile))
2663                         group->shared_tile = free_tile(group->shared_tile);
2664         }
2665
2666         if (!use_private || no_reuse) {
2667                 isl_union_map_free(access);
2668                 return;
2669         }
2670
2671         access = isl_union_map_apply_domain(access,
2672                                         isl_union_map_copy(gen->shared_sched));
2673
2674         acc = isl_map_from_union_map(access);
2675
2676         if (!access_is_bijective(gen, acc)) {
2677                 isl_map_free(acc);
2678                 return;
2679         }
2680
2681         group->private_tile = create_tile(gen->ctx, n_index);
2682         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2683         if (!can_tile(acc, group->private_tile))
2684                 group->private_tile = free_tile(group->private_tile);
2685
2686         isl_map_free(acc);
2687 }
2688
2689 /* Compute the private and/or shared memory tiles for the array
2690  * reference group "group" of array "array" and set last_shared.
2691  */
2692 static void compute_group_bounds(struct gpu_gen *gen,
2693         struct gpu_array_ref_group *group)
2694 {
2695         compute_group_bounds_core(gen, group);
2696         set_last_shared(gen, group);
2697 }
2698
2699 /* If two groups have overlapping access relations (as determined by
2700  * the "overlap" function) and if one of them involves a write,
2701  * then merge the two groups into one.
2702  * If "compute_bounds" is set, then call compute_group_bounds
2703  * on the merged groups.
2704  *
2705  * Return the updated number of groups.
2706  */
2707 static int group_writes(struct gpu_gen *gen,
2708         int n, struct gpu_array_ref_group **groups,
2709         int (*overlap)(struct gpu_array_ref_group *group1,
2710                 struct gpu_array_ref_group *group2), int compute_bounds)
2711 {
2712         int i, j;
2713
2714         for (i = 0; i < n; ++i) {
2715                 for (j = n - 1; j > i; --j) {
2716                         if (!groups[i]->write && !groups[j]->write)
2717                                 continue;
2718
2719                         if (!overlap(groups[i], groups[j]))
2720                                 continue;
2721
2722                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2723                         if (compute_bounds)
2724                                 compute_group_bounds(gen, groups[i]);
2725                         if (j != n - 1)
2726                                 groups[j] = groups[n - 1];
2727                         n--;
2728                 }
2729         }
2730
2731         return n;
2732 }
2733
2734 /* If two groups have overlapping access relations (within the innermost
2735  * loop) and if one of them involves a write, then merge the two groups
2736  * into one.
2737  *
2738  * Return the updated number of groups.
2739  */
2740 static int group_overlapping_writes(struct gpu_gen *gen,
2741         int n, struct gpu_array_ref_group **groups)
2742 {
2743         return group_writes(gen, n, groups, &accesses_overlap, 0);
2744 }
2745
2746 /* Check if the access relations of group1 and group2 overlap within
2747  * the outermost min(group1->last_shared, group2->last_shared) loops.
2748  */
2749 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2750         struct gpu_array_ref_group *group2)
2751 {
2752         int last_shared;
2753         int dim;
2754         int empty;
2755         isl_map *map_i, *map_j, *map;
2756
2757         last_shared = group1->last_shared;
2758         if (group2->last_shared < last_shared)
2759                 last_shared = group2->last_shared;
2760         map_i = isl_map_copy(group1->access);
2761         dim = isl_map_dim(map_i, isl_dim_in);
2762         map_i = isl_map_eliminate(map_i, isl_dim_in,
2763                                 last_shared + 1, dim - (last_shared + 1));
2764         map_j = isl_map_copy(group2->access);
2765         map_j = isl_map_eliminate(map_j, isl_dim_in,
2766                                 last_shared + 1, dim - (last_shared + 1));
2767         map = isl_map_intersect(map_i, map_j);
2768         empty = isl_map_is_empty(map);
2769         isl_map_free(map);
2770
2771         return !empty;
2772 }
2773
2774 /* If two groups have overlapping access relations (within the outer
2775  * last_shared loops) and if one of them involves a write,
2776  * then merge the two groups into one.
2777  *
2778  * Return the updated number of groups.
2779  */
2780 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2781         struct gpu_array_ref_group **groups)
2782 {
2783         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2784 }
2785
2786 /* Is the size of the tile specified by "tile" smaller than the sum of
2787  * the sizes of the tiles specified by "tile1" and "tile2"?
2788  */
2789 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2790         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2791 {
2792         int smaller;
2793         isl_val *size, *size1, *size2;
2794
2795         size = tile_size(ctx, tile);
2796         size1 = tile_size(ctx, tile1);
2797         size2 = tile_size(ctx, tile2);
2798
2799         size = isl_val_sub(size, size1);
2800         size = isl_val_sub(size, size2);
2801         smaller = isl_val_is_neg(size);
2802
2803         isl_val_free(size);
2804
2805         return smaller;
2806 }
2807
2808 /* Given an initial grouping of array references and shared memory tiles
2809  * for each group that allows for a shared memory tile, merge two groups
2810  * if both have a shared memory tile, the merged group also has
2811  * a shared memory tile and the size of the tile for the merge group
2812  * is smaller than the sum of the tile sizes of the individual groups.
2813  *
2814  * If merging two groups decreases the "last_shared" dimension of
2815  * one or both of the two groups, then we need to check for overlapping
2816  * writes again.
2817  *
2818  * Return the number of groups after merging.
2819  */
2820 static int group_common_shared_memory_tile(struct gpu_gen *gen,
2821         struct gpu_array_info *array, int n,
2822         struct gpu_array_ref_group **groups)
2823 {
2824         int i, j;
2825         int recompute_overlap = 0;
2826         isl_ctx *ctx = isl_space_get_ctx(array->space);
2827
2828         for (i = 0; i < n; ++i) {
2829                 if (!groups[i]->shared_tile)
2830                         continue;
2831                 for (j = n - 1; j > i; --j) {
2832                         isl_map *map;
2833                         int empty;
2834                         struct gpu_array_ref_group *group;
2835
2836                         if (!groups[j]->shared_tile)
2837                                 continue;
2838
2839                         map = isl_map_intersect(isl_map_copy(groups[i]->access),
2840                                             isl_map_copy(groups[j]->access));
2841                         empty = isl_map_is_empty(map);
2842                         isl_map_free(map);
2843
2844                         if (empty)
2845                                 continue;
2846
2847                         group = join_groups(groups[i], groups[j]);
2848                         compute_group_bounds(gen, group);
2849                         if (!group->shared_tile ||
2850                             !smaller_tile(ctx, group->shared_tile,
2851                                         groups[i]->shared_tile,
2852                                         groups[j]->shared_tile)) {
2853                                 free_array_ref_group(group);
2854                                 continue;
2855                         }
2856
2857                         if (group->last_shared < groups[i]->last_shared ||
2858                             group->last_shared < groups[j]->last_shared)
2859                                 recompute_overlap = 1;
2860                         free_array_ref_group(groups[i]);
2861                         free_array_ref_group(groups[j]);
2862                         groups[i] = group;
2863                         if (j != n - 1)
2864                                 groups[j] = groups[n - 1];
2865                         n--;
2866                 }
2867         }
2868
2869         if (recompute_overlap)
2870                 n = group_last_shared_overlapping_writes(gen, n, groups);
2871         return n;
2872 }
2873
2874 /* Set array->n_group and array->groups to n and groups.
2875  *
2876  * Additionally, set the "nr" field of each group
2877  * and the "group" field of each reference in each group.
2878  */
2879 static void set_array_groups(struct gpu_array_info *array,
2880         int n, struct gpu_array_ref_group **groups)
2881 {
2882         int i, j;
2883
2884         array->n_group = n;
2885         array->groups = groups;
2886
2887         for (i = 0; i < n; ++i) {
2888                 groups[i]->nr = i;
2889
2890                 for (j = 0; j < groups[i]->n_ref; ++j)
2891                         groups[i]->refs[j]->group = i;
2892         }
2893 }
2894
2895 /* Group array references that should be considered together when
2896  * deciding whether to access them from private, shared or global memory.
2897  * Return -1 on error.
2898  *
2899  * In particular, if two array references overlap and if one of them
2900  * is a write, then the two references are grouped together.
2901  * We first perform an initial grouping based only on the access relation.
2902  * After computing shared and private memory tiles, we check for
2903  * overlapping writes again, but this time taking into account
2904  * the "last_shared" property.
2905  *
2906  * Furthermore, if two groups admit a shared memory tile and if the
2907  * combination of the two also admits a shared memory tile, we merge
2908  * the two groups.
2909  *
2910  * If the array contains structures, then there is no need to compute
2911  * reference groups since we do not map such arrays to private or shared
2912  * memory.
2913  */
2914 static int group_array_references(struct gpu_gen *gen,
2915         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
2916 {
2917         int i;
2918         int n;
2919         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2920         struct gpu_array_ref_group **groups;
2921
2922         if (array->has_compound_element)
2923                 return 0;
2924
2925         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
2926                                         array->n_ref);
2927         if (!groups)
2928                 return -1;
2929
2930         n = populate_array_references(array, sched, groups);
2931
2932         n = group_overlapping_writes(gen, n, groups);
2933
2934         for (i = 0; i < n; ++i)
2935                 compute_group_bounds(gen, groups[i]);
2936
2937         n = group_last_shared_overlapping_writes(gen, n, groups);
2938
2939         n = group_common_shared_memory_tile(gen, array, n, groups);
2940
2941         set_array_groups(array, n, groups);
2942
2943         return 0;
2944 }
2945
2946 /* Take tiled_sched, project it onto the shared tile loops and
2947  * the loops that will be wrapped over the threads and
2948  * store the result in gen->shared_sched.
2949  * Also compute a projection that projects out the loops that will be
2950  * wrapped over the threads and store this projection in gen->shared_proj.
2951  */
2952 static void compute_shared_sched(struct gpu_gen *gen)
2953 {
2954         isl_space *dim;
2955         isl_map *proj;
2956         isl_set *par;
2957         isl_union_map *sched;
2958
2959         sched = isl_union_map_copy(gen->tiled_sched);
2960
2961         dim = isl_union_map_get_space(sched);
2962         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
2963         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2964
2965         dim = isl_union_map_get_space(sched);
2966         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
2967
2968         gen->shared_sched = sched;
2969         gen->shared_proj = isl_union_map_from_map(proj);
2970 }
2971
2972 /* Group references of all arrays in the program.
2973  */
2974 static int group_references(struct gpu_gen *gen)
2975 {
2976         int i;
2977         int r = 0;
2978         isl_union_map *sched;
2979
2980         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
2981                                           isl_union_map_copy(gen->shared_proj));
2982
2983         for (i = 0; i < gen->prog->n_array; ++i) {
2984                 r = group_array_references(gen, &gen->prog->array[i], sched);
2985                 if (r < 0)
2986                         break;
2987         }
2988
2989         isl_union_map_free(sched);
2990
2991         return r;
2992 }
2993
2994 /* Free all array information that is local to the current kernel.
2995  */
2996 static void free_local_array_info(struct gpu_gen *gen)
2997 {
2998         int i, j;
2999
3000         for (i = 0; i < gen->prog->n_array; ++i) {
3001                 struct gpu_array_info *array = &gen->prog->array[i];
3002
3003                 for (j = 0; j < array->n_group; ++j)
3004                         free_array_ref_group(array->groups[j]);
3005                 free(array->groups);
3006         }
3007 }
3008
3009 /* Compute the size of a bounding box around the origin and "set",
3010  * where "set" is assumed to contain only non-negative elements.
3011  * In particular, compute the maximal value of "set" in each direction
3012  * and add one.
3013  */
3014 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
3015         __isl_keep isl_set *context)
3016 {
3017         int i, n;
3018         isl_multi_pw_aff *mpa;
3019
3020         n = isl_set_dim(set, isl_dim_set);
3021         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
3022         for (i = 0; i < n; ++i) {
3023                 isl_space *space;
3024                 isl_aff *one;
3025                 isl_pw_aff *bound;
3026
3027                 bound = isl_set_dim_max(isl_set_copy(set), i);
3028                 bound = isl_pw_aff_coalesce(bound);
3029                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
3030
3031                 space = isl_pw_aff_get_domain_space(bound);
3032                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3033                 one = isl_aff_add_constant_si(one, 1);
3034                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3035                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
3036         }
3037         isl_set_free(set);
3038
3039         return mpa;
3040 }
3041
3042 /* Compute the effective grid size as a list of the sizes in each dimension.
3043  *
3044  * The grid size specified by the user or set by default
3045  * in read_grid_sizes() and applied in tile_schedule(),
3046  * may be too large for the given code in the sense that
3047  * it may contain blocks that don't need to execute anything.
3048  * We therefore don't return this grid size, but instead the
3049  * smallest grid size that ensures that all blocks that actually
3050  * execute code are included in the grid.
3051  *
3052  * We first extract a description of the grid, i.e., the possible values
3053  * of the block ids, from gen->tiled_sched.
3054  * The block ids are parameters in gen->tiled_sched.
3055  * We simply need to change them into set dimensions.
3056  *
3057  * Then, for each block dimension, we compute the maximal value of the block id
3058  * and add one.
3059  */
3060 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
3061         struct ppcg_kernel *kernel)
3062 {
3063         int i;
3064         isl_set *grid;
3065
3066         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3067         grid = isl_set_from_params(grid);
3068         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3069         for (i = 0; i < gen->n_grid; ++i) {
3070                 int pos;
3071                 char name[20];
3072
3073                 snprintf(name, sizeof(name), "b%d", i);
3074                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3075                 assert(pos >= 0);
3076                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3077                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3078         }
3079
3080         return extract_size(grid, kernel->context);
3081 }
3082
3083 /* Compute the size of a fixed bounding box around the origin and "set",
3084  * where "set" is assumed to contain only non-negative elements,
3085  * and store the results in "size".
3086  * In particular, compute the maximal value of "set" in each direction
3087  * and add one.
3088  */
3089 static void extract_fixed_size(__isl_take isl_set *set, int *size)
3090 {
3091         int i, n;
3092         isl_local_space *ls;
3093         isl_aff *obj;
3094
3095         n = isl_set_dim(set, isl_dim_set);
3096         ls = isl_local_space_from_space(isl_set_get_space(set));
3097         obj = isl_aff_zero_on_domain(ls);
3098         for (i = 0; i < n; ++i) {
3099                 isl_val *max;
3100
3101                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
3102                 max = isl_set_max_val(set, obj);
3103                 size[i] = isl_val_get_num_si(max) + 1;
3104                 isl_val_free(max);
3105                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
3106         }
3107         isl_aff_free(obj);
3108         isl_set_free(set);
3109 }
3110
3111 /* Compute the effective block size as a list of the sizes in each dimension
3112  * and store the sizes in kernel->block_dim.
3113  *
3114  * The block size specified by the user or set by default
3115  * in read_block_sizes() and applied in thread_tile_schedule(),
3116  * may be too large for the given code in the sense that
3117  * it may contain threads that don't need to execute anything.
3118  * We therefore don't store this block size in kernel->block_dim,
3119  * but instead the smallest block size that ensures that all threads
3120  * that actually execute code are included in the block.
3121  *
3122  * The current implementation eliminates all parameters, ensuring
3123  * that the size is a fixed constant in each dimension.
3124  * In principle we could also compute parametric sizes.
3125  * We would have to make sure to project out all b%d and t%d parameters,
3126  * however.
3127  */
3128 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3129 {
3130         int i;
3131         int nparam;
3132         isl_set *block;
3133         isl_multi_pw_aff *mpa;
3134
3135         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3136         block = isl_set_from_params(block);
3137         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3138         kernel->n_block = gen->n_block;
3139         for (i = 0; i < gen->n_block; ++i) {
3140                 int pos;
3141                 char name[20];
3142
3143                 snprintf(name, sizeof(name), "t%d", i);
3144                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3145                 assert(pos >= 0);
3146                 block = isl_set_equate(block, isl_dim_param, pos,
3147                                         isl_dim_set, i);
3148         }
3149         nparam = isl_set_dim(block, isl_dim_param);
3150         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3151
3152         extract_fixed_size(block, kernel->block_dim);
3153 }
3154
3155 void ppcg_kernel_free(void *user)
3156 {
3157         struct ppcg_kernel *kernel = user;
3158         int i;
3159
3160         if (!kernel)
3161                 return;
3162
3163         isl_multi_pw_aff_free(kernel->grid_size);
3164         isl_set_free(kernel->context);
3165         isl_union_set_free(kernel->arrays);
3166         isl_space_free(kernel->space);
3167         isl_ast_node_free(kernel->tree);
3168
3169         for (i = 0; i < kernel->n_array; ++i)
3170                 isl_pw_aff_list_free(kernel->array[i].bound);
3171         free(kernel->array);
3172
3173         for (i = 0; i < kernel->n_var; ++i) {
3174                 free(kernel->var[i].name);
3175                 isl_vec_free(kernel->var[i].size);
3176         }
3177         free(kernel->var);
3178
3179         free(kernel);
3180 }
3181
3182 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3183         struct ppcg_kernel_var *var)
3184 {
3185         int j;
3186         struct gpu_array_tile *tile;
3187         isl_printer *p;
3188         char *name;
3189
3190         var->array = group->array;
3191
3192         tile = group->private_tile;
3193         var->type = ppcg_access_private;
3194         if (!tile) {
3195                 tile = group->shared_tile;
3196                 var->type = ppcg_access_shared;
3197         }
3198
3199         p = isl_printer_to_str(ctx);
3200         p = print_array_name(p, group);
3201         var->name = isl_printer_get_str(p);
3202         isl_printer_free(p);
3203
3204         var->size = isl_vec_alloc(ctx, group->array->n_index);
3205
3206         for (j = 0; j < group->array->n_index; ++j)
3207                 var->size = isl_vec_set_element_val(var->size, j,
3208                                             isl_val_copy(tile->bound[j].size));
3209 }
3210
3211 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3212 {
3213         int i, j, n;
3214
3215         n = 0;
3216         for (i = 0; i < gen->prog->n_array; ++i) {
3217                 struct gpu_array_info *array = &gen->prog->array[i];
3218
3219                 for (j = 0; j < array->n_group; ++j) {
3220                         struct gpu_array_ref_group *group = array->groups[j];
3221                         if (group->private_tile || group->shared_tile)
3222                                 ++n;
3223                 }
3224         }
3225
3226         kernel->n_var = n;
3227         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3228         assert(kernel->var);
3229
3230         n = 0;
3231         for (i = 0; i < gen->prog->n_array; ++i) {
3232                 struct gpu_array_info *array = &gen->prog->array[i];
3233
3234                 for (j = 0; j < array->n_group; ++j) {
3235                         struct gpu_array_ref_group *group = array->groups[j];
3236                         if (!group->private_tile && !group->shared_tile)
3237                                 continue;
3238                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3239                         ++n;
3240                 }
3241         }
3242 }
3243
3244 /* The sizes of the arrays on the host that have been computed by
3245  * extract_array_info may depend on the parameters.  Use the extra
3246  * constraints on the parameters that are valid at "host_domain"
3247  * to simplify these expressions and store the results in kernel->array.
3248  *
3249  * We only need these localized bounds for arrays that are accessed
3250  * by the current kernel.  If we have found at least one reference group
3251  * then the array is accessed by the kernel.  If the array has compound
3252  * elements then we skipped the construction of array reference groups.
3253  */
3254 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3255         __isl_keep isl_set *host_domain)
3256 {
3257         int i, j;
3258         isl_set *context;
3259
3260         kernel->array = isl_calloc_array(gen->ctx,
3261                             struct gpu_local_array_info, gen->prog->n_array);
3262         assert(kernel->array);
3263         kernel->n_array = gen->prog->n_array;
3264
3265         context = isl_set_copy(host_domain);
3266         context = isl_set_params(context);
3267
3268         for (i = 0; i < gen->prog->n_array; ++i) {
3269                 struct gpu_array_info *array = &gen->prog->array[i];
3270                 isl_pw_aff_list *local;
3271
3272                 if (array->n_group == 0 && !array->has_compound_element)
3273                         continue;
3274
3275                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3276
3277                 for (j = 0; j < array->n_index; ++j) {
3278                         isl_pw_aff *pwaff;
3279
3280                         pwaff = isl_pw_aff_copy(array->bound[j]);
3281                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3282                         local = isl_pw_aff_list_add(local, pwaff);
3283                 }
3284
3285                 kernel->array[i].bound = local;
3286         }
3287         isl_set_free(context);
3288 }
3289
3290 /* Find the element in gen->stmt that has the given "id".
3291  * Return NULL if no such gpu_stmt can be found.
3292  */
3293 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3294 {
3295         int i;
3296
3297         for (i = 0; i < prog->n_stmts; ++i) {
3298                 if (id == prog->stmts[i].id)
3299                         break;
3300         }
3301
3302         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3303 }
3304
3305 /* Set gen->tile_len and gen->n_parallel to those of the statement
3306  * affected by the first map (part of the schedule)
3307  * on which this function is called.
3308  * Because of the way the schedule is constructed, the other statements
3309  * in the list, if any, should have the same values for these properties.
3310  */
3311 static int extract_tile_len(__isl_take isl_map *map, void *user)
3312 {
3313         struct gpu_gen *gen = (struct gpu_gen *) user;
3314         isl_id *id;
3315         struct gpu_stmt *stmt;
3316
3317         id = isl_map_get_tuple_id(map, isl_dim_in);
3318         stmt = find_stmt(gen->prog, id);
3319         isl_id_free(id);
3320
3321         isl_map_free(map);
3322
3323         if (!stmt)
3324                 isl_die(gen->ctx, isl_error_unknown,
3325                         "statement not found", return -1);
3326
3327         gen->tile_len = stmt->tile_len;
3328         gen->n_parallel = stmt->n_parallel;
3329
3330         return -1;
3331 }
3332
3333 void ppcg_kernel_stmt_free(void *user)
3334 {
3335         int i;
3336         struct ppcg_kernel_stmt *stmt = user;
3337
3338         if (!stmt)
3339                 return;
3340
3341         switch (stmt->type) {
3342         case ppcg_kernel_copy:
3343                 isl_ast_expr_free(stmt->u.c.index);
3344                 isl_ast_expr_free(stmt->u.c.local_index);
3345                 break;
3346         case ppcg_kernel_domain:
3347                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3348                 break;
3349         case ppcg_kernel_sync:
3350                 break;
3351         }
3352
3353         free(stmt);
3354 }
3355
3356 /* Set the options of "context" to
3357  *
3358  *      { space -> [x] : x >= first }
3359  */
3360 static __isl_give isl_ast_build *set_unroll(
3361         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3362         int first)
3363 {
3364         isl_ctx *ctx;
3365         isl_map *unroll;
3366         isl_union_map *opt;
3367
3368         ctx = isl_ast_build_get_ctx(build);
3369
3370         space = isl_space_from_domain(space);
3371         space = isl_space_add_dims(space, isl_dim_out, 1);
3372         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3373         unroll = isl_map_universe(space);
3374         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3375         opt = isl_union_map_from_map(unroll);
3376
3377         build = isl_ast_build_set_options(build, opt);
3378
3379         return build;
3380 }
3381
3382 /* Return a list of isl_ids of the form "prefix%d".
3383  */
3384 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3385         int n, const char *prefix)
3386 {
3387         int i;
3388         char name[10];
3389         isl_id_list *names;
3390
3391         names = isl_id_list_alloc(ctx, n);
3392         for (i = 0; i < n; ++i) {
3393                 isl_id *id;
3394
3395                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3396                 id = isl_id_alloc(ctx, name, NULL);
3397                 names = isl_id_list_add(names, id);
3398         }
3399
3400         return names;
3401 }
3402
3403 /* Extend the schedule "schedule" with the part of "extension"
3404  * starting at "first" up to "len".
3405  */
3406 static __isl_give isl_union_map *extend_schedule(
3407         __isl_take isl_union_map *schedule,
3408         __isl_take isl_union_map *extension, int first, int len)
3409 {
3410         isl_space *space;
3411         isl_map *proj;
3412         isl_union_map *umap;
3413         isl_set *set;
3414
3415         space = isl_union_map_get_space(schedule);
3416         space = isl_space_set_from_params(space);
3417         space = isl_space_add_dims(space, isl_dim_set, len);
3418         proj = isl_set_identity(isl_set_universe(space));
3419         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3420         extension = isl_union_map_apply_range(extension,
3421                                                 isl_union_map_from_map(proj));
3422
3423         schedule = isl_union_map_range_product(schedule, extension);
3424
3425         return schedule;
3426 }
3427
3428 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3429  * to "ref_id".
3430  */
3431 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3432         __isl_keep isl_id *ref_id)
3433 {
3434         struct gpu_stmt_access *access;
3435
3436         for (access = accesses; access; access = access->next)
3437                 if (access->ref_id == ref_id)
3438                         return access;
3439
3440         return NULL;
3441 }
3442
3443 /* Return the index of the array called "name" in the list of arrays.
3444  */
3445 static int find_array_index(struct gpu_gen *gen, const char *name)
3446 {
3447         int i;
3448
3449         for (i = 0; i < gen->prog->n_array; ++i)
3450                 if (!strcmp(name, gen->prog->array[i].name))
3451                         return i;
3452
3453         return -1;
3454 }
3455
3456 /* Internal data structure for the index and AST expression transformation
3457  * callbacks for pet_stmt_build_ast_exprs.
3458  *
3459  * "accesses" is the list of gpu_stmt_access in the statement.
3460  * "iterator_map" expresses the statement iterators in terms of
3461  * the AST loop iterators.
3462  * "sched2shared" expresses the first shared_len dimensions of
3463  * the computed schedule in terms of the AST loop iterators.
3464  *
3465  * The following fields are set in transform_index and used in transform_expr.
3466  * "array" is the array that is being accessed.
3467  * "global" is set if the global array is accessed (rather than
3468  * shared/private memory).
3469  * "local_array" refers to information on the array specialized
3470  * to the current kernel.
3471  */
3472 struct ppcg_transform_data {
3473         struct gpu_gen *gen;
3474         struct gpu_stmt_access *accesses;
3475         isl_pw_multi_aff *iterator_map;
3476         isl_pw_multi_aff *sched2shared;
3477
3478         struct gpu_array_info *array;
3479         int global;
3480         struct gpu_local_array_info *local_array;
3481 };
3482
3483 /* Return the name of the outer array (of structs) accessed by "access".
3484  */
3485 static const char *get_outer_array_name(__isl_keep isl_map *access)
3486 {
3487         isl_space *space;
3488         const char *name;
3489
3490         space = isl_space_range(isl_map_get_space(access));
3491         while (space && isl_space_is_wrapping(space))
3492                 space = isl_space_domain(isl_space_unwrap(space));
3493         name = isl_space_get_tuple_name(space, isl_dim_set);
3494         isl_space_free(space);
3495
3496         return name;
3497 }
3498
3499 /* Index transformation callback for pet_stmt_build_ast_exprs.
3500  *
3501  * "index" expresses the array indices in terms of statement iterators
3502  *
3503  * We first reformulate "index" in terms of the AST loop iterators.
3504  * Then we check if we are accessing the global array or
3505  * a shared/private copy.  In the former case, we simply return
3506  * the updated index.  If "index" is an affine expression rather
3507  * than an array access, then we also return the updated index here.
3508  *
3509  * If no reference groups have been computed for the array,
3510  * then we can only be accessing the global array.
3511  *
3512  * Otherwise, we apply the tiling to the index.
3513  * This tiling is of the form
3514  *
3515  *      [D -> A] -> T
3516  *
3517  * The index is of the form
3518  *
3519  *      L -> A
3520  *
3521  * We update the tiling to refer to the AST loop iteratos
3522  *
3523  *      [L -> A] -> T
3524  *
3525  * and modify index to keep track of those iterators
3526  *
3527  *      L -> [L -> A]
3528  *
3529  * Combining these two yields a tiled index expression in terms
3530  * of the AST loop iterators
3531  *
3532  *      L -> T
3533  */
3534 static __isl_give isl_multi_pw_aff *transform_index(
3535         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
3536         void *user)
3537 {
3538         struct ppcg_transform_data *data = user;
3539         struct gpu_stmt_access *access;
3540         struct gpu_array_ref_group *group;
3541         struct gpu_array_tile *tile;
3542         isl_pw_multi_aff *iterator_map;
3543         int i;
3544         const char *name;
3545         isl_space *space;
3546         isl_multi_pw_aff *tiling;
3547         isl_pw_multi_aff *pma;
3548         isl_multi_pw_aff *mpa;
3549
3550         data->array = NULL;
3551
3552         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
3553         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
3554
3555         access = find_access(data->accesses, ref_id);
3556         if (!access)
3557                 return index;
3558         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
3559                 return index;
3560
3561         name = get_outer_array_name(access->access);
3562         i = find_array_index(data->gen, name);
3563         if (i < 0)
3564                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
3565                         "cannot find array",
3566                         return isl_multi_pw_aff_free(index));
3567         data->array = &data->gen->prog->array[i];
3568         data->local_array = &data->gen->kernel->array[i];
3569
3570         if (access->group < 0) {
3571                 data->global = 1;
3572                 return index;
3573         }
3574
3575         group = data->array->groups[access->group];
3576         tile = group->private_tile;
3577         if (!tile)
3578                 tile = group->shared_tile;
3579         data->global = !tile;
3580         if (!tile)
3581                 return index;
3582
3583         space = isl_space_range(isl_multi_pw_aff_get_space(index));
3584         space = isl_space_map_from_set(space);
3585         pma = isl_pw_multi_aff_identity(space);
3586         pma = isl_pw_multi_aff_product(
3587                         isl_pw_multi_aff_copy(data->sched2shared), pma);
3588         tiling = isl_multi_pw_aff_from_multi_aff(
3589                                     isl_multi_aff_copy(tile->tiling));
3590         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
3591
3592         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
3593         space = isl_space_map_from_set(space);
3594         mpa = isl_multi_pw_aff_identity(space);
3595         index = isl_multi_pw_aff_range_product(mpa, index);
3596         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
3597
3598         return index;
3599 }
3600
3601 /* Dereference "expr" by adding an index [0].
3602  * The original "expr" is assumed not to have any indices.
3603  *
3604  * If "expr" is a member access, then the dereferencing needs
3605  * to be applied to the structure argument of this member access.
3606  */
3607 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3608 {
3609         isl_ctx *ctx;
3610         isl_ast_expr *res;
3611         isl_ast_expr_list *list;
3612
3613         if (isl_ast_expr_get_op_type(expr) == isl_ast_op_member) {
3614                 isl_ast_expr *arg;
3615
3616                 arg = isl_ast_expr_get_op_arg(expr, 0);
3617                 arg = dereference(arg);
3618                 expr = isl_ast_expr_set_op_arg(expr, 0, arg);
3619
3620                 return expr;
3621         }
3622
3623         ctx = isl_ast_expr_get_ctx(expr);
3624         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3625         list = isl_ast_expr_list_from_ast_expr(res);
3626         res = isl_ast_expr_get_op_arg(expr, 0);
3627         res = isl_ast_expr_access(res, list);
3628         isl_ast_expr_free(expr);
3629
3630         return res;
3631 }
3632
3633 /* Linearize the index expression "expr" based on the array bounds
3634  * of "array".
3635  *
3636  * That is, transform expression
3637  *
3638  *      A[i_0][i_1]...[i_n]
3639  *
3640  * to
3641  *
3642  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
3643  *
3644  * where b_0, b_1, ..., b_n are the bounds on the array.
3645  *
3646  * If the base of "expr" is a member access, then the linearization needs
3647  * to be applied to the structure argument of this member access.
3648  */
3649 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
3650         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
3651 {
3652         int i, n;
3653         isl_ctx *ctx;
3654         isl_set *context;
3655         isl_ast_expr *arg0;
3656         isl_ast_expr *res;
3657         isl_ast_expr_list *list;
3658         isl_ast_build *build;
3659
3660         arg0 = isl_ast_expr_get_op_arg(expr, 0);
3661         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
3662             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
3663                 isl_ast_expr *arg;
3664
3665                 arg = isl_ast_expr_get_op_arg(arg0, 0);
3666                 arg = gpu_local_array_info_linearize_index(array, arg);
3667                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
3668                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
3669
3670                 return expr;
3671         }
3672         isl_ast_expr_free(arg0);
3673
3674         ctx = isl_ast_expr_get_ctx(expr);
3675         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
3676         build = isl_ast_build_from_context(context);
3677
3678         n = isl_ast_expr_get_op_n_arg(expr);
3679         res = isl_ast_expr_get_op_arg(expr, 1);
3680         for (i = 2; i < n; ++i) {
3681                 isl_pw_aff *bound_i;
3682                 isl_ast_expr *expr_i;
3683
3684                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i - 1);
3685                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
3686                 res = isl_ast_expr_mul(res, expr_i);
3687                 expr_i = isl_ast_expr_get_op_arg(expr, i);
3688                 res = isl_ast_expr_add(res, expr_i);
3689         }
3690
3691         isl_ast_build_free(build);
3692
3693         list = isl_ast_expr_list_from_ast_expr(res);
3694         res = isl_ast_expr_get_op_arg(expr, 0);
3695         res = isl_ast_expr_access(res, list);
3696
3697         isl_ast_expr_free(expr);
3698
3699         return res;
3700 }
3701
3702 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3703  *
3704  * If the AST expression refers to a global scalar that is not
3705  * a read-only scalar, then its address was passed to the kernel and
3706  * we need to dereference it.
3707  *
3708  * If the AST expression refers to an access to a global array,
3709  * then we linearize the access exploiting the bounds in data->local_array.
3710  */
3711 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3712         __isl_keep isl_id *id, void *user)
3713 {
3714         struct ppcg_transform_data *data = user;
3715
3716         if (!data->array)
3717                 return expr;
3718         if (gpu_array_is_read_only_scalar(data->array))
3719                 return expr;
3720         if (!data->global)
3721                 return expr;
3722         if (data->array->n_index == 0)
3723                 return dereference(expr);
3724         if (!data->array->linearize)
3725                 return expr;
3726
3727         return gpu_local_array_info_linearize_index(data->local_array, expr);
3728 }
3729
3730 /* This function is called for each instance of a user statement
3731  * in the kernel.
3732  *
3733  * We attach a struct ppcg_kernel_stmt to the "node", containing
3734  * a computed AST expression for each access.
3735  * These AST expressions are computed from iterator_map,
3736  * which expresses the domain
3737  * elements in terms of the generated loops, and sched2shared,
3738  * which expresses the first shared_len dimensions of the schedule
3739  * computed by PPCG in terms of the generated loops.
3740  */
3741 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3742         __isl_keep isl_ast_build *build, void *user)
3743 {
3744         struct ppcg_transform_data data;
3745         struct gpu_gen *gen = (struct gpu_gen *) user;
3746         struct ppcg_kernel_stmt *stmt;
3747         isl_id *id;
3748         isl_pw_multi_aff *sched2shared;
3749         isl_map *map;
3750         isl_pw_multi_aff *iterator_map;
3751         isl_ast_expr *expr, *arg;
3752         isl_union_map *schedule;
3753         int i, n;
3754         struct gpu_stmt_access *access;
3755
3756         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3757         if (!stmt)
3758                 return isl_ast_node_free(node);
3759
3760         expr = isl_ast_node_user_get_expr(node);
3761         arg = isl_ast_expr_get_op_arg(expr, 0);
3762         id = isl_ast_expr_get_id(arg);
3763
3764         schedule = isl_ast_build_get_schedule(build);
3765         map = isl_map_reverse(isl_map_from_union_map(schedule));
3766         iterator_map = isl_pw_multi_aff_from_map(map);
3767         sched2shared = compute_sched_to_shared(gen,
3768                                         isl_pw_multi_aff_copy(iterator_map));
3769
3770         stmt->type = ppcg_kernel_domain;
3771         stmt->u.d.stmt = find_stmt(gen->prog, id);
3772         if (!stmt->u.d.stmt)
3773                 goto error;
3774
3775         data.gen = gen;
3776         data.accesses = stmt->u.d.stmt->accesses;
3777         data.iterator_map = iterator_map;
3778         data.sched2shared = sched2shared;
3779         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
3780                                             build, &transform_index, &data,
3781                                             &transform_expr, &data);
3782
3783         isl_id_free(id);
3784         isl_pw_multi_aff_free(iterator_map);
3785         isl_pw_multi_aff_free(sched2shared);
3786         isl_ast_expr_free(arg);
3787         isl_ast_expr_free(expr);
3788
3789         id = isl_id_alloc(gen->ctx, NULL, stmt);
3790         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3791         return isl_ast_node_set_annotation(node, id);
3792 error:
3793         isl_id_free(id);
3794         isl_pw_multi_aff_free(iterator_map);
3795         ppcg_kernel_stmt_free(stmt);
3796         isl_pw_multi_aff_free(sched2shared);
3797         return isl_ast_node_free(node);
3798 }
3799
3800 /* This function is called when code has been generated for the shared
3801  * tile loops.  The "schedule" refers only to the original statements.
3802  *
3803  * We extend the schedule with that part of gen->local_sched that hasn't
3804  * been taken into account yet.  This introduces parameters referring
3805  * to thread ids in the schedule, so we add them (with the appropriate
3806  * bounds to the context as well).
3807  * Finally, we set the appropriate unrolling options
3808  * if gen->first_unroll is set.
3809  */
3810 static __isl_give isl_ast_node *create_domain_leaf(
3811         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
3812         void *user)
3813 {
3814         struct gpu_gen *gen = (struct gpu_gen *) user;
3815         isl_space *space;
3816         isl_union_map *sched;
3817         isl_ast_node *tree;
3818         isl_set *set;
3819         isl_id_list *iterators;
3820         int n;
3821
3822         schedule = extend_schedule(schedule,
3823                         isl_union_map_copy(gen->local_sched),
3824                         gen->shared_len, gen->thread_tiled_len);
3825
3826         space = isl_ast_build_get_schedule_space(build);
3827         set = isl_set_universe(space);
3828         set = add_bounded_parameters(set, gen->kernel->n_block,
3829                                         gen->kernel->block_dim, "t");
3830         build = isl_ast_build_restrict(build, set);
3831
3832         n = gen->thread_tiled_len - gen->shared_len;
3833
3834         if (gen->first_unroll >= 0) {
3835                 space = isl_space_set_alloc(gen->ctx, 0, n);
3836                 build = set_unroll(build, space, gen->first_unroll);
3837         }
3838         iterators = generate_names(gen->ctx, n, "c");
3839         build = isl_ast_build_set_iterators(build, iterators);
3840         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
3841         tree = isl_ast_build_ast_from_schedule(build, schedule);
3842         isl_ast_build_free(build);
3843
3844         return tree;
3845 }
3846
3847 /* This function is called for each statement node in the AST of the code
3848  * for copying to or from shared/private memory.
3849  * Attach a pointer to a ppcg_kernel_stmt representing the copy
3850  * statement to the node.
3851  * The statement name is "read" or "write", depending on whether we are
3852  * reading from global memory or writing to global memory.
3853  * The name of the T space is {shared,private}_<array>.
3854  *
3855  * The schedule is of the form
3856  *
3857  *      type[A -> T] -> L
3858  *
3859  * where A refers to a piece of an array and T to the corresponding
3860  * shifted tile.  We split this schedule into mappings L -> A and L -> T
3861  * and store the corresponding expressions in stmt->index and stmt->local_index,
3862  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
3863  */
3864 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
3865         __isl_keep isl_ast_build *build, void *user)
3866 {
3867         struct gpu_gen *gen = (struct gpu_gen *) user;
3868         struct ppcg_kernel_stmt *stmt;
3869         isl_id *id;
3870         isl_ast_expr *expr;
3871         isl_space *space;
3872         isl_map *access, *local_access, *map;
3873         isl_pw_multi_aff *pma;
3874         const char *type;
3875         int array_index;
3876
3877         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3878         if (!stmt)
3879                 return isl_ast_node_free(node);
3880
3881         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
3882         type = isl_map_get_tuple_name(access, isl_dim_in);
3883         stmt->u.c.read = !strcmp(type, "read");
3884         access = isl_map_reverse(access);
3885         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
3886         local_access = isl_map_copy(access);
3887
3888         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
3889         id = isl_map_get_tuple_id(access, isl_dim_out);
3890         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3891         access = isl_map_apply_range(access, map);
3892         pma = isl_pw_multi_aff_from_map(access);
3893         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3894         stmt->u.c.index = expr;
3895
3896         map = isl_map_range_map(isl_map_universe(space));
3897         id = isl_map_get_tuple_id(local_access, isl_dim_out);
3898         map = isl_map_set_tuple_id(map, isl_dim_in, id);
3899         local_access = isl_map_apply_range(local_access, map);
3900         pma = isl_pw_multi_aff_from_map(local_access);
3901         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
3902         stmt->u.c.local_index = expr;
3903
3904         stmt->u.c.array = gen->copy_group->array;
3905         array_index = stmt->u.c.array - gen->prog->array;
3906         stmt->u.c.local_array = &gen->kernel->array[array_index];
3907         stmt->type = ppcg_kernel_copy;
3908
3909         id = isl_id_alloc(gen->ctx, NULL, stmt);
3910         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3911         return isl_ast_node_set_annotation(node, id);
3912 }
3913
3914 /* Given a schedule of the form
3915  *
3916  *      [S -> A] -> L
3917  *
3918  * (with S the first shared_len dimensions of the computed schedule,
3919  * A the array and L the schedule correponding to the generated loops),
3920  * indicating where to copy the array elements that need to be copied,
3921  * construct code for performing the copying.
3922  *
3923  * "group" is the array reference group that is being copied
3924  * "type" is either "read" or "write"
3925  * private is set if copying needs to be performed to/from registers
3926  *
3927  * We first construct a mapping to a shifted tile of the array,
3928  *
3929  *      [S -> A] -> T(S,A)                                      (1)
3930  *
3931  * If private is set, then we also use this mapping as a schedule
3932  * (which is already thread-specific and will be completely unrolled).
3933  * Otherwise, we wrap/tile the range over the threads.
3934  * The result is
3935  *
3936  *      [S -> A] -> T'(S,A)
3937  *
3938  * Combined with the given schedule, we have
3939  *
3940  *      [S -> A] -> [L -> T'(S,A)]                              (2)
3941  *
3942  * From the shifted tile mapping, we construct a mapping
3943  *
3944  *      [S -> A] -> [A -> T(S,A)]
3945  *
3946  * and apply it to the schedule (2), obtaining
3947  *
3948  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
3949  *
3950  * Note that we can project out S because it is uniquely defined by L.
3951  */
3952 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
3953         __isl_take isl_map *sched,
3954         const char *type, struct gpu_array_ref_group *group,
3955         __isl_take isl_ast_build *build, int private)
3956 {
3957         isl_space *space;
3958         isl_ast_node *tree;
3959         isl_map *schedule, *shift, *map;
3960         isl_set *set;
3961         isl_id_list *iterators;
3962         int n;
3963
3964         shift = shift_access(group);
3965
3966         schedule = isl_map_copy(shift);
3967         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
3968         if (!private)
3969                 schedule = tile_access_schedule(gen, schedule);
3970
3971         n = isl_map_dim(schedule, isl_dim_out);
3972         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
3973         set = add_bounded_parameters(set, gen->kernel->n_block,
3974                                         gen->kernel->block_dim, "t");
3975
3976         schedule = isl_map_range_product(sched, schedule);
3977
3978         space = isl_space_domain(isl_map_get_space(shift));
3979         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
3980         map = isl_map_range_product(map, shift);
3981
3982         schedule = isl_map_apply_domain(schedule, map);
3983
3984         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
3985
3986         build = isl_ast_build_restrict(build, set);
3987
3988         gen->copy_group = group;
3989
3990         if (private) {
3991                 space = isl_space_range(isl_map_get_space(schedule));
3992                 space = isl_space_range(isl_space_unwrap(space));
3993                 build = set_unroll(build, space, 0);
3994         }
3995         iterators = generate_names(gen->ctx, n, "c");
3996         build = isl_ast_build_set_iterators(build, iterators);
3997         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
3998         tree = isl_ast_build_ast_from_schedule(build,
3999                                             isl_union_map_from_map(schedule));
4000         isl_ast_build_free(build);
4001
4002         return tree;
4003 }
4004
4005 /* Return code for reading into or writing from shared memory
4006  * the given array reference group.
4007  *
4008  * If we are performing a read from global memory to shared memory and
4009  * if the array involved is not a scalar, then we copy
4010  * the entire tile to shared memory.  This may result in some extra
4011  * elements getting copied, but it should lead to simpler code
4012  * (which means that fewer registers may be needed) and less divergence.
4013  *
4014  * Otherwise, we only copy the elements that will be read or have been written
4015  * in the kernel.
4016  *
4017  *
4018  * The input "sched" is of the form.
4019  *
4020  *      type[S -> A] -> L
4021  *
4022  * with S the first shared_len dimensions of the computed schedule,
4023  * A the array and L the schedule correponding to the generated loops.
4024  *
4025  * We first drop "type",
4026  *
4027  *      [S -> A] -> L
4028  *
4029  * If the above conditions are satisfied, we project out A,
4030  * resulting in
4031  *
4032  *      S -> L
4033  *
4034  * and then introduce the group tile [S -> T], resulting in
4035  *
4036  *      [S -> T] -> L
4037  */
4038 static __isl_give isl_ast_node *copy_group_shared_accesses(
4039         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4040         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4041 {
4042         const char *type;
4043         int read;
4044         isl_union_map *access;
4045
4046         type = isl_map_get_tuple_name(sched, isl_dim_in);
4047         read = !strcmp(type, "read");
4048
4049         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4050
4051         if (read && !gpu_array_is_scalar(group->array)) {
4052                 isl_space *space;
4053                 isl_map *map;
4054
4055                 space = isl_space_domain(isl_map_get_space(sched));
4056                 space = isl_space_unwrap(space);
4057                 map = isl_map_domain_map(isl_map_universe(space));
4058                 sched = isl_map_apply_domain(sched, map);
4059
4060                 map = group_tile(group);
4061                 map = isl_map_reverse(isl_map_domain_map(map));
4062                 sched = isl_map_apply_domain(sched, map);
4063         }
4064
4065         return copy_access(gen, sched, type, group, build, 0);
4066 }
4067
4068 /* Return code for reading into or writing from private memory
4069  * the given array reference group.
4070  *
4071  * Let S be the first shared_len dimensions of the computed schedule,
4072  * D the iteration domains, A the array and L the schedule correponding
4073  * to the generated loops.
4074  * "sched" is of the form
4075  *
4076  *      type[S -> A] -> L
4077  *
4078  * where type is either "read" or "write".
4079  * We apply the privatization D -> S(t), with t the thread ids,
4080  * to the access relation D -> A to obtain the privatized access relation
4081  *
4082  *      S(t) -> A
4083  *
4084  * We drop the type from "sched" and intersect with the privatized access
4085  * relation to obtain
4086  *
4087  *      [S(t) -> A] -> L
4088  */
4089 static __isl_give isl_ast_node *copy_group_private_accesses(
4090         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4091         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4092 {
4093         const char *type;
4094         int read;
4095         isl_union_map *priv;
4096         isl_union_map *access;
4097         isl_map *access_map;
4098
4099         type = isl_map_get_tuple_name(sched, isl_dim_in);
4100         read = !strcmp(type, "read");
4101
4102         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
4103         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
4104                                         priv);
4105
4106         access = group_access_relation(group, read, !read);
4107         access = isl_union_map_apply_domain(access, priv);
4108         access_map = isl_map_from_union_map(access);
4109
4110         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4111         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
4112
4113         return copy_access(gen, sched, type, group, build, 1);
4114 }
4115
4116 /* Return code for reading into or writing from shared or private memory.
4117  *
4118  * "schedule" is of the form
4119  *
4120  *      type[S -> A] -> L
4121  *
4122  * with S be the first shared_len dimensions of the computed schedule,
4123  * A the array and L the schedule correponding to the generated loops.
4124  * The array reference group is attached to "type".
4125  */
4126 static __isl_give isl_ast_node *create_access_leaf(
4127         struct gpu_gen *gen, __isl_take isl_map *schedule,
4128         __isl_take isl_ast_build *build)
4129 {
4130         struct gpu_array_ref_group *group;
4131         isl_id *id;
4132
4133         id = isl_map_get_tuple_id(schedule, isl_dim_in);
4134         group = isl_id_get_user(id);
4135         isl_id_free(id);
4136
4137         if (group->private_tile)
4138                 return copy_group_private_accesses(gen, group, schedule,
4139                                                         build);
4140         else
4141                 return copy_group_shared_accesses(gen, group, schedule,
4142                                                         build);
4143 }
4144
4145 /* Create a domain node representing a synchronization.
4146  */
4147 static __isl_give isl_ast_node *create_sync_leaf(
4148         struct gpu_gen *gen, __isl_take isl_map *schedule,
4149         __isl_take isl_ast_build *build)
4150 {
4151         struct ppcg_kernel_stmt *stmt;
4152         isl_id *id;
4153         isl_space *space;
4154         isl_ast_node *node;
4155         isl_ast_expr *expr;
4156
4157         isl_map_free(schedule);
4158
4159         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4160         if (!stmt)
4161                 return NULL;
4162
4163         stmt->type = ppcg_kernel_sync;
4164
4165         space = isl_ast_build_get_schedule_space(build);
4166         space = isl_space_from_domain(space);
4167         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
4168         expr = isl_ast_build_call_from_pw_multi_aff(build,
4169                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
4170         node = isl_ast_node_alloc_user(expr);
4171         isl_ast_build_free(build);
4172
4173         id = isl_id_alloc(gen->ctx, NULL, stmt);
4174         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4175         return isl_ast_node_set_annotation(node, id);
4176 }
4177
4178 /* This function is called during the code generation at the point
4179  * where the schedule domain element is completely determined by
4180  * the generated code.  The input schedule contains the original
4181  * statements as well as synchronization and copy "statements".
4182  * The latter are scheduled at different points than any of the original
4183  * statements, so they will only arrive here in isolation.
4184  *
4185  * If the current schedule only refers to a single statement,
4186  * we check if it is a copy or synchronization statement and
4187  * call the appropriate functions.
4188  * Otherwise, we assume we are dealing with the original statements
4189  * and we call create_domain_leaf.
4190  */
4191 static __isl_give isl_ast_node *create_kernel_leaf(
4192         __isl_take isl_ast_build *build, void *user)
4193 {
4194         struct gpu_gen *gen = (struct gpu_gen *) user;
4195         isl_map *map;
4196         isl_union_map *schedule;
4197         const char *name;
4198
4199         schedule = isl_ast_build_get_schedule(build);
4200
4201         if (isl_union_map_n_map(schedule) != 1)
4202                 return create_domain_leaf(schedule, build, user);
4203
4204         map = isl_map_from_union_map(schedule);
4205         name = isl_map_get_tuple_name(map, isl_dim_in);
4206         if (!strcmp(name, "read") || !strcmp(name, "write"))
4207                 return create_access_leaf(gen, map, build);
4208         if (!strcmp(name, "sync"))
4209                 return create_sync_leaf(gen, map, build);
4210
4211         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4212 }
4213
4214 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4215  * have value 0) and all even schedule dimensions as "unroll".
4216  *
4217  * That is, the options look as follows
4218  *
4219  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4220  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4221  *
4222  * The even positions are used to be able to schedule copying blocks
4223  * and synchronization before or after each level of the shared memory
4224  * tile loops and we want to make sure that code for these is generated
4225  * separately (within each level).
4226  */
4227 static __isl_give isl_ast_build *set_atomic_and_unroll(
4228         __isl_take isl_ast_build *build,
4229         __isl_take isl_space *space, int sched_len)
4230 {
4231         isl_ctx *ctx;
4232         isl_map *map;
4233         isl_constraint *c;
4234         isl_union_map *opt;
4235         isl_local_space *ls;
4236         int i, n;
4237
4238         ctx = isl_ast_build_get_ctx(build);
4239
4240         space = isl_space_params(space);
4241         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4242         space = isl_space_from_domain(space);
4243         space = isl_space_add_dims(space, isl_dim_out, 2);
4244         map = isl_map_universe(isl_space_copy(space));
4245         for (i = 0; i < sched_len; i += 2)
4246                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4247         ls = isl_local_space_from_space(isl_map_get_space(map));
4248         c = isl_equality_alloc(ls);
4249         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4250         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4251         c = isl_constraint_set_constant_si(c, 1);
4252         map = isl_map_add_constraint(map, c);
4253         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4254         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4255         opt = isl_union_map_from_map(map);
4256
4257         map = isl_map_universe(space);
4258         ls = isl_local_space_from_space(isl_map_get_space(map));
4259         c = isl_equality_alloc(ls);
4260         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4261         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4262         map = isl_map_add_constraint(map, c);
4263         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4264         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4265         opt = isl_union_map_add_map(opt, map);
4266
4267         build = isl_ast_build_set_options(build, opt);
4268
4269         return build;
4270 }
4271
4272 /* Return a map that maps a space of dimension gen->shared_len
4273  * to its last dimensions starting at gen->tile_first.
4274  * The range is of dimension
4275  *
4276  *      2 * (gen->shared_len - gen->tile_first) + 1
4277  *
4278  * The input dimensions are mapped to the odd dimensions in the output,
4279  * while the even dimensions (except 2*pos) are fixed to 0.
4280  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4281  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4282  * are mapped to the output.  The remaining input dimensions are projected
4283  * out and the corresponding output dimensions are fixed to 0.
4284  */
4285 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4286         __isl_take isl_space *space, int pos, int val)
4287 {
4288         int i, n;
4289         isl_map *proj;
4290
4291         space = isl_space_set_from_params(space);
4292         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4293         space = isl_space_map_from_set(space);
4294         proj = isl_map_identity(space);
4295         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4296         n = gen->shared_len - gen->tile_first;
4297         for (i = 0; i <= n; ++i) {
4298                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4299                 if (i == pos)
4300                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4301                 else
4302                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4303         }
4304
4305         if (pos < 0)
4306                 return proj;
4307
4308         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4309                                 gen->shared_len - (gen->tile_first + pos));
4310         for (i = pos; i < n; ++i)
4311                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4312
4313         return proj;
4314 }
4315
4316 /* Given the AST context schedule "schedule" and the mapping from
4317  * domains to the shared tile loops "shared_sched", add a schedule
4318  * for a synchronization operation at position "val" of loop level "pos".
4319  *
4320  * schedule is of the form
4321  *
4322  *      D -> L
4323  *
4324  * (with D the iteration domains and L the already generated loops),
4325  * while shared_sched is of the form
4326  *
4327  *      D -> S
4328  *
4329  * We combine them into
4330  *
4331  *      L -> S
4332  *
4333  * apply a mapping
4334  *
4335  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4336  *
4337  * and use the result as a schedule for "sync".
4338  */
4339 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4340         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4341         __isl_keep isl_union_map *shared_sched, int pos, int val)
4342 {
4343         isl_space *space;
4344         isl_map *proj, *map;
4345
4346         shared_sched = isl_union_map_copy(shared_sched);
4347         schedule = isl_union_map_copy(schedule);
4348
4349         space = isl_union_map_get_space(shared_sched);
4350         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4351         map = isl_map_from_union_map(schedule);
4352
4353         proj = insert_even(gen, space, pos, val);
4354         map = isl_map_apply_range(map, proj);
4355         map = isl_map_from_range(isl_map_wrap(map));
4356         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4357
4358         res = isl_union_map_add_map(res, map);
4359
4360         return res;
4361 }
4362
4363 /* Given a set of wrapped references "ref", return the corresponding
4364  * access relations based on the tagged access relations "tagged".
4365  *
4366  * The elements of "ref" are of the form
4367  *
4368  *      [D -> R]
4369  *
4370  * with D an iteration domains and R a reference.
4371  * The elements of "tagged" are of the form
4372  *
4373  *      [D -> R] -> A
4374  *
4375  * with A an array.
4376  *
4377  * Extend "tagged" to include the iteration domain in the range, i.e.,
4378  *
4379  *      [D -> R] -> [D -> A]
4380  *
4381  * apply the result to "ref" and then unwrap the resulting set
4382  * to obtain relations of the form
4383  *
4384  *      D -> A
4385  */
4386 static __isl_give isl_union_map *wrapped_reference_to_access(
4387         __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
4388 {
4389         isl_union_map *tag2access;
4390
4391         tag2access = isl_union_map_copy(tagged);
4392         tag2access = isl_union_map_universe(tag2access);
4393         tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
4394         tag2access = isl_union_map_domain_map(tag2access);
4395         tag2access = isl_union_map_range_product(tag2access, tagged);
4396
4397         ref = isl_union_set_coalesce(ref);
4398         ref = isl_union_set_apply(ref, tag2access);
4399
4400         return isl_union_set_unwrap(ref);
4401 }
4402
4403 /* Given an access relation "access" from "group", remove those reads
4404  * if ("read" is 1) or writes (if "read" is 0) that are only needed to
4405  * communicate data within the same iteration of the last_shared dimension
4406  * of the group.
4407  *
4408  * If the access is a read then it is necessarily an element of
4409  *
4410  *      live_in union (range flow)
4411  *
4412  * where live_in and flow may be overapproximations.
4413  * If the access is a write then it is necessarily an element of
4414  *
4415  *      live_out union (domain flow)
4416  *
4417  * In both cases, the access relation is also a subset of
4418  * the group access relation.
4419  *
4420  * Essentially, we compute the intersection of "access" with either
4421  *
4422  *      live_in union (range non-local-flow)
4423  *
4424  * or
4425  *
4426  *      live_out union (domain non-local-flow)
4427  *
4428  * We first construct a relation "local"
4429  *
4430  *      [[D -> R] -> [D' -> R']]
4431  *
4432  * of pairs of domain iterations accessing the reference group
4433  * and references in the group that are scheduled to the same iteration
4434  * of the last_shared dimension.
4435  *
4436  * If this relation does not intersect the dataflow dependences,
4437  * then there is nothing we can possibly remove and we simply
4438  * return the input.
4439  *
4440  * Otherwise, we remove the "local" dataflow dependences from
4441  * the set of all dataflow dependences.
4442  * Note that if the potential dataflow dependences are an overapproximation
4443  * of the actual dataflow dependences, then the result remains an
4444  * overapproximation of the non-local dataflow dependences.
4445  * Copying to/from global memory is only needed for the references
4446  * in the domain/range of the result or for accesses that are live out/in
4447  * for the entire scop.
4448  *
4449  * We therefore map the domain/range of the "external" relation
4450  * to the corresponding access relation and take the union with
4451  * the live out/in relation.
4452  */
4453 static __isl_give isl_union_map *remove_local_accesses(struct gpu_gen *gen,
4454         struct gpu_array_ref_group *group, __isl_take isl_union_map *access,
4455         int read)
4456 {
4457         int empty;
4458         isl_union_map *tagger;
4459         isl_union_set *domain;
4460         isl_space *space;
4461         isl_union_map *sched, *local, *tagged, *external;
4462         isl_union_set *tag_set;
4463         isl_map *proj;
4464
4465         if (isl_union_map_is_empty(access))
4466                 return access;
4467
4468         tagged = group_tagged_access_relation(group);
4469
4470         sched = isl_union_map_copy(gen->sched);
4471
4472         space = isl_union_map_get_space(sched);
4473         proj = projection(space, gen->untiled_len, group->last_shared + 1);
4474         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4475
4476         tagger = isl_union_map_copy(gen->prog->scop->tagger);
4477         domain = isl_union_map_domain(isl_union_map_copy(tagged));
4478         tagger = isl_union_map_intersect_range(tagger, domain);
4479         sched = isl_union_map_apply_domain(sched, tagger);
4480
4481         local = isl_union_map_apply_range(sched,
4482                             isl_union_map_reverse(isl_union_map_copy(sched)));
4483         local = isl_union_map_intersect(local,
4484                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow));
4485
4486         empty = isl_union_map_is_empty(local);
4487         if (empty < 0 || empty) {
4488                 isl_union_map_free(tagged);
4489                 isl_union_map_free(local);
4490                 if (empty < 0)
4491                         return isl_union_map_free(access);
4492                 return access;
4493         }
4494
4495         external = isl_union_map_copy(gen->prog->scop->tagged_dep_flow);
4496         external = isl_union_map_intersect_params(external,
4497                                 isl_set_copy(gen->prog->scop->context));
4498         external = isl_union_map_subtract(external, local);
4499
4500         if (read) {
4501                 tag_set = isl_union_map_range(external);
4502                 external = wrapped_reference_to_access(tag_set, tagged);
4503                 external = isl_union_map_union(external,
4504                                 isl_union_map_copy(gen->prog->scop->live_in));
4505         } else {
4506                 tag_set = isl_union_map_domain(external);
4507                 external = wrapped_reference_to_access(tag_set, tagged);
4508                 external = isl_union_map_union(external,
4509                                 isl_union_map_copy(gen->prog->scop->live_out));
4510         }
4511
4512         access = isl_union_map_intersect(access, external);
4513
4514         return access;
4515 }
4516
4517 /* Given the AST context schedule "schedule" and the mapping from
4518  * domains to the shared tile loops "shared_sched", add a schedule
4519  * for copying an array reference group to/from shared/private memory.
4520  * "read" is set if data should be copied from global memory
4521  * to shared/private memory.
4522  * "k" represents the current group
4523  * "s" is the total number of groups
4524  *
4525  * We schedule an operation before or after the innermost loop
4526  * of "shared_sched" that affects the tile of the array reference group.
4527  *
4528  * schedule is of the form
4529  *
4530  *      D -> L
4531  *
4532  * (with D the iteration domains and L the already generated loops),
4533  * while shared_sched is of the form
4534  *
4535  *      D -> S
4536  *
4537  * We first compute the access relation for the reference group
4538  *
4539  *      D -> A
4540  *
4541  * and remove from this access relation those reads or writes
4542  * that only needed to communicate data within the same iteration
4543  * of the last_shared dimension of the group.
4544  * We then combine what is left with shared_sched into
4545  *
4546  *      D -> [S -> A]
4547  *
4548  * If this results in an empty relation, no copying needs to be performed
4549  * at this point.
4550  * Otherwise, we invert the relation and combine it with "schedule" into
4551  *
4552  *      [S -> A] -> L
4553  *
4554  * The actual additional piece of the schedule is obtained from combining
4555  *
4556  *      [S -> A] -> S
4557  *
4558  * with a mapping
4559  *
4560  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4561  *
4562  * The position of "val" corresponds to the innermost loop that affects
4563  * the tile and the value indicates where the copying is scheduled
4564  * with respect to the actual kernel code (at value 0).
4565  * Reads are schedule before the code, writes to global memory from
4566  * private memory are scheduled at values 1 to s, writes to global
4567  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4568  *
4569  * If we are scheduling a read from global memory to shared memory,
4570  * we insert a synchronization before the kernel code (at the innermost
4571  * level).
4572  * If we are scheduling a write to global memory, then we add
4573  * a synchronization after all writes (at value 2 *s + 2).
4574  * However, there is no need for a synchronization after the outermost loop.
4575  * A write to global memory from private memory at the innermost level
4576  * does not require a synchronization, because it is covered by
4577  * the synchronization after the kernel inserted by body_schedule.
4578  */
4579 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4580         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4581         __isl_keep isl_union_map *shared_sched,
4582         struct gpu_array_ref_group *group, int read, int k, int s)
4583 {
4584         int n;
4585         int pos, val;
4586         isl_space *space;
4587         isl_union_map *access;
4588         isl_map *map, *proj, *access_map;
4589         isl_id *id;
4590
4591         access = group_access_relation(group, read, !read);
4592         access = remove_local_accesses(gen, group, access, read);
4593         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4594                                                 access);
4595
4596         if (isl_union_map_is_empty(access)) {
4597                 isl_union_map_free(access);
4598                 return res;
4599         }
4600
4601         access = isl_union_map_reverse(access);
4602         access = isl_union_map_apply_range(access,
4603                                             isl_union_map_copy(schedule));
4604         access_map = isl_map_from_union_map(access);
4605
4606         space = isl_space_copy(group->array->space);
4607         space = isl_space_from_range(space);
4608         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4609         map = isl_map_domain_map(isl_map_universe(space));
4610
4611         space = isl_union_map_get_space(schedule);
4612         pos = group->last_shared + 1 - gen->tile_first;
4613         assert(pos >= 0);
4614         if (read)
4615                 val = -2 - k;
4616         else if (group->private_tile)
4617                 val = 1 + k;
4618         else
4619                 val = 1 + s + 1 + k;
4620         proj = insert_even(gen, space, pos, val);
4621         map = isl_map_apply_range(map, proj);
4622
4623         access_map = isl_map_range_product(access_map, map);
4624
4625         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4626         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4627
4628         res = isl_union_map_add_map(res, access_map);
4629
4630         n = gen->shared_len - gen->tile_first;
4631         if (read) {
4632                 if (!group->private_tile)
4633                         res = add_sync_schedule(gen, res, schedule,
4634                                                 shared_sched, n, -1);
4635         } else {
4636                 if (pos == 0)
4637                         return res;
4638                 if (pos == n && group->private_tile)
4639                         return res;
4640                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4641                                         pos, 2 * s + 2);
4642         }
4643
4644         return res;
4645 }
4646
4647 /* Return a schedule for the shared tile loops based on the current
4648  * AST context schedule.
4649  *
4650  * We create a "shared_sched" that maps the domains to the first
4651  * shared_len dimensions of the computed schedule, project out the
4652  * first tile_first dimensions (as these are already covered by
4653  * the host code) and insert "statement-level" dimensions at even
4654  * positions so that we can schedule copy blocks and synchronization
4655  * before/after each level.
4656  *
4657  * In particular, copy blocks are inserted inside the innermost
4658  * level that affect the tile.  For the copying to global memory,
4659  * those from private memory are scheduled before those from shared
4660  * memory such that synchronization can be inserted between the two
4661  * at the innermost level.
4662  * Synchronization is inserted at the innermost level before the
4663  * actual kernel code if there is any copying from global memory
4664  * to shared memory.  It is inserted unconditionally at the innermost
4665  * level after the actual kernel code and the copying to global memory
4666  * from private memory (if any).  Finally, it is inserted after
4667  * any copying to global memory, except at the outermost level
4668  * and at the innermost level if there is no copying from shared
4669  * memory.  The copying from private memory is covered by the unconditional
4670  * synchronization at the innermost level.
4671  */
4672 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4673         __isl_take isl_union_map *schedule)
4674 {
4675         isl_space *space;
4676         isl_union_map *res;
4677         isl_union_map *shared_sched;
4678         isl_union_map *sched;
4679         isl_map *proj, *map;
4680         int i, j, k, s;
4681
4682         shared_sched = isl_union_map_copy(gen->tiled_sched);
4683         proj = projection(isl_union_map_get_space(shared_sched),
4684                                 gen->tiled_len, gen->shared_len);
4685         shared_sched = isl_union_map_apply_range(shared_sched,
4686                                 isl_union_map_from_map(proj));
4687         space = isl_union_map_get_space(shared_sched);
4688         proj = insert_even(gen, space, -1, 0);
4689         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4690                                 isl_union_map_from_map(proj));
4691
4692         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4693
4694         s = 0;
4695         for (i = 0; i < gen->prog->n_array; ++i)
4696                 s += gen->prog->array[i].n_group;
4697
4698         k = 0;
4699         for (i = 0; i < gen->prog->n_array; ++i) {
4700                 struct gpu_array_info *array = &gen->prog->array[i];
4701
4702                 for (j = 0; j < array->n_group; ++j) {
4703                         struct gpu_array_ref_group *group;
4704
4705                         group = array->groups[j];
4706                         if (!group->private_tile && !group->shared_tile)
4707                                 continue;
4708                         res = add_group_schedule(gen, res, schedule,
4709                                                 shared_sched, group, 0, k, s);
4710                         res = add_group_schedule(gen, res, schedule,
4711                                                 shared_sched, group, 1, k, s);
4712                         ++k;
4713                 }
4714         }
4715
4716         res = add_sync_schedule(gen, res, schedule, shared_sched,
4717                             gen->shared_len - gen->tile_first, 1 + s);
4718
4719         isl_union_map_free(shared_sched);
4720         isl_union_map_free(schedule);
4721
4722         return res;
4723 }
4724
4725 /* Generate code for "kernel" in the given "context".
4726  *
4727  * We first generate code for the shared tile loops (T1T, T1P and T2)
4728  * in a context that includes the block ids.
4729  * Within each iteration of these loops an additional code generation
4730  * is performed (within create_kernel_leaf) for the rest of the schedule
4731  * in a context that includes the thread ids.
4732  */
4733 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
4734         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
4735         __isl_keep isl_multi_pw_aff *grid_size)
4736 {
4737         isl_space *space;
4738         isl_set *set;
4739         isl_id_list *iterators;
4740         isl_union_map *schedule;
4741         isl_ast_node *tree;
4742         int sched_len;
4743
4744         schedule = isl_ast_build_get_schedule(build);
4745
4746         build = isl_ast_build_copy(build);
4747         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
4748         space = isl_ast_build_get_schedule_space(build);
4749         set = isl_set_universe(isl_space_copy(space));
4750         set = add_bounded_parameters_dynamic(set, grid_size, "b");
4751         build = isl_ast_build_restrict(build, set);
4752
4753         schedule = body_schedule(gen, schedule);
4754
4755         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
4756
4757         build = set_atomic_and_unroll(build, space, sched_len);
4758         iterators = generate_names(gen->ctx, sched_len, "g");
4759         build = isl_ast_build_set_iterators(build, iterators);
4760         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
4761         tree = isl_ast_build_ast_from_schedule(build, schedule);
4762         isl_ast_build_free(build);
4763
4764         return tree;
4765 }
4766
4767 /* Attach "id" to the given node.
4768  */
4769 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
4770         __isl_keep isl_ast_build *build, void *user)
4771 {
4772         isl_id *id = user;
4773
4774         node = isl_ast_node_set_annotation(node, id);
4775
4776         return node;
4777 }
4778
4779 /* Construct an AST node for performing a kernel launch and attach
4780  * the information about the kernel to that node.
4781  *
4782  * The kernel AST has been constructed in the context of the range
4783  * of "schedule".  In particular, the grid size has been computed
4784  * in the context.  We therefore still need to make sure that these
4785  * constraints are expressed in the code.  We do this by creating a schedule
4786  *
4787  *      kernel[] -> [S -> []]
4788  *
4789  * where S is the schedule domain, i.e., the range of "schedule".
4790  * The AST generation will then create a single call surrounded by
4791  * all the condition in "S" that have not been expressed yet.
4792  *
4793  * The kernel information is attached to this node in attach_id.
4794  */
4795 static __isl_give isl_ast_node *construct_launch(
4796         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
4797         __isl_take struct ppcg_kernel *kernel)
4798 {
4799         isl_id *id;
4800         isl_ctx *ctx;
4801         isl_union_set *domain;
4802         isl_set *set;
4803         isl_map *map;
4804         isl_ast_node *node;
4805
4806         ctx = isl_ast_build_get_ctx(build);
4807
4808         id = isl_id_alloc(ctx, NULL, kernel);
4809         id = isl_id_set_free_user(id, &ppcg_kernel_free);
4810
4811         domain = isl_union_map_range(schedule);
4812         set = isl_set_from_union_set(domain);
4813         map = isl_map_from_domain(set);
4814         map = isl_map_from_range(isl_map_wrap(map));
4815         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
4816         schedule = isl_union_map_from_map(map);
4817
4818         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
4819         node = isl_ast_build_ast_from_schedule(build, schedule);
4820         isl_ast_build_free(build);
4821
4822         return node;
4823 }
4824
4825 /* This function is called for each leaf in the AST of the host code.
4826  * We first specialize the schedule to the site of the leaf, compute
4827  * the size of shared memory and then construct the body of the host code
4828  * and the associated kernel.
4829  *
4830  * The necessary information for printing the kernel launch is
4831  * stored in a struct ppcg_kernel and attached to the leaf node
4832  * created to represent the launch.
4833  */
4834 static __isl_give isl_ast_node *create_host_leaf(
4835         __isl_take isl_ast_build *build, void *user)
4836 {
4837         struct gpu_gen *gen = (struct gpu_gen *) user;
4838         isl_id *id;
4839         isl_ast_node *node;
4840         struct ppcg_kernel *kernel;
4841         isl_set *host_domain;
4842         isl_union_map *schedule;
4843         isl_union_map *local_sched;
4844         isl_union_map *access;
4845         isl_union_set *domain;
4846         int i;
4847
4848         schedule = isl_ast_build_get_schedule(build);
4849
4850         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
4851         read_sizes(gen);
4852
4853         domain = isl_union_map_domain(isl_union_map_copy(schedule));
4854
4855         local_sched = isl_union_map_copy(gen->sched);
4856         local_sched = isl_union_map_intersect_domain(local_sched, domain);
4857         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
4858                                      isl_union_map_copy(gen->prog->may_write));
4859         access = isl_union_map_apply_domain(access,
4860                                             isl_union_map_copy(local_sched));
4861
4862         gen->tiled_sched = tile_schedule(gen, local_sched);
4863         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
4864         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
4865
4866         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
4867         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
4868         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
4869
4870         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
4871         if (!kernel)
4872                 goto error;
4873
4874         kernel->id = gen->kernel_id++;
4875         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
4876         kernel->grid_size = extract_grid_size(gen, kernel);
4877         extract_block_size(gen, kernel);
4878         kernel->arrays = isl_union_map_range(access);
4879         kernel->arrays = isl_union_set_apply(kernel->arrays,
4880                                 isl_union_map_copy(gen->prog->to_outer));
4881         kernel->space = isl_ast_build_get_schedule_space(build);
4882
4883         gen->private_access = NULL;
4884         compute_shared_sched(gen);
4885         gen->privatization = compute_privatization(gen);
4886         if (group_references(gen) < 0)
4887                 schedule = isl_union_map_free(schedule);
4888         compute_private_access(gen);
4889         check_shared_memory_bound(gen);
4890         compute_group_tilings(gen);
4891         host_domain = isl_set_from_union_set(isl_union_map_range(
4892                                                 isl_union_map_copy(schedule)));
4893         localize_bounds(gen, kernel, host_domain);
4894
4895         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
4896
4897         kernel->tree = generate_kernel(gen, build, host_domain,
4898                                         kernel->grid_size);
4899         create_kernel_vars(gen, kernel);
4900
4901         free_local_array_info(gen);
4902         isl_map_free(gen->privatization);
4903         isl_union_map_free(gen->private_access);
4904         isl_union_map_free(gen->local_sched);
4905         isl_union_map_free(gen->tiled_sched);
4906         isl_union_map_free(gen->shared_sched);
4907         isl_union_map_free(gen->shared_proj);
4908         isl_set_free(host_domain);
4909         free(gen->tile_size);
4910
4911         node = construct_launch(build, schedule, kernel);
4912
4913         return node;
4914 error:
4915         isl_union_map_free(schedule);
4916         return NULL;
4917 }
4918
4919 /* Use isl to generate code for the outer gen->tile_first loops
4920  * of the global schedule in gen->sched, resulting in the host code.
4921  * Within each iteration of this partial schedule, i.e., for each kernel
4922  * launch, create_host_leaf takes care of generating the kernel code.
4923  */
4924 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
4925 {
4926         isl_ast_build *build;
4927         isl_ast_node *tree;
4928         isl_union_map *sched;
4929         isl_map *proj;
4930         isl_id_list *iterators;
4931
4932         sched = isl_union_map_copy(gen->sched);
4933         proj = projection(isl_union_map_get_space(sched),
4934                             gen->untiled_len, gen->tile_first);
4935         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4936
4937         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
4938         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
4939         iterators = generate_names(gen->ctx, gen->tile_first, "h");
4940         build = isl_ast_build_set_iterators(build, iterators);
4941         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
4942         tree = isl_ast_build_ast_from_schedule(build, sched);
4943         isl_ast_build_free(build);
4944
4945         return tree;
4946 }
4947
4948 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4949 {
4950         if (!str)
4951                 return NULL;
4952         return isl_union_map_read_from_str(ctx, str);
4953 }
4954
4955 /* Information about the outermost tilable bands in the forest of bands.
4956  *
4957  * tile_len and n_parallel are only sets on band_info structures
4958  * that correspond to outermost bands.  For other bands (in particular,
4959  * ancestors of the outermost bands), n_parallal is set to 0.
4960  *
4961  * prefix is the (padded) schedule leading up to the outermost tilable bands.
4962  *
4963  * tile_first is the number of schedule dimensions in prefix.
4964  *
4965  * suffix is the schedule of the outermost tilable bands and their descendants.
4966  */
4967 struct band_info {
4968         struct gpu_gen *gen;
4969         int tile_first;
4970         int tile_len;
4971         int n_parallel;
4972         isl_union_map *prefix;
4973         isl_union_map *suffix;
4974 };
4975
4976 /* Set tile_len and n_parallel of the statement to that of
4977  * their outermost band, recorded in the band_info.
4978  */
4979 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4980 {
4981         struct band_info *info = user;
4982         struct gpu_stmt *stmt;
4983         isl_id *id;
4984
4985         id = isl_map_get_tuple_id(map, isl_dim_in);
4986         stmt = find_stmt(info->gen->prog, id);
4987         isl_id_free(id);
4988
4989         stmt->tile_len = info->tile_len;
4990         stmt->n_parallel = info->n_parallel;
4991
4992         isl_map_free(map);
4993
4994         return 0;
4995 }
4996
4997 static void list_select_outer_band(struct gpu_gen *gen,
4998         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4999
5000 /* Check if this band has any parallel loops.  If so, take it as
5001  * the outermost tilable band.  If not, continue looking for the
5002  * outermost tilable band in the children of the current band.
5003  */
5004 static void band_select_outer_band(struct gpu_gen *gen,
5005         __isl_take isl_band *band, int pos, struct band_info *info)
5006 {
5007         int n = isl_band_n_member(band);
5008         int n_parallel;
5009
5010         for (n_parallel = 0; n_parallel < n; ++n_parallel)
5011                 if (!isl_band_member_is_coincident(band, n_parallel))
5012                         break;
5013
5014         info->n_parallel = n_parallel;
5015         if (n_parallel) {
5016                 gen->any_parallelism = 1;
5017                 info->gen = gen;
5018                 info->tile_first = pos;
5019                 info->tile_len = n;
5020                 info->prefix = isl_band_get_prefix_schedule(band);
5021                 info->suffix = isl_union_map_flat_range_product(
5022                                 isl_band_get_partial_schedule(band),
5023                                 isl_band_get_suffix_schedule(band));
5024                 isl_union_map_foreach_map(info->prefix,
5025                                             &set_stmt_tile_len, info);
5026         } else if (isl_band_has_children(band)) {
5027                 isl_band_list *children;
5028                 children = isl_band_get_children(band);
5029                 list_select_outer_band(gen, children, pos + n, info);
5030         } else {
5031                 info->gen = gen;
5032                 info->tile_first = pos + n;
5033                 info->tile_len = 0;
5034                 info->prefix = isl_union_map_flat_range_product(
5035                                 isl_band_get_prefix_schedule(band),
5036                                 isl_band_get_partial_schedule(band));
5037                 info->suffix = isl_band_get_suffix_schedule(band);
5038                 isl_union_map_foreach_map(info->prefix,
5039                                             &set_stmt_tile_len, info);
5040         }
5041
5042         isl_band_free(band);
5043 }
5044
5045 /* Comparison function that returns a non-zero value for band_infos
5046  * with different tile_len fields or different n_parallel fields.
5047  */
5048 static int cmp_band(const void *p1, const void *p2)
5049 {
5050         const struct band_info *info1 = p1;
5051         const struct band_info *info2 = p2;
5052
5053         if (info1->tile_len != info2->tile_len)
5054                 return info1->tile_len - info2->tile_len;
5055
5056         return info1->n_parallel - info2->n_parallel;
5057 }
5058
5059 /* Extend "umap" with coordinates with fixed value "val"
5060  * to a total length of "dst_len", assuming the original dimension is "src_len".
5061  */
5062 static __isl_give isl_union_map *extend_range(
5063         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
5064 {
5065         isl_space *dim;
5066         isl_map *map;
5067         int i;
5068
5069         dim = isl_union_map_get_space(umap);
5070         map = isl_map_reverse(projection(dim, dst_len, src_len));
5071         for (i = src_len; i < dst_len; ++i)
5072                 map = isl_map_fix_si(map, isl_dim_out, i, val);
5073
5074         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
5075
5076         return umap;
5077 }
5078
5079 /* Group bands with the same values for tile_len and n_parallel.
5080  * The prefix schedule is then extended with a fixed coordinate that
5081  * is different for each such group.
5082  * Note that the actual values for this coordinate are not important.
5083  * The bands have already been effectively separated at a higher level
5084  * or they are independent and may be executed in parallel.
5085  * The list of band_info has been sorted before this functions is called.
5086  */
5087 static void separate_bands(struct band_info *info, int n)
5088 {
5089         int i;
5090         int j = 0;
5091
5092         for (i = 0; i < n; ++i) {
5093                 int l = info[i].tile_first;
5094
5095                 if (i &&
5096                     (info[i].tile_len != info[i - 1].tile_len ||
5097                      info[i].n_parallel != info[i - 1].n_parallel))
5098                         j++;
5099
5100                 info[i].prefix = extend_range(info[i].prefix,
5101                                                 l, l + 1, j);
5102                 info[i].tile_first = l + 1;
5103         }
5104 }
5105
5106 /* Select the outermost bands in the elements of the list, align
5107  * their prefix schedules, separate bands with different values
5108  * for tile_len and/or n_parallel and then combine the resulting
5109  * prefix and suffix schedules into a single pair of prefix and
5110  * suffix schedules for the entire list.
5111  */
5112 static void list_select_outer_band(struct gpu_gen *gen,
5113         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
5114 {
5115         isl_band *band;
5116         int i;
5117         int n = isl_band_list_n_band(list);
5118         isl_ctx *ctx = isl_band_list_get_ctx(list);
5119         struct band_info *info;
5120         int max_tile_first;
5121         isl_union_map *prefix;
5122         isl_union_map *suffix;
5123
5124         assert(n >= 1);
5125         info = isl_calloc_array(ctx, struct band_info, n);
5126         assert(info);
5127
5128         max_tile_first = 0;
5129         for (i = 0; i < n; ++i) {
5130                 band = isl_band_list_get_band(list, i);
5131                 band_select_outer_band(gen, band, pos, &info[i]);
5132                 if (info[i].tile_first > max_tile_first)
5133                         max_tile_first = info[i].tile_first;
5134         }
5135
5136         for (i = 0; i < n; ++i) {
5137                 if (info[i].tile_first == max_tile_first)
5138                         continue;
5139                 info[i].prefix = extend_range(info[i].prefix,
5140                                         info[i].tile_first, max_tile_first, 0);
5141                 info[i].tile_first = max_tile_first;
5142         }
5143
5144         qsort(info, n, sizeof(struct band_info), &cmp_band);
5145
5146         for (i = 0; i < n - 1; ++i)
5147                 if (info[i].tile_len != info[i + 1].tile_len ||
5148                     info[i].n_parallel != info[i + 1].n_parallel)
5149                         break;
5150
5151         if (i < n -1)
5152                 separate_bands(info, n);
5153
5154         prefix = info[0].prefix;
5155         suffix = info[0].suffix;
5156
5157         for (i = 1; i < n; ++i) {
5158                 prefix = isl_union_map_union(prefix, info[i].prefix);
5159                 suffix = isl_union_map_union(suffix, info[i].suffix);
5160         }
5161
5162         list_info->tile_first = info[0].tile_first;
5163         list_info->tile_len = -1;
5164         list_info->prefix = prefix;
5165         list_info->suffix = suffix;
5166
5167         isl_band_list_free(list);
5168         free(info);
5169 }
5170
5171 /* Select the outermost tilable band that (by construction)
5172  * has at least one parallel loop.
5173  * The starting position of the aligned band is stored in the pair
5174  * gen->tile_first.
5175  * The sizes and number of parallel loops may be different in different
5176  * parts of the band forest and are therefore stored in the gpu_stmts.
5177  *
5178  * Return the complete schedule, with the tilable bands aligned
5179  * at gen->tile_first and padded with zero, if needed.
5180  */
5181 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
5182         __isl_keep isl_schedule *schedule)
5183 {
5184         isl_band_list *list;
5185         struct band_info info;
5186
5187         gen->n_parallel = 0;
5188         gen->tile_len = -1;
5189
5190         list = isl_schedule_get_band_forest(schedule);
5191
5192         if (isl_band_list_n_band(list) == 0) {
5193                 isl_band_list_free(list);
5194                 return isl_schedule_get_map(schedule);
5195         }
5196
5197         list_select_outer_band(gen, list, 0, &info);
5198
5199         gen->tile_first = info.tile_first;
5200         info.suffix = align_range(info.suffix);
5201
5202         return isl_union_map_flat_range_product(info.prefix, info.suffix);
5203 }
5204
5205 /* Set gen->untiled_len to the number of scheduling dimensions
5206  * for the schedule of the first domain.
5207  * We assume here that this number is the same for all domains.
5208  */
5209 static int set_untiled_len(__isl_take isl_map *map, void *user)
5210 {
5211         unsigned *untiled_len = user;
5212
5213         *untiled_len = isl_map_dim(map, isl_dim_out);
5214
5215         isl_map_free(map);
5216         return -1;
5217 }
5218
5219 /* Compute an appropriate schedule based on the accesses in
5220  * gen->read and gen->write.
5221  *
5222  * We use the dependences in gen->prog->scop to compute
5223  * a schedule that has a parallel loop in each tilable band.
5224  * Finally, we select the outermost tilable band.
5225  */
5226 static void compute_schedule(struct gpu_gen *gen)
5227 {
5228         isl_union_set *domain;
5229         isl_union_map *dep_raw, *dep;
5230         isl_union_map *sched;
5231         isl_schedule_constraints *sc;
5232         isl_schedule *schedule;
5233
5234         dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
5235
5236         dep = isl_union_map_copy(gen->prog->scop->dep_false);
5237         dep = isl_union_map_union(dep, dep_raw);
5238         dep = isl_union_map_coalesce(dep);
5239
5240         domain = isl_union_set_copy(gen->prog->scop->domain);
5241         domain = isl_union_set_intersect_params(domain,
5242                                 isl_set_copy(gen->prog->scop->context));
5243         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
5244         sc = isl_schedule_constraints_set_validity(sc, isl_union_map_copy(dep));
5245         sc = isl_schedule_constraints_set_coincidence(sc,
5246                                                     isl_union_map_copy(dep));
5247         sc = isl_schedule_constraints_set_proximity(sc, dep);
5248
5249         if (gen->options->debug->dump_schedule_constraints)
5250                 isl_schedule_constraints_dump(sc);
5251         schedule = isl_schedule_constraints_compute_schedule(sc);
5252         if (gen->options->debug->dump_schedule)
5253                 isl_schedule_dump(schedule);
5254
5255         sched = select_outer_tilable_band(gen, schedule);
5256
5257         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
5258         sched = isl_union_map_intersect_domain(sched, domain);
5259         gen->sched = sched;
5260
5261         isl_schedule_free(schedule);
5262 }
5263
5264 /* Compute the sets of outer array elements that need to be copied in and out.
5265  *
5266  * In particular, for each array that is possibly written anywhere in
5267  * gen->prog and that is visible outside the corresponding scop,
5268  * we copy out its entire extent.
5269  *
5270  * Any array elements that is read without first being written needs
5271  * to be copied in. Furthermore, if there are any array elements that
5272  * are copied out, but that may not be written inside gen->prog, then
5273  * they also need to be copied in to ensure that the value after execution
5274  * is the same as the value before execution.
5275  * In case the array elements are structures, we need to take into
5276  * account that all members of the structures need to be written
5277  * by gen->prog before we can avoid copying the data structure in.
5278  *
5279  * While computing the set of array elements that are copied out but
5280  * not necessarily written, we intersect both sets with the context.
5281  * This helps in those cases where the arrays are declared with a fixed size,
5282  * while the accesses are parametric and the context assigns a fixed value
5283  * to the parameters.
5284  *
5285  * If an element from a local array is read without first being written,
5286  * then there is no point in copying it in since it cannot have been
5287  * written prior to the scop.  Warn about the uninitialized read instead.
5288  */
5289 static void compute_copy_in_and_out(struct gpu_gen *gen)
5290 {
5291         int i;
5292         isl_union_set *local;
5293         isl_union_set *may_write, *must_write;
5294         isl_union_set *copy_in, *copy_out;
5295         isl_union_set *not_written;
5296         isl_union_map *uninitialized;
5297         isl_union_map *local_uninitialized;
5298
5299         must_write = isl_union_map_range(
5300                                 isl_union_map_copy(gen->prog->must_write));
5301         must_write = isl_union_set_intersect_params(must_write,
5302                                             isl_set_copy(gen->prog->context));
5303         may_write = isl_union_map_range(
5304                                 isl_union_map_copy(gen->prog->may_write));
5305         may_write = isl_union_set_intersect_params(may_write,
5306                                             isl_set_copy(gen->prog->context));
5307         may_write = isl_union_set_universe(may_write);
5308         may_write = isl_union_set_apply(may_write,
5309                                     isl_union_map_copy(gen->prog->to_outer));
5310         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
5311         local = isl_union_set_copy(copy_out);
5312
5313         for (i = 0; i < gen->prog->n_array; ++i) {
5314                 isl_space *space;
5315                 isl_set *write_i;
5316                 int empty;
5317
5318                 space = isl_space_copy(gen->prog->array[i].space);
5319
5320                 if (gen->prog->array[i].local) {
5321                         isl_set *set;
5322
5323                         set = isl_set_universe(space);
5324                         local = isl_union_set_add_set(local, set);
5325                         continue;
5326                 }
5327
5328                 write_i = isl_union_set_extract_set(may_write, space);
5329                 empty = isl_set_fast_is_empty(write_i);
5330                 isl_set_free(write_i);
5331                 if (empty)
5332                         continue;
5333
5334                 write_i = isl_set_copy(gen->prog->array[i].extent);
5335                 copy_out = isl_union_set_add_set(copy_out, write_i);
5336         }
5337         isl_union_set_free(may_write);
5338
5339         copy_out = isl_union_set_intersect_params(copy_out,
5340                                             isl_set_copy(gen->prog->context));
5341
5342         gen->prog->copy_out = isl_union_set_copy(copy_out);
5343
5344         copy_out = isl_union_set_apply(copy_out,
5345                                     isl_union_map_copy(gen->prog->to_inner));
5346         not_written = isl_union_set_subtract(copy_out, must_write);
5347
5348         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
5349         local_uninitialized = isl_union_map_copy(uninitialized);
5350
5351         local = isl_union_set_apply(local,
5352                                     isl_union_map_copy(gen->prog->to_inner));
5353         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
5354                                                             local);
5355         if (!isl_union_map_is_empty(local_uninitialized)) {
5356                 fprintf(stderr,
5357                         "possibly uninitialized reads (not copied in):\n");
5358                 isl_union_map_dump(local_uninitialized);
5359         }
5360         uninitialized = isl_union_map_subtract(uninitialized,
5361                                                 local_uninitialized);
5362         copy_in = isl_union_map_range(uninitialized);
5363         copy_in = isl_union_set_union(copy_in, not_written);
5364         copy_in = isl_union_set_apply(copy_in,
5365                                     isl_union_map_copy(gen->prog->to_outer));
5366
5367         gen->prog->copy_in = copy_in;
5368 }
5369
5370 /* Extract a gpu_stmt_access from "expr", append it to the list
5371  * that ends in *next_access and return the updated end of the list.
5372  */
5373 static struct gpu_stmt_access **expr_extract_access(struct pet_expr *expr,
5374         struct gpu_stmt_access **next_access)
5375 {
5376         struct gpu_stmt_access *access;
5377         isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
5378
5379         access = isl_alloc_type(ctx, struct gpu_stmt_access);
5380         assert(access);
5381         access->next = NULL;
5382         access->read = expr->acc.read;
5383         access->write = expr->acc.write;
5384         access->access = pet_expr_access_get_may_access(expr);
5385         access->tagged_access = pet_expr_access_get_tagged_may_access(expr);
5386         access->exact_write = !expr->acc.write ||
5387                 isl_map_is_equal(expr->acc.access, access->access);
5388         access->ref_id = isl_id_copy(expr->acc.ref_id);
5389         access->group = -1;
5390
5391         *next_access = access;
5392         next_access = &(*next_access)->next;
5393         return next_access;
5394 }
5395
5396 static struct gpu_stmt_access **expr_extract_accesses(struct pet_expr *expr,
5397         struct gpu_stmt_access **next_access)
5398 {
5399         int i;
5400
5401         for (i = 0; i < expr->n_arg; ++i)
5402                 next_access = expr_extract_accesses(expr->args[i],
5403                                                         next_access);
5404
5405         if (expr->type == pet_expr_access)
5406                 next_access = expr_extract_access(expr, next_access);
5407
5408         return next_access;
5409 }
5410
5411 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt)
5412 {
5413         struct gpu_stmt_access **next_access = &stmt->accesses;
5414
5415         stmt->accesses = NULL;
5416         expr_extract_accesses(stmt->stmt->body, next_access);
5417 }
5418
5419 /* Return an array of gpu_stmt representing the statements in "scop".
5420  */
5421 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5422         __isl_keep isl_set *context)
5423 {
5424         int i;
5425         struct gpu_stmt *stmts;
5426
5427         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->n_stmt);
5428         if (!stmts)
5429                 return NULL;
5430
5431         for (i = 0; i < scop->n_stmt; ++i) {
5432                 struct gpu_stmt *s = &stmts[i];
5433
5434                 s->id = isl_set_get_tuple_id(scop->stmts[i]->domain);
5435                 s->stmt = scop->stmts[i];
5436                 pet_stmt_extract_accesses(s);
5437         }
5438
5439         return stmts;
5440 }
5441
5442 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5443  */
5444 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5445 {
5446         struct gpu_gen *gen = user;
5447
5448         return gen->print(p, gen->prog, gen->tree, &gen->types,
5449                             gen->print_user);
5450 }
5451
5452 /* Generate CUDA code for "scop" and print it to "p".
5453  * After generating an AST for the transformed scop as explained below,
5454  * we call "gen->print" to print the AST in the desired output format
5455  * to "p".
5456  *
5457  * If it turns out that it does not make sense to generate GPU code,
5458  * then we generate CPU code instead.
5459  *
5460  * The GPU code is generated in a context where at least one
5461  * statement instance is executed.  The corresponding guard (if any) is printed
5462  * around the entire generated GPU code, except for the declaration
5463  * of the arrays that are visible outside of the scop and that therefore
5464  * cannot be declared inside the body of any possible guard.
5465  *
5466  * We first compute a schedule that respects the dependences
5467  * of the original program and select the outermost band
5468  * of tilable dimensions that has at least one parallel loop.
5469  * We then have three blocks of dimensions
5470  *
5471  *      H               B                       G
5472  *
5473  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5474  * in
5475  *
5476  *      H       T               P               G
5477  *
5478  * For each iteration of the T loop and for each array, we compute
5479  * the array elements accessed by that iteration, construct a rectangular
5480  * box around it and shift it to the origin.  The result is used
5481  * as shared memory for the array.
5482  *
5483  * We then split off at most 2 parallel loops from the T loops and
5484  * at most 3 parallel loops from the P loops
5485  *
5486  *      H       T1      T2      P1      P2      G
5487  *
5488  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5489  * according to "grid"/"block" sizes.
5490  *
5491  *      H       T1T T1P T2      P1T P1P P2      G
5492  *
5493  * Finally, the T1P and P1P iterators are equated to the block and
5494  * thread dimensions respectively and so are effectively removed.
5495  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5496  * are run on the GPU.
5497  *
5498  * Code is generated in three stages.  We first generate code for the
5499  * host (the H loops), with iterators h%d.  Then, for each leaf node
5500  * of the resulting AST, we generate code for the shared loops (up to
5501  * and including T2), with iterators g%d and after equating the H loops
5502  * to h%d parameters and the T1P loops to the block dimensions.
5503  * Finally, we generate code for the remaining loops in a similar fashion.
5504  */
5505 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5506         struct gpu_gen *gen, struct ppcg_scop *scop,
5507         struct ppcg_options *options)
5508 {
5509         struct gpu_prog *prog;
5510         isl_ctx *ctx;
5511         isl_set *context, *guard;
5512
5513         if (!scop)
5514                 return isl_printer_free(p);
5515
5516         ctx = isl_printer_get_ctx(p);
5517         prog = gpu_prog_alloc(ctx, scop);
5518         if (!prog)
5519                 return isl_printer_free(p);
5520
5521         context = isl_set_copy(prog->context);
5522         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5523         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5524
5525         gen->prog = prog;
5526         gen->any_parallelism = 0;
5527         compute_schedule(gen);
5528
5529         if (!gen->any_parallelism) {
5530                 isl_set_free(context);
5531                 isl_set_free(guard);
5532                 p = print_cpu(p, scop, options);
5533         } else {
5534                 compute_copy_in_and_out(gen);
5535                 gen->tree = generate_host_code(gen);
5536                 p = ppcg_print_exposed_declarations(p, prog->scop);
5537                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5538                 isl_ast_node_free(gen->tree);
5539         }
5540
5541         isl_union_map_free(gen->sched);
5542
5543         gpu_prog_free(prog);
5544
5545         return p;
5546 }
5547
5548 /* Wrapper around generate for use as a ppcg_transform callback.
5549  */
5550 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5551         struct ppcg_scop *scop, void *user)
5552 {
5553         struct gpu_gen *gen = user;
5554
5555         return generate(p, gen, scop, gen->options);
5556 }
5557
5558 /* Transform the code in the file called "input" by replacing
5559  * all scops by corresponding GPU code and write the results to "out".
5560  */
5561 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5562         struct ppcg_options *options,
5563         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5564                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5565                 struct gpu_types *types, void *user), void *user)
5566 {
5567         struct gpu_gen gen;
5568         int r;
5569         int i;
5570
5571         gen.ctx = ctx;
5572         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5573         gen.options = options;
5574         gen.kernel_id = 0;
5575         gen.print = print;
5576         gen.print_user = user;
5577         gen.types.n = 0;
5578         gen.types.name = NULL;
5579
5580         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5581
5582         isl_union_map_free(gen.sizes);
5583         for (i = 0; i < gen.types.n; ++i)
5584                 free(gen.types.name[i]);
5585         free(gen.types.name);
5586
5587         return r;
5588 }
5589
5590 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5591 {
5592         struct gpu_prog *prog;
5593
5594         if (!scop)
5595                 return NULL;
5596
5597         prog = isl_calloc_type(ctx, struct gpu_prog);
5598         assert(prog);
5599
5600         prog->ctx = ctx;
5601         prog->scop = scop;
5602         prog->context = isl_set_copy(scop->context);
5603         prog->n_stmts = scop->n_stmt;
5604         prog->stmts = extract_stmts(ctx, scop, prog->context);
5605         prog->read = isl_union_map_copy(scop->reads);
5606         prog->may_write = isl_union_map_copy(scop->may_writes);
5607         prog->must_write = isl_union_map_copy(scop->must_writes);
5608         prog->to_inner = compute_to_inner(scop);
5609         prog->to_outer = isl_union_map_copy(prog->to_inner);
5610         prog->to_outer = isl_union_map_reverse(prog->to_outer);
5611
5612         if (!prog->stmts)
5613                 return gpu_prog_free(prog);
5614
5615         if (collect_array_info(prog) < 0)
5616                 return gpu_prog_free(prog);
5617
5618         return prog;
5619 }
5620
5621 void *gpu_prog_free(struct gpu_prog *prog)
5622 {
5623         if (!prog)
5624                 return NULL;
5625         free_array_info(prog);
5626         free_stmts(prog->stmts, prog->n_stmts);
5627         isl_union_map_free(prog->to_outer);
5628         isl_union_map_free(prog->to_inner);
5629         isl_union_set_free(prog->copy_in);
5630         isl_union_set_free(prog->copy_out);
5631         isl_union_map_free(prog->read);
5632         isl_union_map_free(prog->may_write);
5633         isl_union_map_free(prog->must_write);
5634         isl_set_free(prog->context);
5635         free(prog);
5636         return NULL;
5637 }