gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/band.h>
  23 #include <isl/schedule.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "schedule.h"
  30 #include "ppcg_options.h"
  31 #include "print.h"
  32
/* Bound information for a single index of an array tile.
 *
 * "size" and "lb" hold the size and the lower bound of the tile
 * in this index direction.
 *
 * The fields stride, shift and shift_map only contain valid information
 * if shift != NULL.
 * If so, they express that the current index is such that if you add shift,
 * then the result is always a multiple of stride.
 * shift_map contains the mapping
 *
 *      i -> (i + shift)/stride
 *
 * Let D represent the initial shared_len dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 *      D -> [b]
 *
 * "shift_map" is of the form
 *
 *      [D -> i] -> [D -> (i + shift(D))/stride]
 */
struct gpu_array_bound {
        isl_val *size;
        isl_aff *lb;

        /* Only meaningful when shift != NULL (see above). */
        isl_val *stride;
        isl_aff *shift;
        isl_basic_map *shift_map;
};
  58
/* A tile of an array.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 *      and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * shared/private memory tile and is of the form
 *
 *      { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial shared_len dimensions
 * of the computed schedule.
 *
 * Objects of this type are created by create_tile and released
 * by free_tile.
 */
struct gpu_array_tile {
        int n;
        struct gpu_array_bound *bound;
        isl_multi_aff *tiling;
};
  78
/* Declared elsewhere; only pointers to it are needed here. */
struct gpu_array_info;

/* A group of array references in a kernel that should be handled together.
 * If private_tile is not NULL, then it is mapped to registers.
 * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 */
struct gpu_array_ref_group {
        /* The references in this group access this array. */
        struct gpu_array_info *array;
        /* Position of this group in the list of reference groups of array. */
        int nr;

        /* The following fields are used during the construction of the groups.
         * access is the combined access relation relative to the shared
         * memory tiling.  In particular, the domain of the map corresponds
         * to the first shared_len dimensions of the computed schedule.
         * write is set if any access in the group is a write.
         * exact_write is set if all writes are definite writes.
         */
        isl_map *access;
        int write;
        int exact_write;

        /* The shared memory tile, NULL if none. */
        struct gpu_array_tile *shared_tile;

        /* The private memory tile, NULL if none. */
        struct gpu_array_tile *private_tile;

        /* References in this group; point to elements of a linked list.
         * n_ref is the number of entries in refs.
         */
        int n_ref;
        struct gpu_stmt_access **refs;

        /* Last shared memory tile dimension that affects tile of this group. */
        int last_shared;
};
 116
/* Bookkeeping that is passed around as "gen" by the routines in this file.
 */
struct gpu_gen {
        /* isl context, used for the allocations performed on behalf of "gen". */
        isl_ctx *ctx;
        /* User options; options->tile_size is the default tile size. */
        struct ppcg_options *options;

        /* Callback for printing of AST in appropriate format. */
        __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
                struct gpu_types *types, void *user);
        /* NOTE(review): presumably handed to "print" as its "user" argument
         * -- confirm at the call site.
         */
        void *print_user;

        struct gpu_prog *prog;
        /* The generated AST. */
        isl_ast_node *tree;

        /* The sequence of types for which a definition has been printed. */
        struct gpu_types types;

        /* tile, grid and block sizes for each kernel */
        isl_union_map *sizes;

        /* Identifier of current kernel. */
        int kernel_id;
        /* Pointer to the current kernel. */
        struct ppcg_kernel *kernel;
        /* Does the computed schedule exhibit any parallelism? */
        int any_parallelism;

        /* First tile dimension. */
        int tile_first;
        /* Number of tile dimensions. */
        int tile_len;
        /* Number of initial parallel loops among tile dimensions. */
        int n_parallel;

        /* Number of dimensions determining shared memory. */
        int shared_len;

        /* Number of rows in the untiled schedule. */
        int untiled_len;
        /* Number of rows in the tiled schedule. */
        int tiled_len;
        /* Number of rows in schedule after tiling/wrapping over threads. */
        int thread_tiled_len;

        /* Global untiled schedule. */
        isl_union_map *sched;
        /* Local (per kernel launch) tiled schedule. */
        isl_union_map *tiled_sched;
        /* Local schedule per shared memory tile loop iteration. */
        isl_union_map *local_sched;

        /* Local tiled schedule projected onto the shared tile loops and
         * the loops that will be wrapped over the threads,
         * with all shared tile loops parametrized.
         */
        isl_union_map *shared_sched;
        /* Projects out the loops that will be wrapped over the threads
         * from shared_sched.
         */
        isl_union_map *shared_proj;

        /* A map that takes the range of shared_sched as input,
         * wraps the appropriate loops over the threads and then projects
         * out these loops.
         */
        isl_map *privatization;

        /* A map from the shared memory tile loops and the thread indices
         * (as parameters) to the set of accessed memory elements that
         * will be accessed through private copies.
         */
        isl_union_map *private_access;

        /* The schedule for the current private/shared access
         * (within print_private_access or print_shared_access).
         */
        isl_map *copy_sched;
        /* The array reference group corresponding to copy_sched. */
        struct gpu_array_ref_group *copy_group;

        /* First loop to unroll (or -1 if none) in the current part of the
         * schedule.
         */
        int first_unroll;

        /* Number of entries in use in grid_dim and block_dim, respectively.
         * They are set by read_grid_sizes (at most 2) and
         * read_block_sizes (at most 3).
         */
        int n_grid;
        int n_block;
        /* Note: in the input file, the sizes of the grid and the blocks
         * are specified in the order x, y, z, but internally, the sizes
         * are stored in reverse order, so that the last element always
         * refers to the x dimension.
         */
        int grid_dim[2];
        int block_dim[3];
        /* Tile sizes; read_tile_sizes allocates one entry
         * per tile dimension.
         */
        int *tile_size;
};
 213
 214 /* Print the name of the local copy of a given group of array references.
 215  */
 216 static __isl_give isl_printer *print_array_name(__isl_take isl_printer *p,
 217         struct gpu_array_ref_group *group)
 218 {
 219         int global = 0;
 220
 221         if (group->private_tile)
 222                 p = isl_printer_print_str(p, "private_");
 223         else if (group->shared_tile)
 224                 p = isl_printer_print_str(p, "shared_");
 225         else
 226                 global = 1;
 227         p = isl_printer_print_str(p, group->array->name);
 228         if (!global && group->array->n_group > 1) {
 229                 p = isl_printer_print_str(p, "_");
 230                 p = isl_printer_print_int(p, group->nr);
 231         }
 232
 233         return p;
 234 }
 235
 236 /* Collect all references to the given array and store pointers to them
 237  * in array->refs.
 238  *
 239  * If the array contains structures, then there is no need to collect
 240  * the references since we will not be computing any reference groups.
 241  */
 242 static void collect_references(struct gpu_prog *prog,
 243         struct gpu_array_info *array)
 244 {
 245         int i;
 246         int n;
 247
 248         if (array->has_compound_element)
 249                 return;
 250
 251         n = 0;
 252         for (i = 0; i < prog->n_stmts; ++i) {
 253                 struct gpu_stmt *stmt = &prog->stmts[i];
 254                 struct gpu_stmt_access *access;
 255
 256                 for (access = stmt->accesses; access; access = access->next) {
 257                         const char *name;
 258                         name = isl_map_get_tuple_name(access->access,
 259                                                       isl_dim_out);
 260                         if (name && !strcmp(array->name, name))
 261                                 n++;
 262                 }
 263         }
 264
 265         array->n_ref = n;
 266         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
 267         assert(array->refs);
 268
 269         n = 0;
 270         for (i = 0; i < prog->n_stmts; ++i) {
 271                 struct gpu_stmt *stmt = &prog->stmts[i];
 272                 struct gpu_stmt_access *access;
 273
 274                 for (access = stmt->accesses; access; access = access->next) {
 275                         const char *name;
 276                         name = isl_map_get_tuple_name(access->access,
 277                                                       isl_dim_out);
 278                         if (!name || strcmp(array->name, name))
 279                                 continue;
 280
 281                         array->refs[n++] = access;
 282                 }
 283         }
 284 }
 285
 286 /* Create a gpu_array_tile for an array of dimension "n_index".
 287  */
 288 static struct gpu_array_tile *create_tile(isl_ctx *ctx, int n_index)
 289 {
 290         int i;
 291         struct gpu_array_tile *tile;
 292
 293         tile = isl_calloc_type(ctx, struct gpu_array_tile);
 294         assert(tile);
 295
 296         tile->n = n_index;
 297
 298         tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
 299         assert(tile->bound);
 300
 301         for (i = 0; i < n_index; ++i) {
 302                 tile->bound[i].size = NULL;
 303                 tile->bound[i].lb = NULL;
 304                 tile->bound[i].stride = NULL;
 305                 tile->bound[i].shift = NULL;
 306                 tile->bound[i].shift_map = NULL;
 307         }
 308
 309         return tile;
 310 }
 311
 312 static void *free_tile(struct gpu_array_tile *tile)
 313 {
 314         int j;
 315
 316         if (!tile)
 317                 return NULL;
 318
 319         for (j = 0; j < tile->n; ++j) {
 320                 isl_val_free(tile->bound[j].size);
 321                 isl_val_free(tile->bound[j].stride);
 322                 isl_aff_free(tile->bound[j].lb);
 323                 isl_aff_free(tile->bound[j].shift);
 324                 isl_basic_map_free(tile->bound[j].shift_map);
 325         }
 326         free(tile->bound);
 327         isl_multi_aff_free(tile->tiling);
 328         free(tile);
 329
 330         return NULL;
 331 }
 332
 333 static struct pet_array *find_array(struct ppcg_scop *scop,
 334         __isl_keep isl_set *accessed)
 335 {
 336         int i;
 337         isl_id *id;
 338
 339         id = isl_set_get_tuple_id(accessed);
 340
 341         for (i = 0; i < scop->n_array; ++i) {
 342                 isl_id *id_i;
 343
 344                 id_i = isl_set_get_tuple_id(scop->arrays[i]->extent);
 345                 isl_id_free(id_i);
 346                 if (id == id_i)
 347                         break;
 348         }
 349         isl_id_free(id);
 350
 351         return i < scop->n_array ? scop->arrays[i] : NULL;
 352 }
 353
 354 /* Compute and return the extent of "array", taking into account the set of
 355  * accessed elements.
 356  *
 357  * In particular, the extent in the outer dimension is taken
 358  * from "accessed", while then extent in the remaing dimensions
 359  * are taken from array->extent.
 360  *
 361  * The extent in the outer dimension cannot be taken from array->extent
 362  * because that may be unbounded.  Furthermore, even if it is bounded,
 363  * it may be larger than the piece of the array that is being accessed.
 364  */
 365 static __isl_give isl_set *compute_extent(struct pet_array *array,
 366         __isl_keep isl_set *accessed)
 367 {
 368         int n_index;
 369         isl_id *id;
 370         isl_set *outer;
 371         isl_set *extent;
 372
 373         extent = isl_set_copy(array->extent);
 374
 375         n_index = isl_set_dim(accessed, isl_dim_set);
 376         if (n_index == 0)
 377                 return extent;
 378
 379         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 380         outer = isl_set_copy(accessed);
 381         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 382         extent = isl_set_flat_product(outer, extent);
 383         id = isl_set_get_tuple_id(accessed);
 384         extent = isl_set_set_tuple_id(extent, id);
 385
 386         return extent;
 387 }
 388
 389 /* Is the array "array" being extracted a read-only scalar?
 390  *
 391  * That is, is "array" a scalar that is never possibly written to.
 392  * An array containing structures is never considered to be a scalar.
 393  */
 394 static int is_read_only_scalar(struct gpu_array_info *array,
 395         struct gpu_prog *prog)
 396 {
 397         isl_set *space;
 398         isl_union_map *write;
 399         int empty;
 400
 401         if (array->has_compound_element)
 402                 return 0;
 403         if (array->n_index != 0)
 404                 return 0;
 405
 406         write = isl_union_map_copy(prog->may_write);
 407         space = isl_set_universe(isl_space_copy(array->space));
 408         write = isl_union_map_intersect_range(write,
 409                                                 isl_union_set_from_set(space));
 410         empty = isl_union_map_is_empty(write);
 411         isl_union_map_free(write);
 412
 413         return empty;
 414 }
 415
 416 /* Compute bounds on the host arrays based on the accessed elements
 417  * and collect all references to the array.
 418  *
 419  * If the array is zero-dimensional and does not contain structures,
 420  * i.e., if the array is a scalar, we check whether it is read-only.
 421  */
 422 static int extract_array_info(__isl_take isl_set *array, void *user)
 423 {
 424         int i;
 425         struct gpu_prog *prog = (struct gpu_prog *)user;
 426         const char *name;
 427         int n_index;
 428         isl_pw_aff **bounds;
 429         struct pet_array *pa;
 430         struct gpu_array_info *info;
 431         isl_set *extent;
 432
 433         info = &prog->array[prog->n_array];
 434         prog->n_array++;
 435
 436         n_index = isl_set_dim(array, isl_dim_set);
 437         name = isl_set_get_tuple_name(array);
 438         bounds = isl_alloc_array(isl_set_get_ctx(array),
 439                                  isl_pw_aff *, n_index);
 440         if (!bounds)
 441                 goto error;
 442
 443         info->space = isl_set_get_space(array);
 444         info->name = strdup(name);
 445         info->n_index = n_index;
 446         info->bound = bounds;
 447         info->linearize = prog->scop->options->linearize_device_arrays;
 448
 449         pa = find_array(prog->scop, array);
 450         if (!pa)
 451                 isl_die(isl_set_get_ctx(array), isl_error_internal,
 452                         "unable to find array in scop", goto error);
 453
 454         info->type = strdup(pa->element_type);
 455         info->size = pa->element_size;
 456         info->local = pa->declared && !pa->exposed;
 457         info->has_compound_element = pa->element_is_record;
 458         info->read_only_scalar = is_read_only_scalar(info, prog);
 459
 460         extent = compute_extent(pa, array);
 461         for (i = 0; i < n_index; ++i) {
 462                 isl_set *dom;
 463                 isl_local_space *ls;
 464                 isl_aff *one;
 465                 isl_pw_aff *bound;
 466
 467                 bound = isl_set_dim_max(isl_set_copy(extent), i);
 468                 assert(bound);
 469                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 470                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 471                 one = isl_aff_zero_on_domain(ls);
 472                 one = isl_aff_add_constant_si(one, 1);
 473                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 474                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 475
 476                 bounds[i] = bound;
 477                 if (!isl_pw_aff_is_cst(bound))
 478                         info->linearize = 1;
 479         }
 480         info->extent = extent;
 481
 482         collect_references(prog, info);
 483
 484         isl_set_free(array);
 485         return 0;
 486 error:
 487         isl_set_free(array);
 488         return -1;
 489 }
 490
 491 /* Compute a mapping from all outer arrays (of structs) in scop
 492  * to their innermost arrays.
 493  *
 494  * In particular, for each array of a primitive type, the result
 495  * contains the identity mapping on that array.
 496  * For each array involving member accesses, the result
 497  * contains a mapping from the elements of the outer array of structs
 498  * to all corresponding elements of the innermost nested arrays.
 499  */
 500 static __isl_give isl_union_map *compute_to_inner(struct ppcg_scop *scop)
 501 {
 502         int i;
 503         isl_union_map *to_inner;
 504
 505         to_inner = isl_union_map_empty(isl_set_get_space(scop->context));
 506
 507         for (i = 0; i < scop->n_array; ++i) {
 508                 struct pet_array *array = scop->arrays[i];
 509                 isl_set *set;
 510                 isl_map *map;
 511
 512                 if (array->element_is_record)
 513                         continue;
 514
 515                 set = isl_set_copy(array->extent);
 516                 map = isl_set_identity(isl_set_copy(set));
 517
 518                 while (set && isl_set_is_wrapping(set)) {
 519                         isl_id *id;
 520                         isl_map *wrapped;
 521
 522                         id = isl_set_get_tuple_id(set);
 523                         wrapped = isl_set_unwrap(set);
 524                         wrapped = isl_map_domain_map(wrapped);
 525                         wrapped = isl_map_set_tuple_id(wrapped, isl_dim_in, id);
 526                         map = isl_map_apply_domain(map, wrapped);
 527                         set = isl_map_domain(isl_map_copy(map));
 528                 }
 529
 530                 map = isl_map_gist_domain(map, set);
 531
 532                 to_inner = isl_union_map_add_map(to_inner, map);
 533         }
 534
 535         return to_inner;
 536 }
 537
 538 /* Construct a gpu_array_info for each array possibly accessed by "prog" and
 539  * collect them in prog->array.
 540  *
 541  * If there are any member accesses involved, then they are first mapped
 542  * to the outer arrays of structs.
 543  */
 544 static int collect_array_info(struct gpu_prog *prog)
 545 {
 546         int r;
 547         isl_union_set *arrays;
 548
 549         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 550         arrays = isl_union_set_union(arrays,
 551                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 552
 553         arrays = isl_union_set_apply(arrays,
 554                                         isl_union_map_copy(prog->to_outer));
 555
 556         arrays = isl_union_set_coalesce(arrays);
 557
 558         prog->n_array = isl_union_set_n_set(arrays);
 559         prog->array = isl_calloc_array(prog->ctx,
 560                                      struct gpu_array_info, prog->n_array);
 561         assert(prog->array);
 562         prog->n_array = 0;
 563         r = isl_union_set_foreach_set(arrays, &extract_array_info, prog);
 564         isl_union_set_free(arrays);
 565
 566         return r;
 567 }
 568
 569 static void free_array_info(struct gpu_prog *prog)
 570 {
 571         int i, j;
 572
 573         for (i = 0; i < prog->n_array; ++i) {
 574                 int n_index = prog->array[i].n_index;
 575                 free(prog->array[i].type);
 576                 free(prog->array[i].name);
 577                 for (j = 0; j < n_index; ++j)
 578                         isl_pw_aff_free(prog->array[i].bound[j]);
 579                 isl_space_free(prog->array[i].space);
 580                 isl_set_free(prog->array[i].extent);
 581                 free(prog->array[i].bound);
 582                 free(prog->array[i].refs);
 583         }
 584         free(prog->array);
 585 }
 586
 587 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 588  * as an array or through a pointer reference, but as a single data element.
 589  * At the moment, scalars are represented as zero-dimensional arrays.
 590  * A zero-dimensional array containing structures is not considered
 591  * to be a scalar.
 592  */
 593 int gpu_array_is_scalar(struct gpu_array_info *array)
 594 {
 595         return !array->has_compound_element && array->n_index == 0;
 596 }
 597
/* Is "array" a read-only scalar?
 *
 * This simply reports the read_only_scalar field of "array",
 * which extract_array_info sets from is_read_only_scalar.
 */
int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
{
        return array->read_only_scalar;
}
 604
/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space.
 *
 * Note that extract_sizes initializes this structure positionally,
 * so the order of the fields matters.
 */
struct ppcg_extract_size_data {
        const char *type;
        isl_set *res;
};
 613
 614 /* This function is called for each set in a union_set.
 615  * If the name of the set matches data->type, we store the
 616  * set in data->res.
 617  */
 618 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 619 {
 620         struct ppcg_extract_size_data *data = user;
 621         const char *name;
 622
 623         name = isl_set_get_tuple_name(size);
 624         if (name && !strcmp(name, data->type)) {
 625                 data->res = size;
 626                 return -1;
 627         }
 628
 629         isl_set_free(size);
 630         return 0;
 631 }
 632
 633 /* Given a union map { kernel[i] -> *[...] },
 634  * return the range in the space called "type" for the kernel with
 635  * sequence number "id".
 636  */
 637 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 638         const char *type, int id)
 639 {
 640         isl_space *space;
 641         isl_set *dom;
 642         isl_union_set *local_sizes;
 643         struct ppcg_extract_size_data data = { type, NULL };
 644
 645         if (!sizes)
 646                 return NULL;
 647
 648         space = isl_union_map_get_space(sizes);
 649         space = isl_space_set_from_params(space);
 650         space = isl_space_add_dims(space, isl_dim_set, 1);
 651         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 652         dom = isl_set_universe(space);
 653         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 654
 655         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 656                                         isl_union_map_copy(sizes));
 657         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 658         isl_union_set_free(local_sizes);
 659         return data.res;
 660 }
 661
 662 /* Given a singleton set, extract the first (at most *len) elements
 663  * of the single integer tuple into *sizes and update *len if needed.
 664  */
 665 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 666 {
 667         int i;
 668         int dim;
 669
 670         if (!set)
 671                 return;
 672
 673         dim = isl_set_dim(set, isl_dim_set);
 674         if (dim < *len)
 675                 *len = dim;
 676
 677         for (i = 0; i < *len; ++i) {
 678                 isl_val *v;
 679
 680                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 681                 assert(v);
 682
 683                 sizes[i] = isl_val_get_num_si(v);
 684                 isl_val_free(v);
 685         }
 686
 687         isl_set_free(set);
 688 }
 689
 690 /* Extract user specified "tile" sizes from the "sizes" command line option,
 691  * defaulting to option->tile_size in each dimension.
 692  */
 693 static void read_tile_sizes(struct gpu_gen *gen)
 694 {
 695         int n;
 696         isl_set *size;
 697
 698         gen->tile_size = isl_alloc_array(gen->ctx, int, gen->tile_len);
 699         assert(gen->tile_size);
 700         for (n = 0; n < gen->tile_len; ++n)
 701                 gen->tile_size[n] = gen->options->tile_size;
 702
 703         size = extract_sizes(gen->sizes, "tile", gen->kernel_id);
 704         read_sizes_from_set(size, gen->tile_size, &gen->tile_len);
 705
 706         if (gen->n_parallel > gen->tile_len)
 707                 gen->n_parallel = gen->tile_len;
 708 }
 709
 710 /* Extract user specified "block" sizes from the "sizes" command line option,
 711  * after filling in some potentially useful defaults.
 712  */
 713 static void read_block_sizes(struct gpu_gen *gen)
 714 {
 715         int n;
 716         isl_set *size;
 717
 718         n = gen->n_parallel;
 719         gen->n_block = (n <= 3) ? n : 3;
 720         switch (gen->n_block) {
 721         case 1:
 722                 gen->block_dim[0] = 512;
 723                 break;
 724         case 2:
 725                 gen->block_dim[0] = 32;
 726                 gen->block_dim[1] = 16;
 727                 break;
 728         default:
 729                 gen->block_dim[0] = 32;
 730                 gen->block_dim[1] = 4;
 731                 gen->block_dim[2] = 4;
 732                 break;
 733         }
 734
 735         size = extract_sizes(gen->sizes, "block", gen->kernel_id);
 736         read_sizes_from_set(size, gen->block_dim, &gen->n_block);
 737 }
 738
 739 /* Extract user specified "grid" sizes from the "sizes" command line option,
 740  * after filling in some potentially useful defaults.
 741  */
 742 static void read_grid_sizes(struct gpu_gen *gen)
 743 {
 744         int n = gen->n_parallel;
 745         isl_set *size;
 746
 747         gen->n_grid = (n <= 2) ? n : 2;
 748         switch (gen->n_grid) {
 749         case 1:
 750                 gen->grid_dim[0] = 32768;
 751                 break;
 752         default:
 753                 gen->grid_dim[0] = 256;
 754                 gen->grid_dim[1] = 256;
 755                 break;
 756         }
 757
 758         size = extract_sizes(gen->sizes, "grid", gen->kernel_id);
 759         read_sizes_from_set(size, gen->grid_dim, &gen->n_grid);
 760 }
 761
/* Extract user specified sizes from the "sizes" command line option
 * after filling in some potentially useful defaults.
 *
 * The tile sizes are read first because read_tile_sizes may lower
 * gen->n_parallel, which read_block_sizes and read_grid_sizes use
 * to determine the number of block and grid dimensions.
 */
static void read_sizes(struct gpu_gen *gen)
{
        read_tile_sizes(gen);
        read_block_sizes(gen);
        read_grid_sizes(gen);
}
 771
 772 static void *free_stmts(struct gpu_stmt *stmts, int n)
 773 {
 774         int i;
 775
 776         if (!stmts)
 777                 return NULL;
 778
 779         for (i = 0; i < n; ++i) {
 780                 struct gpu_stmt_access *access, *next;
 781
 782                 for (access = stmts[i].accesses; access; access = next) {
 783                         next = access->next;
 784                         isl_id_free(access->ref_id);
 785                         isl_map_free(access->access);
 786                         free(access);
 787                 }
 788
 789                 isl_id_free(stmts[i].id);
 790         }
 791         free(stmts);
 792
 793         return NULL;
 794 }
 795
 796 /* Construct a map from a domain of dimensionality "len"
 797  * to a domain of dimensionality "len" + "tile_len" that tiles
 798  * the "tile_len" coordinates starting at "first".
 799  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 800  * "dim" prescribes the parameters.
 801  */
 802 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 803         int first, int tile_len, int *tile_size)
 804 {
 805         int i;
 806         isl_basic_map *bmap;
 807         isl_constraint *c;
 808         isl_local_space *ls;
 809
 810         dim = isl_space_add_dims(dim, isl_dim_in, len);
 811         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 812         bmap = isl_basic_map_universe(isl_space_copy(dim));
 813         ls = isl_local_space_from_space(dim);
 814
 815         for (i = 0; i < len - tile_len; ++i) {
 816                 int j = i < first ? i : i + tile_len;
 817                 int k = i < first ? i : i + 2 * tile_len;
 818
 819                 c = isl_equality_alloc(isl_local_space_copy(ls));
 820                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 821                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 822                 bmap = isl_basic_map_add_constraint(bmap, c);
 823         }
 824
 825         for (i = 0; i < tile_len; ++i) {
 826                 c = isl_equality_alloc(isl_local_space_copy(ls));
 827                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 828                                                 first + i, -1);
 829                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 830                                                 first + i, tile_size[i]);
 831                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 832                                                 first + i + tile_len, 1);
 833                 bmap = isl_basic_map_add_constraint(bmap, c);
 834
 835                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 836                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 837                                                    first + i + tile_len, 1);
 838                 bmap = isl_basic_map_add_constraint(bmap, c);
 839
 840                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 841                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 842                                                    first + i + tile_len, -1);
 843                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 844                 bmap = isl_basic_map_add_constraint(bmap, c);
 845         }
 846
 847         isl_local_space_free(ls);
 848
 849         return isl_map_from_basic_map(bmap);
 850 }
 851
 852 /* Construct a map from a domain of dimensionality "len"
 853  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 854  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 855  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 856  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 857  * that are projected out at the end.
 858  * "dim" prescribes the parameters.
 859  */
 860 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 861         int first, int wrap_len, int *wrap_size)
 862 {
 863         int i;
 864         isl_basic_map *bmap;
 865         isl_constraint *c;
 866         isl_local_space *ls;
 867
 868         dim = isl_space_add_dims(dim, isl_dim_in, len);
 869         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 870         bmap = isl_basic_map_universe(isl_space_copy(dim));
 871         ls = isl_local_space_from_space(dim);
 872
 873         for (i = 0; i < len; ++i) {
 874                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 875
 876                 c = isl_equality_alloc(isl_local_space_copy(ls));
 877                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 878                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 879                 bmap = isl_basic_map_add_constraint(bmap, c);
 880         }
 881
 882         for (i = 0; i < wrap_len; ++i) {
 883                 c = isl_equality_alloc(isl_local_space_copy(ls));
 884                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 885                                                     first + i, -1);
 886                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 887                                                     first + wrap_len + i, 1);
 888                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 889                                     first + 2 * wrap_len + i, wrap_size[i]);
 890                 bmap = isl_basic_map_add_constraint(bmap, c);
 891
 892                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 893                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 894                                                     first + wrap_len + i, 1);
 895                 bmap = isl_basic_map_add_constraint(bmap, c);
 896
 897                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 898                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 899                                                     first + wrap_len + i, -1);
 900                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 901                 bmap = isl_basic_map_add_constraint(bmap, c);
 902         }
 903
 904         isl_local_space_free(ls);
 905
 906         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 907                                 first + 2 * wrap_len, wrap_len);
 908
 909         return isl_map_from_basic_map(bmap);
 910 }
 911
 912 /* Add "n" parameters named prefix%d.
 913  */
 914 static __isl_give isl_set *add_params( __isl_take isl_set *set,
 915         int n, const char *prefix)
 916 {
 917         int i;
 918         unsigned nparam;
 919         char name[20];
 920
 921         nparam = isl_set_dim(set, isl_dim_param);
 922         set = isl_set_add_dims(set, isl_dim_param, n);
 923
 924         for (i = 0; i < n; ++i) {
 925                 snprintf(name, sizeof(name), "%s%d", prefix, i);
 926                 set = isl_set_set_dim_name(set, isl_dim_param,
 927                                             nparam + i, name);
 928         }
 929
 930         return set;
 931 }
 932
 933 /* Equate the "n" dimensions of "set" starting at "first" to
 934  * freshly created parameters named prefix%d.
 935  */
 936 static __isl_give isl_set *parametrize(__isl_take isl_set *set,
 937         int first, int n, const char *prefix)
 938 {
 939         int i;
 940         unsigned nparam;
 941
 942         nparam = isl_set_dim(set, isl_dim_param);
 943
 944         set = add_params(set, n, prefix);
 945
 946         for (i = 0; i < n; ++i)
 947                 set = isl_set_equate(set, isl_dim_param, nparam + i,
 948                                         isl_dim_set, first + i);
 949
 950         return set;
 951 }
 952
 953 /* Given a parameter space "space", create a set of dimension "len"
 954  * of which the "n" dimensions starting at "first" are equated to
 955  * freshly created parameters named prefix%d.
 956  */
 957 static __isl_give isl_set *parametrization(__isl_take isl_space *space,
 958         int len, int first, int n, const char *prefix)
 959 {
 960         isl_set *set;
 961
 962         space = isl_space_set_from_params(space);
 963         space = isl_space_add_dims(space, isl_dim_set, len);
 964         set = isl_set_universe(space);
 965
 966         return parametrize(set, first, n, prefix);
 967 }
 968
 969 /* Tile the B loops over the tile sizes and then tile/wrap
 970  * the T1 loops over the blocks.
 971  */
 972 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 973         __isl_take isl_union_map *sched)
 974 {
 975         isl_space *dim;
 976         isl_map *tiling, *block_tiling;
 977
 978         dim = isl_union_map_get_space(sched);
 979         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 980                       gen->tile_first, gen->tile_len, gen->tile_size);
 981
 982         if (gen->options->wrap)
 983                 block_tiling = wrap(dim, gen->untiled_len + gen->tile_len,
 984                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 985         else
 986                 block_tiling = tile(dim, gen->untiled_len + gen->tile_len,
 987                                 gen->tile_first, gen->n_grid, gen->grid_dim);
 988
 989         gen->tiled_len = gen->untiled_len + gen->tile_len + gen->n_grid;
 990
 991         tiling = isl_map_apply_range(tiling, block_tiling);
 992
 993         sched = isl_union_map_apply_range(sched,
 994                                              isl_union_map_from_map(tiling));
 995
 996         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
 997
 998         return sched;
 999 }
1000
1001 /* Equate the "T1P" iterators in the tiled schedule "sched"
1002  * to the block dimensions.
1003  */
1004 static __isl_give isl_union_map *parametrize_tiled_schedule(
1005         struct gpu_gen *gen, __isl_take isl_union_map *sched)
1006 {
1007         isl_space *dim;
1008         isl_set *par;
1009
1010         dim = isl_union_map_get_space(sched);
1011         par = parametrization(dim, gen->tiled_len,
1012                 gen->tile_first + gen->n_grid, gen->n_grid, "b");
1013         sched = isl_union_map_intersect_range(sched,
1014                                                 isl_union_set_from_set(par));
1015
1016         return sched;
1017 }
1018
1019 /* Tile/wrap the P1 loops over the threads.
1020  */
1021 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
1022         __isl_take isl_union_map *sched)
1023 {
1024         isl_space *dim;
1025         isl_map *tiling;
1026         isl_set *par;
1027
1028         dim = isl_union_map_get_space(sched);
1029
1030         if (gen->options->wrap)
1031                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
1032                                 gen->shared_len, gen->n_block, gen->block_dim);
1033         else
1034                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
1035                                 gen->shared_len, gen->n_block, gen->block_dim);
1036         gen->thread_tiled_len = gen->tiled_len + gen->n_block;
1037
1038         sched = isl_union_map_apply_range(sched,
1039                                              isl_union_map_from_map(tiling));
1040
1041         par = parametrization(dim, gen->thread_tiled_len,
1042                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1043                 gen->n_block, "t");
1044         sched = isl_union_map_intersect_range(sched,
1045                                                 isl_union_set_from_set(par));
1046
1047         gen->shared_len = gen->tile_first + gen->tile_len + gen->n_grid;
1048
1049         return sched;
1050 }
1051
1052 /* If the user asked for it, scale the shared memory tile loops
1053  * (T1T and T2) of "sched" by gen->tile_size[i].
1054  * If we are not performing "wrapping", then additionally scale the T1P
1055  * loops by gen->grid_dim[i].
1056  */
1057 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
1058         __isl_take isl_union_map *sched)
1059 {
1060         int i;
1061         isl_space *dim;
1062         isl_basic_map *scale;
1063         isl_constraint *c;
1064         isl_local_space *ls;
1065
1066         if (!gen->options->scale_tile_loops)
1067                 return sched;
1068
1069         dim = isl_union_map_get_space(sched);
1070         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
1071         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
1072         scale = isl_basic_map_universe(isl_space_copy(dim));
1073         ls = isl_local_space_from_space(dim);
1074
1075         for (i = 0; i < gen->tiled_len; ++i) {
1076                 int f = 1;
1077
1078                 if (i >= gen->tile_first && i < gen->tile_first + gen->n_grid) {
1079                         f = gen->tile_size[i - gen->tile_first];
1080                         if (!gen->options->wrap)
1081                                 f *= gen->grid_dim[i - gen->tile_first];
1082                 } else if (i >= gen->tile_first + gen->n_grid &&
1083                            i < gen->tile_first + gen->n_grid + gen->tile_len) {
1084                         f = gen->tile_size[i - (gen->tile_first + gen->n_grid)];
1085                 }
1086
1087                 c = isl_equality_alloc(isl_local_space_copy(ls));
1088                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1089                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1090                 scale = isl_basic_map_add_constraint(scale, c);
1091         }
1092
1093         isl_local_space_free(ls);
1094
1095         sched = isl_union_map_apply_range(sched,
1096                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1097
1098         return sched;
1099 }
1100
1101 /* If we are not performing "wrapping" and if the user asked for it,
1102  * scale the thread tile loops (P1T) of "sched" by gen->block_dim[i].
1103  */
1104 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
1105         __isl_take isl_union_map *sched)
1106 {
1107         int i;
1108         isl_space *dim;
1109         isl_basic_map *scale;
1110         isl_constraint *c;
1111         isl_local_space *ls;
1112
1113         if (gen->options->wrap)
1114                 return sched;
1115         if (!gen->options->scale_tile_loops)
1116                 return sched;
1117
1118         dim = isl_union_map_get_space(sched);
1119         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
1120         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
1121         scale = isl_basic_map_universe(isl_space_copy(dim));
1122         ls = isl_local_space_from_space(dim);
1123
1124         for (i = 0; i < gen->thread_tiled_len; ++i) {
1125                 int f = 1;
1126
1127                 if (i >= gen->shared_len &&
1128                     i < gen->shared_len + gen->n_block)
1129                         f = gen->block_dim[i - gen->shared_len];
1130
1131                 c = isl_equality_alloc(isl_local_space_copy(ls));
1132                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1133                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1134                 scale = isl_basic_map_add_constraint(scale, c);
1135         }
1136
1137         isl_local_space_free(ls);
1138
1139         sched = isl_union_map_apply_range(sched,
1140                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1141
1142         return sched;
1143 }
1144
1145 /* If we are not performing "wrapping" and if the user asked for it,
1146  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
1147  */
1148 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
1149         __isl_take isl_union_map *sched, int len, int first, int n_tile)
1150 {
1151         int i;
1152         isl_space *dim;
1153         isl_basic_map *scale;
1154         isl_constraint *c;
1155         isl_local_space *ls;
1156
1157         if (gen->options->wrap)
1158                 return sched;
1159         if (!gen->options->scale_tile_loops)
1160                 return sched;
1161
1162         dim = isl_union_map_get_space(sched);
1163         dim = isl_space_add_dims(dim, isl_dim_in, len);
1164         dim = isl_space_add_dims(dim, isl_dim_out, len);
1165         scale = isl_basic_map_universe(isl_space_copy(dim));
1166         ls = isl_local_space_from_space(dim);
1167
1168         for (i = 0; i < len; ++i) {
1169                 int f = 1;
1170
1171                 if (i >= first && i < first + n_tile)
1172                         f = gen->kernel->block_dim[i - first];
1173
1174                 c = isl_equality_alloc(isl_local_space_copy(ls));
1175                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
1176                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
1177                 scale = isl_basic_map_add_constraint(scale, c);
1178         }
1179
1180         isl_local_space_free(ls);
1181
1182         sched = isl_union_map_apply_range(sched,
1183                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
1184
1185         return sched;
1186 }
1187
1188 /* Add "len" parameters p[i] called prefix%d,
1189  * with bounds to 0 <= p[i] < size[i].
1190  */
1191 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1192         int len, int *size, const char *prefix)
1193 {
1194         int i;
1195         unsigned nparam;
1196         isl_space *dim;
1197         isl_basic_set *bset;
1198         isl_constraint *c;
1199         isl_local_space *ls;
1200         char name[20];
1201
1202         nparam = isl_set_dim(set, isl_dim_param);
1203         set = isl_set_add_dims(set, isl_dim_param, len);
1204
1205         for (i = 0; i < len; ++i) {
1206                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1207                 set = isl_set_set_dim_name(set, isl_dim_param,
1208                                             nparam + i, name);
1209         }
1210
1211         dim = isl_set_get_space(set);
1212         bset = isl_basic_set_universe(isl_space_copy(dim));
1213         ls = isl_local_space_from_space(dim);
1214
1215         for (i = 0; i < len; ++i) {
1216                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1217                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1218                                                         nparam + i, 1);
1219                 bset = isl_basic_set_add_constraint(bset, c);
1220
1221                 c = isl_inequality_alloc(isl_local_space_copy(ls));
1222                 c = isl_constraint_set_coefficient_si(c, isl_dim_param,
1223                                                         nparam + i, -1);
1224                 c = isl_constraint_set_constant_si(c, size[i] - 1);
1225                 bset = isl_basic_set_add_constraint(bset, c);
1226         }
1227
1228         isl_local_space_free(ls);
1229
1230         return isl_set_intersect(set, isl_set_from_basic_set(bset));
1231 }
1232
1233 /* Add "len" parameters p[i] called prefix%d,
1234  * with bounds to 0 <= p[i] < size[i].
1235  */
1236 static __isl_give isl_set *add_bounded_parameters_dynamic(
1237         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1238         const char *prefix)
1239 {
1240         int i, len;
1241         unsigned nparam;
1242         isl_space *space;
1243         isl_local_space *ls;
1244         char name[20];
1245
1246         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1247         nparam = isl_set_dim(set, isl_dim_param);
1248         set = isl_set_add_dims(set, isl_dim_param, len);
1249
1250         for (i = 0; i < len; ++i) {
1251                 snprintf(name, sizeof(name), "%s%d", prefix, i);
1252                 set = isl_set_set_dim_name(set, isl_dim_param,
1253                                             nparam + i, name);
1254         }
1255
1256         space = isl_space_params(isl_set_get_space(set));
1257         ls = isl_local_space_from_space(space);
1258         for (i = 0; i < len; ++i) {
1259                 isl_pw_aff *param, *size_i, *zero;
1260                 isl_set *bound;
1261
1262                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1263                                                 isl_dim_param, nparam + i);
1264
1265                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1266                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1267                 set = isl_set_intersect_params(set, bound);
1268
1269                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1270                 bound = isl_pw_aff_ge_set(param, zero);
1271                 set = isl_set_intersect_params(set, bound);
1272         }
1273         isl_local_space_free(ls);
1274
1275         return set;
1276 }
1277
1278 /* Construct a map from an access to group->array to the corresponding
1279  * shared/private memory tile.
1280  * The map is of the form
1281  *
1282  *      { [D[i] -> A[a]] -> T[t] }
1283  *
1284  * where D represents the initial shared_len dimensions
1285  * of the computed schedule.
1286  */
1287 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1288 {
1289         struct gpu_array_tile *tile;
1290         isl_multi_aff *tiling;
1291
1292         tile = group->private_tile;
1293         if (!tile)
1294                 tile = group->shared_tile;
1295
1296         tiling = isl_multi_aff_copy(tile->tiling);
1297
1298         return isl_map_from_multi_aff(tiling);
1299 }
1300
1301 /* Does "map" have an obviously fixed value at variable "pos" of "type"?
1302  */
1303 static int map_plain_is_fixed(isl_map *map, enum isl_dim_type type,
1304         unsigned pos)
1305 {
1306         isl_val *v;
1307         int fixed;
1308
1309         v = isl_map_plain_get_val_if_fixed(map, type, pos);
1310         if (!v)
1311                 return -1;
1312         fixed = isl_val_is_int(v);
1313         isl_val_free(v);
1314
1315         return fixed;
1316 }
1317
1318 /* Given a schedule that iterates over all elements in a piece of an array,
1319  * perform tiling/wrapping over the threads.
1320  *
1321  * In particular, we tile the final iterators so that the final thread
1322  * dimension runs over the final array dimension.
1323  * However, if those final iterators have only a single iteration,
1324  * we try to tile earlier iterators instead.
1325  */
1326 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1327         __isl_take isl_map *sched)
1328 {
1329         isl_space *dim;
1330         isl_union_map *usched;
1331         isl_map *tiling;
1332         isl_set *par;
1333         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1334         int n_tile;
1335         int first;
1336
1337         n_tile = gen->kernel->n_block;
1338         if (n_tile > nvar) {
1339                 int i;
1340                 sched = isl_map_insert_dims(sched,
1341                                                 isl_dim_out, 0, n_tile - nvar);
1342                 for (i = 0; i < n_tile - nvar; ++i)
1343                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1344                 nvar = n_tile;
1345         }
1346
1347         first = nvar - n_tile;
1348
1349         for (; first > 0; first --)
1350                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1351                         break;
1352
1353         dim = isl_map_get_space(sched);
1354         dim = isl_space_params(dim);
1355         if (gen->options->wrap)
1356                 tiling = wrap(isl_space_copy(dim), nvar, first,
1357                                 n_tile, gen->kernel->block_dim);
1358         else
1359                 tiling = tile(isl_space_copy(dim), nvar, first,
1360                                 n_tile, gen->kernel->block_dim);
1361         sched = isl_map_apply_range(sched, tiling);
1362
1363         par = parametrization(dim, nvar + n_tile, first + n_tile, n_tile, "t");
1364         sched = isl_map_intersect_range(sched, par);
1365
1366         usched = isl_union_map_from_map(sched);
1367         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1368                                          first, n_tile);
1369         sched = isl_map_from_union_map(usched);
1370
1371         return sched;
1372 }
1373
1374 /* Return the union of all read (read = 1) and/or write (write = 1)
1375  * access relations in the group.
1376  */
1377 static __isl_give isl_union_map *group_access_relation(
1378         struct gpu_array_ref_group *group, int read, int write)
1379 {
1380         int i;
1381         isl_union_map *access;
1382
1383         access = isl_union_map_empty(isl_map_get_space(group->access));
1384         for (i = 0; i < group->n_ref; ++i) {
1385                 isl_map *map_i;
1386
1387                 if (!((read && group->refs[i]->read) ||
1388                      (write && group->refs[i]->write)))
1389                         continue;
1390                 map_i = isl_map_copy(group->refs[i]->access);
1391                 access = isl_union_map_union(access,
1392                                             isl_union_map_from_map(map_i));
1393         }
1394
1395         return access;
1396 }
1397
1398 /* Return the extent of "array", recomputed from the bounds.
1399  * The recomputed extent may be simpler than the original extent.
1400  */
1401 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1402 {
1403         int i;
1404         isl_id *id;
1405         isl_space *space;
1406         isl_local_space *ls;
1407         isl_set *extent;
1408
1409         id = isl_set_get_tuple_id(array->extent);
1410         space = isl_set_get_space(array->extent);
1411         extent = isl_set_universe(isl_space_copy(space));
1412         ls = isl_local_space_from_space(space);
1413         for (i = 0; i < array->n_index; ++i) {
1414                 isl_pw_aff *bound;
1415                 isl_aff *aff;
1416                 isl_pw_aff *index;
1417                 isl_set *lt;
1418
1419                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1420
1421                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1422                                                 isl_dim_set, i);
1423                 index = isl_pw_aff_from_aff(aff);
1424                 bound = isl_pw_aff_copy(array->bound[i]);
1425                 bound = isl_pw_aff_from_range(bound);
1426                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1427                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1428                                                 isl_id_copy(id));
1429                 lt = isl_pw_aff_lt_set(index, bound);
1430                 extent = isl_set_intersect(extent, lt);
1431         }
1432         isl_local_space_free(ls);
1433         isl_id_free(id);
1434
1435         return extent;
1436 }
1437
1438 /* Return a map from the first shared_len dimensions of the computed
1439  * schedule to the array tile in
1440  * global memory that corresponds to the shared memory copy.
1441  *
1442  * In particular, return a map
1443  *
1444  *      { D[i] -> A[a] }
1445  *
1446  * with constraints
1447  *
1448  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1449  *
1450  * and
1451  *
1452  *      0 <= a <= array_size - 1                                        (2)
1453  *
1454  * Note that if some stride has been detected (i.e., when
1455  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1456  * to the shifted and scaled down version.
1457  *
1458  * Constraints (1) are obtained by mapping the size constraints on the
1459  * shared/private memory tile back to the access relation.
1460  * Constraints (2) are obtained from the (recomputed) extent.
1461  */
1462 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1463 {
1464         int i;
1465         int n_index = group->array->n_index;
1466         isl_map *tile;
1467         isl_space *space;
1468         isl_set *local;
1469         isl_set *extent;
1470
1471         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1472         space = isl_space_range(space);
1473         local = isl_set_universe(space);
1474         for (i = 0; i < n_index; ++i) {
1475                 isl_val *bound;
1476
1477                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1478                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1479                 bound = isl_val_sub_ui(bound, 1);
1480                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1481         }
1482         local = isl_set_preimage_multi_aff(local,
1483                                 isl_multi_aff_copy(group->shared_tile->tiling));
1484         tile = isl_set_unwrap(local);
1485         extent = array_extent(group->array);
1486         tile = isl_map_intersect_range(tile, extent);
1487
1488         return tile;
1489 }
1490
1491 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1492  * return the corresponding mapping from the AST schedule to
1493  * to the first shared_len dimensions of the schedule computed by PPCG.
1494  */
1495 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1496         __isl_take isl_pw_multi_aff *iterator_map)
1497 {
1498         isl_union_map *umap;
1499         isl_space *space;
1500         isl_map *map, *sched;;
1501
1502         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1503         space = isl_space_from_domain(space);
1504         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1505
1506         umap = isl_union_map_copy(gen->shared_sched);
1507         umap = isl_union_map_apply_range(umap,
1508                         isl_union_map_copy(gen->shared_proj));
1509         map = isl_union_map_extract_map(umap, space);
1510         isl_union_map_free(umap);
1511
1512         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1513         sched = isl_map_detect_equalities(sched);
1514
1515         return isl_pw_multi_aff_from_map(sched);
1516 }
1517
1518 /* Set unroll[j] if the input dimension j is involved in
1519  * the index expression represented by ma.
1520  */
1521 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1522         void *user)
1523 {
1524         int i, j;
1525         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1526         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1527         int *unroll = user;
1528
1529         for (i = 0; i < n_out; ++i) {
1530                 isl_aff *aff;
1531
1532                 aff = isl_multi_aff_get_aff(ma, i);
1533                 for (j = 0; j < n_in; ++j)
1534                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1535                                 unroll[j] = 1;
1536                 isl_aff_free(aff);
1537         }
1538
1539         isl_set_free(set);
1540         isl_multi_aff_free(ma);
1541         return 0;
1542 }
1543
1544 /* Given an array pos mapping input dimensions to the corresponding
1545  * output dimension, construct the corresponding map.
1546  */
1547 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1548         int *pos, int len)
1549 {
1550         int i;
1551         isl_constraint *c;
1552         isl_basic_map *bmap;
1553         isl_local_space *ls;
1554
1555         dim = isl_space_add_dims(dim, isl_dim_in, len);
1556         dim = isl_space_add_dims(dim, isl_dim_out, len);
1557         bmap = isl_basic_map_universe(isl_space_copy(dim));
1558         ls = isl_local_space_from_space(dim);
1559
1560         for (i = 0; i < len; ++i) {
1561                 c = isl_equality_alloc(isl_local_space_copy(ls));
1562                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1563                                                       -1);
1564                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1565                                                       1);
1566                 bmap = isl_basic_map_add_constraint(bmap, c);
1567         }
1568         isl_local_space_free(ls);
1569
1570         return isl_map_from_basic_map(bmap);
1571 }
1572
1573 /* Find all loops involved in any of the index expressions for any of
1574  * the private accesses, move them innermost and then mark them as
1575  * requiring unrolling by setting gen->first_unroll.
1576  * The loops involved should all be parallel because of the checks
1577  * we performed in check_private_group_access.  Moving them innermost
1578  * is therefore a valid transformation.
1579  *
1580  * Loops up to gen->shared_len are generated before the mapping to
1581  * threads is applied.  They should therefore be ignored.
1582  *
1583  * We compute the hidden equalities of the schedule first
1584  * since we will need them in our calls to isl_pw_multi_aff_from_map
1585  * and because we want to make sure that the same equalities
1586  * are also available to the code generator.
1587  */
1588 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1589         __isl_take isl_union_map *sched)
1590 {
1591         int i, j;
1592         int unroll[gen->thread_tiled_len];
1593         int perm[gen->thread_tiled_len];
1594         isl_space *dim;
1595         isl_map *permute;
1596         int len = gen->shared_len + gen->n_parallel + gen->n_block;
1597
1598         gen->first_unroll = -1;
1599
1600         sched = isl_union_map_detect_equalities(sched);
1601         for (i = 0; i < gen->thread_tiled_len; ++i)
1602                 unroll[i] = 0;
1603         for (i = 0; i < gen->prog->n_array; ++i) {
1604                 struct gpu_array_info *array = &gen->prog->array[i];
1605
1606                 for (j = 0; j < array->n_group; ++j) {
1607                         isl_union_map *access;
1608                         isl_map *acc;
1609                         isl_pw_multi_aff *pma;
1610
1611                         if (!array->groups[j]->private_tile)
1612                                 continue;
1613
1614                         access = group_access_relation(array->groups[j], 1, 1);
1615                         access = isl_union_map_apply_domain(access,
1616                                                 isl_union_map_copy(sched));
1617
1618                         acc = isl_map_from_union_map(access);
1619                         pma = isl_pw_multi_aff_from_map(acc);
1620                         isl_pw_multi_aff_foreach_piece(pma,
1621                                                         &check_unroll, unroll);
1622
1623                         isl_pw_multi_aff_free(pma);
1624                 }
1625         }
1626
1627         for (i = gen->shared_len; i < len; ++i)
1628                 if (unroll[i])
1629                         break;
1630
1631         if (i >= len)
1632                 return sched;
1633
1634         for (i = len; i < gen->thread_tiled_len; ++i)
1635                 if (unroll[i])
1636                         return sched;
1637
1638         j = 0;
1639         for (i = 0; i < gen->shared_len; ++i)
1640                 perm[i] = j++;
1641         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1642                 if (!unroll[i])
1643                         perm[i] = j++;
1644         gen->first_unroll = j - gen->shared_len;
1645         for (i = gen->shared_len; i < len; ++i)
1646                 if (unroll[i])
1647                         perm[i] = j++;
1648
1649         dim = isl_union_map_get_space(sched);
1650         permute = permutation(dim, perm, gen->thread_tiled_len);
1651         sched = isl_union_map_apply_range(sched,
1652                                           isl_union_map_from_map(permute));
1653
1654         return sched;
1655 }
1656
1657 /* Given a constraint
1658  *
1659  *              a(p,i) + j = g f(e)
1660  *
1661  * or -a(p,i) - j = g f(e) if sign < 0,
1662  * store a(p,i) in bound->shift and g (stride) in bound->stride.
1663  * a(p,i) is assumed to be an expression in only the parameters
1664  * and the input dimensions.
1665  */
1666 static void extract_stride(__isl_keep isl_constraint *c,
1667         struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign)
1668 {
1669         int i;
1670         isl_val *v;
1671         isl_space *space;
1672         unsigned nparam;
1673         unsigned nvar;
1674         isl_aff *aff;
1675
1676         isl_val_free(bound->stride);
1677         bound->stride = isl_val_copy(stride);
1678
1679         space = isl_constraint_get_space(c);
1680         space = isl_space_domain(space);
1681
1682         nparam = isl_space_dim(space, isl_dim_param);
1683         nvar = isl_space_dim(space, isl_dim_set);
1684
1685         v = isl_constraint_get_constant_val(c);
1686         if (sign < 0)
1687                 v = isl_val_neg(v);
1688         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1689         aff = isl_aff_set_constant_val(aff, v);
1690
1691         for (i = 0; i < nparam; ++i) {
1692                 if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1))
1693                         continue;
1694                 v = isl_constraint_get_coefficient_val(c, isl_dim_param, i);
1695                 if (sign < 0)
1696                         v = isl_val_neg(v);
1697                 aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v);
1698         }
1699
1700         for (i = 0; i < nvar; ++i) {
1701                 if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1))
1702                         continue;
1703                 v = isl_constraint_get_coefficient_val(c, isl_dim_in, i);
1704                 if (sign < 0)
1705                         v = isl_val_neg(v);
1706                 aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v);
1707         }
1708
1709         bound->shift = aff;
1710 }
1711
1712 /* Given an equality constraint of a map with a single output dimension j,
1713  * check if the constraint is of the form
1714  *
1715  *              a(p,i) + j = g f(e)
1716  *
1717  * with a(p,i) an expression in the parameters and input dimensions
1718  * and f(e) an expression in the existentially quantified variables.
1719  * If so, and if g is larger than any such g from a previously considered
1720  * constraint, then call extract_stride to record the stride information
1721  * in bound.
1722  */
1723 static int check_stride_constraint(__isl_take isl_constraint *c, void *user)
1724 {
1725         int i;
1726         isl_ctx *ctx;
1727         isl_val *v;
1728         unsigned n_div;
1729         struct gpu_array_bound *bound = user;
1730
1731         ctx = isl_constraint_get_ctx(c);
1732         n_div = isl_constraint_dim(c, isl_dim_div);
1733         v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0);
1734
1735         if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) {
1736                 int s = isl_val_sgn(v);
1737                 isl_val *stride = isl_val_zero(ctx);
1738
1739                 isl_val_free(v);
1740                 for (i = 0; i < n_div; ++i) {
1741                         v = isl_constraint_get_coefficient_val(c,
1742                                                                 isl_dim_div, i);
1743                         stride = isl_val_gcd(stride, v);
1744                 }
1745                 if (!isl_val_is_zero(stride) &&
1746                     isl_val_gt(stride, bound->stride))
1747                         extract_stride(c, bound, stride, s);
1748
1749                 isl_val_free(stride);
1750         } else
1751                 isl_val_free(v);
1752
1753         isl_constraint_free(c);
1754         return 0;
1755 }
1756
1757 /* Given contraints on an array index i, check if we can find
1758  * a shift a(p) and a stride g such that
1759  *
1760  *      a(p) + i = 0 mod g
1761  *
1762  * If so, record the information in bound and apply the mapping
1763  * i -> (i + a(p))/g to the array index in bounds and return
1764  * the new constraints.
1765  * If not, simply return the original constraints.
1766  *
1767  * If bounds is a subset of the space
1768  *
1769  *      D -> i
1770  *
1771  * then the bound recorded in bound->shift is of the form
1772  *
1773  *      D -> s(D)
1774  *
1775  * with s(D) equal to a(p) above.
1776  * The mapping recorded in bound->shift_map is of the form
1777  *
1778  *      [D -> i] -> [D -> (i + S(D))/g]
1779  *
1780  * This mapping is computed as follows.
1781  * We first introduce "i" in the domain through precomposition
1782  * with [D -> i] -> D obtaining
1783  *
1784  *      [D -> i] -> s(D)
1785  *
1786  * Adding [D -> i] -> i produces
1787  *
1788  *      [D -> i] -> i + s(D)
1789  *
1790  * and the domain product with [D -> i] -> D yields
1791  *
1792  *      [D -> i] -> [D -> i + s(D)]
1793  *
1794  * Composition with [D -> i] -> [D -> i/g] gives the desired result.
1795  */
1796 static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound,
1797         __isl_take isl_basic_map *bounds)
1798 {
1799         isl_space *space;
1800         isl_basic_map *hull;
1801         isl_basic_map *shift, *id, *bmap, *scale;
1802         isl_basic_set *bset;
1803         isl_aff *aff;
1804
1805         bound->stride = NULL;
1806
1807         hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds));
1808
1809         isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound);
1810
1811         isl_basic_map_free(hull);
1812
1813         if (!bound->stride)
1814                 return bounds;
1815
1816         shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift));
1817         space = isl_basic_map_get_space(bounds);
1818         bmap = isl_basic_map_domain_map(isl_basic_map_universe(space));
1819         shift = isl_basic_map_apply_range(bmap, shift);
1820         space = isl_basic_map_get_space(bounds);
1821         id = isl_basic_map_range_map(isl_basic_map_universe(space));
1822         shift = isl_basic_map_sum(id, shift);
1823         space = isl_basic_map_get_space(bounds);
1824         id = isl_basic_map_domain_map(isl_basic_map_universe(space));
1825         shift = isl_basic_map_range_product(id, shift);
1826
1827         space = isl_space_domain(isl_basic_map_get_space(bounds));
1828         id = isl_basic_map_identity(isl_space_map_from_set(space));
1829         space = isl_space_range(isl_basic_map_get_space(bounds));
1830         aff = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1831         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1);
1832         aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride));
1833         scale = isl_basic_map_from_aff(aff);
1834         scale = isl_basic_map_product(id, scale);
1835
1836         bound->shift_map = isl_basic_map_apply_range(shift, scale);
1837         bmap = isl_basic_map_copy(bound->shift_map);
1838         bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap);
1839         bounds = isl_basic_set_unwrap(bset);
1840
1841         return bounds;
1842 }
1843
/* Data used in compute_array_dim_size and compute_size_in_direction.
 *
 * bset is the basic set whose constraints are inspected and over which
 * the candidate size expressions are maximized.
 * bound is the array bound in which the resulting size and lower bound
 * are stored.
 * pos is the position of the variable representing the array index,
 * i.e., the variable for which we want to compute the size.  This variable
 * is also the last variable in the set.
 */
struct gpu_size_info {
	isl_basic_set *bset;
	struct gpu_array_bound *bound;
	int pos;
};
1855
1856 /* Given a constraint from the basic set describing the bounds on
1857  * an array index, check if it is a lower bound, say m i >= b(x), and,
1858  * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant
1859  * upper bound.  If so, and if this bound is smaller than any bound
1860  * derived from earlier constraints, set the size to this bound on
1861  * the expression and the lower bound to ceil(b(x)/m).
1862  */
1863 static int compute_size_in_direction(__isl_take isl_constraint *c, void *user)
1864 {
1865         struct gpu_size_info *size = user;
1866         unsigned nparam;
1867         unsigned n_div;
1868         isl_val *v;
1869         isl_aff *aff;
1870         isl_aff *lb;
1871
1872         nparam = isl_basic_set_dim(size->bset, isl_dim_param);
1873         n_div = isl_constraint_dim(c, isl_dim_div);
1874
1875         if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) ||
1876             !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) {
1877                 isl_constraint_free(c);
1878                 return 0;
1879         }
1880
1881         aff = isl_constraint_get_bound(c, isl_dim_set, size->pos);
1882         aff = isl_aff_ceil(aff);
1883
1884         lb = isl_aff_copy(aff);
1885
1886         aff = isl_aff_neg(aff);
1887         aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1);
1888
1889         v = isl_basic_set_max_val(size->bset, aff);
1890         isl_aff_free(aff);
1891
1892         if (isl_val_is_int(v)) {
1893                 v = isl_val_add_ui(v, 1);
1894                 if (!size->bound->size || isl_val_lt(v, size->bound->size)) {
1895                         isl_val_free(size->bound->size);
1896                         size->bound->size = isl_val_copy(v);
1897                         lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1);
1898                         isl_aff_free(size->bound->lb);
1899                         size->bound->lb = isl_aff_copy(lb);
1900                 }
1901         }
1902         isl_val_free(v);
1903         isl_aff_free(lb);
1904
1905         isl_constraint_free(c);
1906
1907         return 0;
1908 }
1909
1910 /* Given a basic map "bounds" that maps parameters and input dimensions
1911  * to a single output dimension, look for an expression in the parameters
1912  * and input dimensions such that the range of the output dimension shifted
1913  * by this expression is a constant.
1914  *
1915  * In particular, we currently only consider lower bounds on the output
1916  * dimension as candidate expressions.
1917  */
1918 static int compute_array_dim_size(struct gpu_array_bound *bound,
1919         __isl_take isl_basic_map *bounds)
1920 {
1921         struct gpu_size_info size;
1922
1923         bounds = isl_basic_map_detect_equalities(bounds);
1924         bounds = check_stride(bound, bounds);
1925
1926         bound->size = NULL;
1927         bound->lb = NULL;
1928
1929         size.bound = bound;
1930         size.pos = isl_basic_map_dim(bounds, isl_dim_in);
1931         size.bset = isl_basic_map_wrap(bounds);
1932         size.bset = isl_basic_set_flatten(size.bset);
1933         size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset));
1934         isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction,
1935                                         &size);
1936         isl_basic_set_free(size.bset);
1937
1938         return bound->size ? 0 : -1;
1939 }
1940
1941 /* Check if we can find a memory tile for the given array
1942  * based on the given accesses, and if so, put the results in "tile".
1943  *
1944  * We project the accesses on each index in turn and look for a parametric
1945  * offset such that the size is constant.
1946  */
1947 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
1948 {
1949         int i;
1950
1951         for (i = 0; i < tile->n; ++i) {
1952                 isl_map *access_i;
1953                 isl_basic_map *hull;
1954
1955                 access_i = isl_map_copy(access);
1956                 access_i = isl_map_project_out(access_i, isl_dim_out, 0, i);
1957                 access_i = isl_map_project_out(access_i, isl_dim_out,
1958                                             1, tile->n - (i + 1));
1959                 access_i = isl_map_compute_divs(access_i);
1960                 hull = isl_map_simple_hull(access_i);
1961                 if (compute_array_dim_size(&tile->bound[i], hull) < 0)
1962                         return 0;
1963         }
1964
1965         return 1;
1966 }
1967
1968 /* Construct a map with input the shared tile loops and the loops that
1969  * will be wrapped around the threads that relates these later loops
1970  * to the thread indices and then projects them out.
1971  */
1972 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1973 {
1974         isl_map *priv;
1975         isl_map *tiling;
1976         isl_map *proj;
1977         isl_set *par;
1978         isl_space *dim;
1979
1980         dim = isl_union_map_get_space(gen->shared_sched);
1981
1982         if (gen->options->wrap)
1983                 tiling = wrap(isl_space_copy(dim), gen->shared_len + gen->n_block,
1984                                 gen->shared_len, gen->n_block, gen->block_dim);
1985         else
1986                 tiling = tile(isl_space_copy(dim), gen->shared_len + gen->n_block,
1987                                 gen->shared_len, gen->n_block, gen->block_dim);
1988
1989         priv = tiling;
1990
1991         par = parametrization(dim, gen->shared_len + 2 * gen->n_block,
1992                 gen->tile_first + gen->tile_len + gen->n_grid + gen->n_block,
1993                 gen->n_block, "t");
1994
1995         priv = isl_map_align_params(priv, isl_set_get_space(par));
1996         priv = isl_map_intersect_range(priv, par);
1997
1998         dim = isl_map_get_space(priv);
1999         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
2000         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
2001         proj = projection(dim, gen->shared_len + 2 * gen->n_block,
2002                           gen->shared_len);
2003
2004         priv = isl_map_apply_range(priv, proj);
2005
2006         return priv;
2007 }
2008
2009 /* Construct a map from domain_dim to domain_dim that increments
2010  * the dimension at position "pos" and leaves all other dimensions
2011  * constant.
2012  */
2013 static __isl_give isl_map *next(__isl_take isl_space *domain_dim, int pos)
2014 {
2015         int i;
2016         int len = isl_space_dim(domain_dim, isl_dim_set);
2017         isl_space *dim;
2018         isl_basic_map *next;
2019         isl_local_space *ls;
2020
2021         dim = isl_space_map_from_set(domain_dim);
2022         next = isl_basic_map_universe(isl_space_copy(dim));
2023         ls = isl_local_space_from_space(dim);
2024
2025         for (i = 0; i < len; ++i) {
2026                 isl_constraint *c;
2027
2028                 c = isl_equality_alloc(isl_local_space_copy(ls));
2029                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, 1);
2030                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
2031                 if (i == pos)
2032                         c = isl_constraint_set_constant_si(c, 1);
2033                 next = isl_basic_map_add_constraint(next, c);
2034         }
2035
2036         isl_local_space_free(ls);
2037
2038         return isl_map_from_basic_map(next);
2039 }
2040
2041 /* Check if the given access is coalesced.
2042  * That is, check whether incrementing the dimension that will get
2043  * wrapped over the last thread index results in incrementing
2044  * the last array index.
2045  *
2046  * This function is only called for access relations without reuse.
2047  */
2048 static int access_is_coalesced(struct gpu_gen *gen,
2049         __isl_keep isl_union_map *access)
2050 {
2051         isl_space *dim;
2052         isl_map *access_map;
2053         isl_map *next_thread_x;
2054         isl_map *next_element;
2055         isl_map *map;
2056         int coalesced;
2057
2058         access = isl_union_map_copy(access);
2059         access = isl_union_map_apply_domain(access,
2060                                 isl_union_map_copy(gen->tiled_sched));
2061         access_map = isl_map_from_union_map(access);
2062
2063         dim = isl_map_get_space(access_map);
2064         dim = isl_space_domain(dim);
2065         next_thread_x = next(dim, gen->shared_len + gen->n_block - 1);
2066
2067         dim = isl_map_get_space(access_map);
2068         dim = isl_space_range(dim);
2069         next_element = next(dim, isl_space_dim(dim, isl_dim_set) - 1);
2070
2071         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
2072         map = isl_map_apply_range(map, access_map);
2073
2074         coalesced = isl_map_is_subset(map, next_element);
2075
2076         isl_map_free(next_element);
2077         isl_map_free(map);
2078
2079         return coalesced;
2080 }
2081
2082 /* Given an access relation in terms of the first gen->shared_len + gen->n_block
2083  * dimensions of the computed schedule, check if it is bijective for
2084  * fixed values of the first gen->shared_len dimensions.
2085  * We perform this check by equating these dimensions to parameters.
2086  */
2087 static int access_is_bijective(struct gpu_gen *gen, __isl_keep isl_map *access)
2088 {
2089         int res;
2090         isl_set *par;
2091         isl_space *space;
2092
2093         access = isl_map_copy(access);
2094         space = isl_space_params(isl_map_get_space(access));
2095         par = parametrization(space, gen->shared_len + gen->n_block,
2096                                 0, gen->shared_len, "s");
2097         access = isl_map_intersect_domain(access, par);
2098         res = isl_map_is_bijective(access);
2099         isl_map_free(access);
2100
2101         return res;
2102 }
2103
2104 /* Look for the last shared tile loop that affects the offset of "tile"
2105  * and return the result.
2106  * If there is no such loop, then return the index of the loop
2107  * before the first shared tile loop, in particular gen->tile_first - 1.
2108  */
2109 static int compute_tile_last_shared(struct gpu_gen *gen,
2110         struct gpu_array_tile *tile)
2111 {
2112         int i, j;
2113
2114         for (j = gen->shared_len - 1; j >= gen->tile_first; --j) {
2115                 for (i = 0; i < tile->n; ++i) {
2116                         isl_aff *lb;
2117                         isl_aff *shift;
2118
2119                         lb = tile->bound[i].lb;
2120                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
2121                                 break;
2122
2123                         shift = tile->bound[i].shift;
2124                         if (!shift)
2125                                 continue;
2126                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
2127                                 break;
2128                 }
2129                 if (i < tile->n)
2130                         break;
2131         }
2132
2133         return j;
2134 }
2135
2136 /* Look for the last shared tile loop that affects the offset of the
2137  * shared or private tile and store the result in group->last_shared.
2138  * If there is no such loop, then group->last_shared is set to a value
2139  * before the first shared tile loop, in particular gen->tile_first - 1.
2140  * If there is no tile defined on the array reference group,
2141  * then set group->last_shared to gen->shared_len - 1.
2142  */
2143 static void set_last_shared(struct gpu_gen *gen,
2144         struct gpu_array_ref_group *group)
2145 {
2146         struct gpu_array_tile *tile;
2147
2148         group->last_shared = gen->shared_len - 1;
2149
2150         tile = group->private_tile;
2151         if (!tile)
2152                 tile = group->shared_tile;
2153         if (!tile)
2154                 return;
2155
2156         group->last_shared = compute_tile_last_shared(gen, tile);
2157 }
2158
2159 /* Compute a privatized copy of all access relations from reference groups that
2160  * are mapped to private memory and store the result in gen->privatization.
2161  *
2162  * Read-only scalars and arrays containing structures are not mapped
2163  * to private memory.
2164  */
2165 static void compute_private_access(struct gpu_gen *gen)
2166 {
2167         int i, j;
2168         isl_union_map *private;
2169
2170         if (!gen->options->use_private_memory)
2171                 return;
2172
2173         private = isl_union_map_empty(isl_union_map_get_space(gen->shared_sched));
2174
2175         for (i = 0; i < gen->prog->n_array; ++i) {
2176                 struct gpu_array_info *array = &gen->prog->array[i];
2177
2178                 if (gpu_array_is_read_only_scalar(array))
2179                         continue;
2180                 if (array->has_compound_element)
2181                         continue;
2182
2183                 for (j = 0; j < array->n_group; ++j) {
2184                         if (!array->groups[j]->private_tile)
2185                                 continue;
2186
2187                         private = isl_union_map_union(private,
2188                                 group_access_relation(array->groups[j], 1, 1));
2189                 }
2190         }
2191
2192         if (isl_union_map_is_empty(private))
2193                 isl_union_map_free(private);
2194         else {
2195                 isl_union_map *priv;
2196
2197                 private = isl_union_map_apply_domain(private,
2198                                         isl_union_map_copy(gen->shared_sched));
2199                 priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2200                 private = isl_union_map_apply_domain(private, priv);
2201                 gen->private_access = private;
2202         }
2203 }
2204
2205 /* Compute the size of the tile specified by "tile"
2206  * in number of elements and return the result.
2207  */
2208 static __isl_give isl_val *tile_size(isl_ctx *ctx, struct gpu_array_tile *tile)
2209 {
2210         int i;
2211         isl_val *size;
2212
2213         size = isl_val_one(ctx);
2214
2215         for (i = 0; i < tile->n; ++i)
2216                 size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
2217
2218         return size;
2219 }
2220
2221 /* If max_shared_memory is not set to infinity (-1), then make
2222  * sure that the total amount of shared memory required by the
2223  * array reference groups mapped to shared memory is no larger
2224  * than this maximum.
2225  *
2226  * We apply a greedy approach and discard (keep in global memory)
2227  * those groups that would result in a total memory size that
2228  * is larger than the maximum.
2229  */
2230 static void check_shared_memory_bound(struct gpu_gen *gen)
2231 {
2232         int i, j;
2233         isl_val *left, *size;
2234
2235         if (gen->options->max_shared_memory < 0)
2236                 return;
2237
2238         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
2239
2240         for (i = 0; i < gen->prog->n_array; ++i) {
2241                 struct gpu_array_info *array = &gen->prog->array[i];
2242
2243                 for (j = 0; j < array->n_group; ++j) {
2244                         struct gpu_array_ref_group *group;
2245
2246                         group = array->groups[j];
2247                         if (group->private_tile)
2248                                 continue;
2249                         if (!group->shared_tile)
2250                                 continue;
2251
2252                         size = tile_size(gen->ctx, group->shared_tile);
2253                         size = isl_val_mul_ui(size, array->size);
2254
2255                         if (isl_val_le(size, left)) {
2256                                 left = isl_val_sub(left, size);
2257                                 continue;
2258                         }
2259                         isl_val_free(size);
2260
2261                         group->shared_tile = free_tile(group->shared_tile);
2262                 }
2263         }
2264
2265         isl_val_free(left);
2266 }
2267
2268 /* Given a description of an array tile "tile" and the "space"
2269  *
2270  *      { D -> A }
2271  *
2272  * where D represents the first shared_len schedule dimensions
2273  * and A represents the array, construct an isl_multi_aff
2274  *
2275  *      { [D[i] -> A[a]] -> A'[a'] }
2276  *
2277  * with A' a scaled down copy of A according to the shifts and strides
2278  * in "tile".  In particular,
2279  *
2280  *      a' = (a + shift(i))/stride
2281  *
2282  * "insert_array" represents
2283  *
2284  *      { [D -> A] -> D }
2285  *
2286  * and is used to insert A into the domain of functions that only
2287  * reference D.
2288  */
2289 static __isl_give isl_multi_aff *strided_tile(
2290         struct gpu_array_tile *tile, __isl_keep isl_space *space,
2291         __isl_keep isl_multi_aff *insert_array)
2292 {
2293         int i;
2294         isl_ctx *ctx;
2295         isl_multi_aff *shift;
2296         isl_multi_val *stride;
2297         isl_space *space2;
2298         isl_local_space *ls;
2299         isl_multi_aff *tiling;
2300
2301         ctx = isl_space_get_ctx(space);
2302         space2 = isl_space_domain(isl_space_copy(space));
2303         ls = isl_local_space_from_space(space2);
2304         space2 = isl_space_range(isl_space_copy(space));
2305         stride = isl_multi_val_zero(space2);
2306         shift = isl_multi_aff_zero(isl_space_copy(space));
2307
2308         for (i = 0; i < tile->n; ++i) {
2309                 struct gpu_array_bound *bound = &tile->bound[i];
2310                 isl_val *stride_i;
2311                 isl_aff *shift_i;
2312
2313                 if (tile->bound[i].shift) {
2314                         stride_i = isl_val_copy(bound->stride);
2315                         shift_i = isl_aff_copy(bound->shift);
2316                 } else {
2317                         stride_i = isl_val_one(ctx);
2318                         shift_i = isl_aff_zero_on_domain(
2319                                         isl_local_space_copy(ls));
2320                 }
2321
2322                 stride = isl_multi_val_set_val(stride, i, stride_i);
2323                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
2324         }
2325         isl_local_space_free(ls);
2326
2327         shift = isl_multi_aff_pullback_multi_aff(shift,
2328                                     isl_multi_aff_copy(insert_array));
2329
2330         tiling = isl_multi_aff_range_map(isl_space_copy(space));
2331         tiling = isl_multi_aff_add(tiling, shift);
2332         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
2333
2334         return tiling;
2335 }
2336
2337 /* Compute a tiling for the array reference group "group".
2338  *
2339  * The tiling is of the form
2340  *
2341  *      { [D[i] -> A[a]] -> T[t] }
2342  *
2343  * where D represents the first shared_len schedule dimensions,
2344  * A represents the global array and T represents the shared or
2345  * private memory tile.  The name of T is the name of the local
2346  * array.
2347  *
2348  * If there is any stride in the accesses, then the mapping is
2349  *
2350  *      t = (a + shift(i))/stride - lb(i)
2351  *
2352  * otherwise, it is simply
2353  *
2354  *      t = a - lb(i)
2355  */
2356 static void compute_group_tiling(struct gpu_array_ref_group *group)
2357 {
2358         int i;
2359         struct gpu_array_tile *tile;
2360         struct gpu_array_info *array = group->array;
2361         isl_space *space;
2362         isl_multi_aff *tiling, *lb, *insert_array;
2363         isl_printer *p;
2364         char *local_name;
2365
2366         tile = group->private_tile;
2367         if (!tile)
2368                 tile = group->shared_tile;
2369         if (!tile)
2370                 return;
2371
2372         space = isl_map_get_space(group->access);
2373         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
2374
2375         for (i = 0; i < tile->n; ++i)
2376                 if (tile->bound[i].shift)
2377                         break;
2378
2379         if (i < tile->n)
2380                 tiling = strided_tile(tile, space, insert_array);
2381         else
2382                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
2383
2384         lb = isl_multi_aff_zero(space);
2385         for (i = 0; i < tile->n; ++i) {
2386                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
2387                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
2388         }
2389         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
2390
2391         tiling = isl_multi_aff_sub(tiling, lb);
2392
2393         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
2394         p = print_array_name(p, group);
2395         local_name = isl_printer_get_str(p);
2396         isl_printer_free(p);
2397         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
2398         free(local_name);
2399
2400         tile->tiling = tiling;
2401 }
2402
2403 /* Compute a tiling for all the array reference groups.
2404  */
2405 static void compute_group_tilings(struct gpu_gen *gen)
2406 {
2407         int i, j;
2408
2409         for (i = 0; i < gen->prog->n_array; ++i) {
2410                 struct gpu_array_info *array = &gen->prog->array[i];
2411
2412                 for (j = 0; j < array->n_group; ++j)
2413                         compute_group_tiling(array->groups[j]);
2414         }
2415 }
2416
2417 /* Fill up the groups array with singleton groups, i.e., one group
2418  * per reference, initializing the array, access, write, n_ref and refs fields.
2419  * In particular the access field is initialized to the scheduled
2420  * access relation of the array reference.
2421  *
2422  * Return the number of elements initialized, i.e., the number of
2423  * active references in the current kernel.
2424  */
2425 static int populate_array_references(struct gpu_array_info *array,
2426         __isl_keep isl_union_map *sched, struct gpu_array_ref_group **groups)
2427 {
2428         int i;
2429         int n;
2430         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2431
2432         n = 0;
2433         for (i = 0; i < array->n_ref; ++i) {
2434                 isl_union_map *umap;
2435                 isl_map *map;
2436                 struct gpu_array_ref_group *group;
2437                 struct gpu_stmt_access *access = array->refs[i];
2438
2439                 map = isl_map_copy(access->access);
2440                 umap = isl_union_map_from_map(map);
2441                 umap = isl_union_map_apply_domain(umap,
2442                                 isl_union_map_copy(sched));
2443
2444                 if (isl_union_map_is_empty(umap)) {
2445                         isl_union_map_free(umap);
2446                         continue;
2447                 }
2448
2449                 map = isl_map_from_union_map(umap);
2450                 map = isl_map_detect_equalities(map);
2451
2452                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2453                 assert(group);
2454                 group->array = array;
2455                 group->access = map;
2456                 group->write = access->write;
2457                 group->exact_write = access->exact_write;
2458                 group->refs = &array->refs[i];
2459                 group->n_ref = 1;
2460
2461                 groups[n++] = group;
2462         }
2463
2464         return n;
2465 }
2466
2467 /* If group->n_ref == 1, then group->refs was set by
2468  * populate_array_references to point directly into
2469  * group->array->refs and should not be freed.
2470  * If group->n_ref > 1, then group->refs was set by join_groups
2471  * to point to a newly allocated array.
2472  */
2473 static void free_array_ref_group(struct gpu_array_ref_group *group)
2474 {
2475         if (!group)
2476                 return;
2477         free_tile(group->shared_tile);
2478         free_tile(group->private_tile);
2479         isl_map_free(group->access);
2480         if (group->n_ref > 1)
2481                 free(group->refs);
2482         free(group);
2483 }
2484
2485 /* Given a map where the input dimensions represent the tile loops,
2486  * eliminate the innermost of those that have a fixed value
2487  * until we reach one that does not (obviously) have a fixed value.
2488  */
2489 static __isl_give isl_map *eliminate_fixed_inner_loops(
2490         __isl_take isl_map *access)
2491 {
2492         int i, n;
2493
2494         n = isl_map_dim(access, isl_dim_in);
2495
2496         for (i = n - 1; i >= 0; --i) {
2497                 if (!map_plain_is_fixed(access, isl_dim_in, i))
2498                         break;
2499                 access = isl_map_eliminate(access, isl_dim_in, i, 1);
2500         }
2501         return access;
2502 }
2503
2504 /* Check if the access relations of group1 and group2 overlap within
2505  * the innermost loop.  In particular, ignore any inner dimension
2506  * with a fixed value.
2507  * The copying to and from shared memory will be performed within
2508  * the innermost actual loop so we are only allowed to consider
2509  * the dimensions up to that innermost loop while checking whether
2510  * two access relations overlap.
2511  */
2512 static int accesses_overlap(struct gpu_array_ref_group *group1,
2513         struct gpu_array_ref_group *group2)
2514 {
2515         int empty;
2516         isl_map *access1, *access2;
2517
2518         access1 = isl_map_copy(group1->access);
2519         access1 = eliminate_fixed_inner_loops(access1);
2520         access2 = isl_map_copy(group2->access);
2521         access2 = eliminate_fixed_inner_loops(access2);
2522         access1 = isl_map_intersect(access1, access2);
2523         empty = isl_map_is_empty(access1);
2524         isl_map_free(access1);
2525
2526         return !empty;
2527 }
2528
2529 /* Combine the given two groups into a single group, containing
2530  * the references of both groups.
2531  */
2532 static struct gpu_array_ref_group *join_groups(
2533         struct gpu_array_ref_group *group1,
2534         struct gpu_array_ref_group *group2)
2535 {
2536         int i;
2537         isl_ctx *ctx;
2538         struct gpu_array_ref_group *group;
2539
2540         ctx = isl_map_get_ctx(group1->access);
2541         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
2542         assert(group);
2543         group->array = group1->array;
2544         group->access = isl_map_union(isl_map_copy(group1->access),
2545                                         isl_map_copy(group2->access));
2546         group->write = group1->write || group2->write;
2547         group->exact_write = group1->exact_write && group2->exact_write;
2548         group->n_ref = group1->n_ref + group2->n_ref;
2549         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
2550                                         group->n_ref);
2551         assert(group->refs);
2552         for (i = 0; i < group1->n_ref; ++i)
2553                 group->refs[i] = group1->refs[i];
2554         for (i = 0; i < group2->n_ref; ++i)
2555                 group->refs[group1->n_ref + i] = group2->refs[i];
2556
2557         return group;
2558 }
2559
/* Merge "group1" and "group2" into a newly created group and
 * release both input groups.  The caller must no longer use
 * "group1" or "group2" afterwards.
 */
static struct gpu_array_ref_group *join_groups_and_free(
        struct gpu_array_ref_group *group1,
        struct gpu_array_ref_group *group2)
{
        struct gpu_array_ref_group *merged = join_groups(group1, group2);

        free_array_ref_group(group1);
        free_array_ref_group(group2);

        return merged;
}
2574
2575 /* Compute the private and/or shared memory tiles for the array
2576  * reference group "group" of array "array".
2577  *
2578  * If the array is a read-only scalar or if the user requested
2579  * not to use shared or private memory, then we do not need to do anything.
2580  *
2581  * If the array group involves any may writes (that are not must writes),
2582  * then we would have to make sure that we load the data into shared/private
2583  * memory first in case the data is not written by the kernel
2584  * (but still written back out to global memory).
2585  * Since we don't have any such mechanism at the moment, we don't
2586  * compute shared/private tiles for groups involving may writes.
2587  *
2588  * We only try to compute a shared memory tile if there is any reuse
2589  * or if the access is not coalesced.
2590  *
2591  * For computing a private memory tile, we also require that there is
2592  * some reuse.  Moreover, we require that the access is private
2593  * to the thread.  That is, we check that any given array element
2594  * is only accessed by a single thread.
2595  * We compute an access relation that maps the shared tile loop iterators
2596  * and the shared point loop iterators that will be wrapped over the
2597  * threads to the array elements.
2598  * We actually check that those iterators that will be wrapped
2599  * partition the array space.  This check is stricter than necessary
2600  * since several iterations may be mapped onto the same thread
2601  * and then they could be allowed to access the same memory elements,
2602  * but our check does not allow this situation.
2603  *
2604  * We also check that the index expression only depends on parallel
2605  * loops.  That way, we can move those loops innermost and unroll them.
2606  * Again, we use a test that is stricter than necessary.
2607  * We actually check whether the index expression only depends
2608  * on the iterators that are wrapped over the threads.
2609  * These are necessarily parallel, but there may be more parallel loops.
2610  *
2611  * Combining the injectivity of the first test with the single-valuedness
2612  * of the second test, we simply test for bijectivity.
2613  *
2614  * If it turns out we can use registers, we compute the private memory
2615  * tile size using can_tile, after introducing a dependence
2616  * on the thread indices.
2617  */
2618 static void compute_group_bounds_core(struct gpu_gen *gen,
2619         struct gpu_array_ref_group *group)
2620 {
2621         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
2622         isl_union_map *access;
2623         int n_index = group->array->n_index;
2624         int no_reuse;
2625         isl_map *acc;
2626         int use_shared = gen->options->use_shared_memory;
2627         int use_private = gen->options->use_private_memory;
2628
2629         if (!use_shared && !use_private)
2630                 return;
2631         if (gpu_array_is_read_only_scalar(group->array))
2632                 return;
2633         if (!group->exact_write)
2634                 return;
2635
2636         access = group_access_relation(group, 1, 1);
2637         no_reuse = isl_union_map_is_injective(access);
2638
2639         if (use_shared && (!no_reuse || !access_is_coalesced(gen, access))) {
2640                 group->shared_tile = create_tile(ctx, group->array->n_index);
2641                 if (!can_tile(group->access, group->shared_tile))
2642                         group->shared_tile = free_tile(group->shared_tile);
2643         }
2644
2645         if (!use_private || no_reuse) {
2646                 isl_union_map_free(access);
2647                 return;
2648         }
2649
2650         access = isl_union_map_apply_domain(access,
2651                                         isl_union_map_copy(gen->shared_sched));
2652
2653         acc = isl_map_from_union_map(access);
2654
2655         if (!access_is_bijective(gen, acc)) {
2656                 isl_map_free(acc);
2657                 return;
2658         }
2659
2660         group->private_tile = create_tile(gen->ctx, n_index);
2661         acc = isl_map_apply_domain(acc, isl_map_copy(gen->privatization));
2662         if (!can_tile(acc, group->private_tile))
2663                 group->private_tile = free_tile(group->private_tile);
2664
2665         isl_map_free(acc);
2666 }
2667
/* Compute the private and/or shared memory tiles for the array
 * reference group "group" and set last_shared.
 *
 * The tiles are computed by compute_group_bounds_core, which may
 * leave both of them unset.  set_last_shared is called afterwards
 * in either case.
 */
static void compute_group_bounds(struct gpu_gen *gen,
        struct gpu_array_ref_group *group)
{
        compute_group_bounds_core(gen, group);
        set_last_shared(gen, group);
}
2677
2678 /* If two groups have overlapping access relations (as determined by
2679  * the "overlap" function) and if one of them involves a write,
2680  * then merge the two groups into one.
2681  * If "compute_bounds" is set, then call compute_group_bounds
2682  * on the merged groups.
2683  *
2684  * Return the updated number of groups.
2685  */
2686 static int group_writes(struct gpu_gen *gen,
2687         int n, struct gpu_array_ref_group **groups,
2688         int (*overlap)(struct gpu_array_ref_group *group1,
2689                 struct gpu_array_ref_group *group2), int compute_bounds)
2690 {
2691         int i, j;
2692
2693         for (i = 0; i < n; ++i) {
2694                 for (j = n - 1; j > i; --j) {
2695                         if (!groups[i]->write && !groups[j]->write)
2696                                 continue;
2697
2698                         if (!overlap(groups[i], groups[j]))
2699                                 continue;
2700
2701                         groups[i] = join_groups_and_free(groups[i], groups[j]);
2702                         if (compute_bounds)
2703                                 compute_group_bounds(gen, groups[i]);
2704                         if (j != n - 1)
2705                                 groups[j] = groups[n - 1];
2706                         n--;
2707                 }
2708         }
2709
2710         return n;
2711 }
2712
2713 /* If two groups have overlapping access relations (within the innermost
2714  * loop) and if one of them involves a write, then merge the two groups
2715  * into one.
2716  *
2717  * Return the updated number of groups.
2718  */
2719 static int group_overlapping_writes(struct gpu_gen *gen,
2720         int n, struct gpu_array_ref_group **groups)
2721 {
2722         return group_writes(gen, n, groups, &accesses_overlap, 0);
2723 }
2724
2725 /* Check if the access relations of group1 and group2 overlap within
2726  * the outermost min(group1->last_shared, group2->last_shared) loops.
2727  */
2728 static int last_shared_accesses_overlap(struct gpu_array_ref_group *group1,
2729         struct gpu_array_ref_group *group2)
2730 {
2731         int last_shared;
2732         int dim;
2733         int empty;
2734         isl_map *map_i, *map_j, *map;
2735
2736         last_shared = group1->last_shared;
2737         if (group2->last_shared < last_shared)
2738                 last_shared = group2->last_shared;
2739         map_i = isl_map_copy(group1->access);
2740         dim = isl_map_dim(map_i, isl_dim_in);
2741         map_i = isl_map_eliminate(map_i, isl_dim_in,
2742                                 last_shared + 1, dim - (last_shared + 1));
2743         map_j = isl_map_copy(group2->access);
2744         map_j = isl_map_eliminate(map_j, isl_dim_in,
2745                                 last_shared + 1, dim - (last_shared + 1));
2746         map = isl_map_intersect(map_i, map_j);
2747         empty = isl_map_is_empty(map);
2748         isl_map_free(map);
2749
2750         return !empty;
2751 }
2752
2753 /* If two groups have overlapping access relations (within the outer
2754  * last_shared loops) and if one of them involves a write,
2755  * then merge the two groups into one.
2756  *
2757  * Return the updated number of groups.
2758  */
2759 static int group_last_shared_overlapping_writes(struct gpu_gen *gen, int n,
2760         struct gpu_array_ref_group **groups)
2761 {
2762         return group_writes(gen, n, groups, &last_shared_accesses_overlap, 1);
2763 }
2764
2765 /* Is the size of the tile specified by "tile" smaller than the sum of
2766  * the sizes of the tiles specified by "tile1" and "tile2"?
2767  */
2768 static int smaller_tile(isl_ctx *ctx, struct gpu_array_tile *tile,
2769         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
2770 {
2771         int smaller;
2772         isl_val *size, *size1, *size2;
2773
2774         size = tile_size(ctx, tile);
2775         size1 = tile_size(ctx, tile1);
2776         size2 = tile_size(ctx, tile2);
2777
2778         size = isl_val_sub(size, size1);
2779         size = isl_val_sub(size, size2);
2780         smaller = isl_val_is_neg(size);
2781
2782         isl_val_free(size);
2783
2784         return smaller;
2785 }
2786
/* Given an initial grouping of array references and shared memory tiles
 * for each group that allows for a shared memory tile, merge two groups
 * if both have a shared memory tile, the merged group also has
 * a shared memory tile and the size of the tile for the merge group
 * is smaller than the sum of the tile sizes of the individual groups.
 *
 * Only pairs of groups with intersecting access relations are considered.
 * A candidate merged group is first constructed without touching
 * the original groups, such that it can simply be discarded
 * if the merge turns out not to be beneficial.
 *
 * If merging two groups decreases the "last_shared" dimension of
 * one or both of the two groups, then we need to check for overlapping
 * writes again.
 *
 * Return the number of groups after merging.
 */
static int group_common_shared_memory_tile(struct gpu_gen *gen,
        struct gpu_array_info *array, int n,
        struct gpu_array_ref_group **groups)
{
        int i, j;
        int recompute_overlap = 0;
        isl_ctx *ctx = isl_space_get_ctx(array->space);

        for (i = 0; i < n; ++i) {
                if (!groups[i]->shared_tile)
                        continue;
                for (j = n - 1; j > i; --j) {
                        isl_map *map;
                        int empty;
                        struct gpu_array_ref_group *group;

                        if (!groups[j]->shared_tile)
                                continue;

                        /* Skip pairs that do not access any common element. */
                        map = isl_map_intersect(isl_map_copy(groups[i]->access),
                                            isl_map_copy(groups[j]->access));
                        empty = isl_map_is_empty(map);
                        isl_map_free(map);

                        if (empty)
                                continue;

                        /* Tentatively merge; groups[i] and groups[j]
                         * are kept until the merge has been accepted.
                         */
                        group = join_groups(groups[i], groups[j]);
                        compute_group_bounds(gen, group);
                        if (!group->shared_tile ||
                            !smaller_tile(ctx, group->shared_tile,
                                        groups[i]->shared_tile,
                                        groups[j]->shared_tile)) {
                                free_array_ref_group(group);
                                continue;
                        }

                        if (group->last_shared < groups[i]->last_shared ||
                            group->last_shared < groups[j]->last_shared)
                                recompute_overlap = 1;
                        /* Accept the merge: the merged group replaces
                         * groups[i] and the last group is moved into
                         * the position of groups[j].
                         */
                        free_array_ref_group(groups[i]);
                        free_array_ref_group(groups[j]);
                        groups[i] = group;
                        if (j != n - 1)
                                groups[j] = groups[n - 1];
                        n--;
                }
        }

        if (recompute_overlap)
                n = group_last_shared_overlapping_writes(gen, n, groups);
        return n;
}
2852
2853 /* Set array->n_group and array->groups to n and groups.
2854  *
2855  * Additionally, set the "nr" field of each group
2856  * and the "group" field of each reference in each group.
2857  */
2858 static void set_array_groups(struct gpu_array_info *array,
2859         int n, struct gpu_array_ref_group **groups)
2860 {
2861         int i, j;
2862
2863         array->n_group = n;
2864         array->groups = groups;
2865
2866         for (i = 0; i < n; ++i) {
2867                 groups[i]->nr = i;
2868
2869                 for (j = 0; j < groups[i]->n_ref; ++j)
2870                         groups[i]->refs[j]->group = i;
2871         }
2872 }
2873
2874 /* Group array references that should be considered together when
2875  * deciding whether to access them from private, shared or global memory.
2876  *
2877  * In particular, if two array references overlap and if one of them
2878  * is a write, then the two references are grouped together.
2879  * We first perform an initial grouping based only on the access relation.
2880  * After computing shared and private memory tiles, we check for
2881  * overlapping writes again, but this time taking into account
2882  * the "last_shared" property.
2883  *
2884  * Furthermore, if two groups admit a shared memory tile and if the
2885  * combination of the two also admits a shared memory tile, we merge
2886  * the two groups.
2887  *
2888  * If the array contains structures, then there is no need to compute
2889  * reference groups since we do not map such arrays to private or shared
2890  * memory.
2891  */
2892 static void group_array_references(struct gpu_gen *gen,
2893         struct gpu_array_info *array, __isl_keep isl_union_map *sched)
2894 {
2895         int i;
2896         int n;
2897         isl_ctx *ctx = isl_union_map_get_ctx(sched);
2898         struct gpu_array_ref_group **groups;
2899
2900         if (array->has_compound_element)
2901                 return;
2902
2903         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
2904                                         array->n_ref);
2905         assert(groups);
2906
2907         n = populate_array_references(array, sched, groups);
2908
2909         n = group_overlapping_writes(gen, n, groups);
2910
2911         for (i = 0; i < n; ++i)
2912                 compute_group_bounds(gen, groups[i]);
2913
2914         n = group_last_shared_overlapping_writes(gen, n, groups);
2915
2916         n = group_common_shared_memory_tile(gen, array, n, groups);
2917
2918         set_array_groups(array, n, groups);
2919 }
2920
2921 /* Take tiled_sched, project it onto the shared tile loops and
2922  * the loops that will be wrapped over the threads and
2923  * store the result in gen->shared_sched.
2924  * Also compute a projection that projects out the loops that will be
2925  * wrapped over the threads and store this projection in gen->shared_proj.
2926  */
2927 static void compute_shared_sched(struct gpu_gen *gen)
2928 {
2929         isl_space *dim;
2930         isl_map *proj;
2931         isl_set *par;
2932         isl_union_map *sched;
2933
2934         sched = isl_union_map_copy(gen->tiled_sched);
2935
2936         dim = isl_union_map_get_space(sched);
2937         proj = projection(dim, gen->tiled_len, gen->shared_len + gen->n_block);
2938         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
2939
2940         dim = isl_union_map_get_space(sched);
2941         proj = projection(dim, gen->shared_len + gen->n_block, gen->shared_len);
2942
2943         gen->shared_sched = sched;
2944         gen->shared_proj = isl_union_map_from_map(proj);
2945 }
2946
2947 /* Group references of all arrays in the program.
2948  */
2949 static void group_references(struct gpu_gen *gen)
2950 {
2951         int i;
2952         isl_union_map *sched;
2953
2954         sched = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
2955                                           isl_union_map_copy(gen->shared_proj));
2956
2957         for (i = 0; i < gen->prog->n_array; ++i)
2958                 group_array_references(gen, &gen->prog->array[i], sched);
2959
2960         isl_union_map_free(sched);
2961 }
2962
2963 /* Free all array information that is local to the current kernel.
2964  */
2965 static void free_local_array_info(struct gpu_gen *gen)
2966 {
2967         int i, j;
2968
2969         for (i = 0; i < gen->prog->n_array; ++i) {
2970                 struct gpu_array_info *array = &gen->prog->array[i];
2971
2972                 for (j = 0; j < array->n_group; ++j)
2973                         free_array_ref_group(array->groups[j]);
2974                 free(array->groups);
2975         }
2976 }
2977
2978 /* Compute the size of a bounding box around the origin and "set",
2979  * where "set" is assumed to contain only non-negative elements.
2980  * In particular, compute the maximal value of "set" in each direction
2981  * and add one.
2982  */
2983 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
2984         __isl_keep isl_set *context)
2985 {
2986         int i, n;
2987         isl_multi_pw_aff *mpa;
2988
2989         n = isl_set_dim(set, isl_dim_set);
2990         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
2991         for (i = 0; i < n; ++i) {
2992                 isl_space *space;
2993                 isl_aff *one;
2994                 isl_pw_aff *bound;
2995
2996                 bound = isl_set_dim_max(isl_set_copy(set), i);
2997                 bound = isl_pw_aff_coalesce(bound);
2998                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
2999
3000                 space = isl_pw_aff_get_domain_space(bound);
3001                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
3002                 one = isl_aff_add_constant_si(one, 1);
3003                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
3004                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
3005         }
3006         isl_set_free(set);
3007
3008         return mpa;
3009 }
3010
3011 /* Compute the effective grid size as a list of the sizes in each dimension.
3012  *
3013  * The grid size specified by the user or set by default
3014  * in read_grid_sizes() and applied in tile_schedule(),
3015  * may be too large for the given code in the sense that
3016  * it may contain blocks that don't need to execute anything.
3017  * We therefore don't return this grid size, but instead the
3018  * smallest grid size that ensures that all blocks that actually
3019  * execute code are included in the grid.
3020  *
3021  * We first extract a description of the grid, i.e., the possible values
3022  * of the block ids, from gen->tiled_sched.
3023  * The block ids are parameters in gen->tiled_sched.
3024  * We simply need to change them into set dimensions.
3025  *
3026  * Then, for each block dimension, we compute the maximal value of the block id
3027  * and add one.
3028  */
3029 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
3030         struct ppcg_kernel *kernel)
3031 {
3032         int i;
3033         isl_set *grid;
3034
3035         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
3036         grid = isl_set_from_params(grid);
3037         grid = isl_set_add_dims(grid, isl_dim_set, gen->n_grid);
3038         for (i = 0; i < gen->n_grid; ++i) {
3039                 int pos;
3040                 char name[20];
3041
3042                 snprintf(name, sizeof(name), "b%d", i);
3043                 pos = isl_set_find_dim_by_name(grid, isl_dim_param, name);
3044                 assert(pos >= 0);
3045                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
3046                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
3047         }
3048
3049         return extract_size(grid, kernel->context);
3050 }
3051
3052 /* Compute the size of a fixed bounding box around the origin and "set",
3053  * where "set" is assumed to contain only non-negative elements,
3054  * and store the results in "size".
3055  * In particular, compute the maximal value of "set" in each direction
3056  * and add one.
3057  */
3058 static void extract_fixed_size(__isl_take isl_set *set, int *size)
3059 {
3060         int i, n;
3061         isl_local_space *ls;
3062         isl_aff *obj;
3063
3064         n = isl_set_dim(set, isl_dim_set);
3065         ls = isl_local_space_from_space(isl_set_get_space(set));
3066         obj = isl_aff_zero_on_domain(ls);
3067         for (i = 0; i < n; ++i) {
3068                 isl_val *max;
3069
3070                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
3071                 max = isl_set_max_val(set, obj);
3072                 size[i] = isl_val_get_num_si(max) + 1;
3073                 isl_val_free(max);
3074                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
3075         }
3076         isl_aff_free(obj);
3077         isl_set_free(set);
3078 }
3079
3080 /* Compute the effective block size as a list of the sizes in each dimension
3081  * and store the sizes in kernel->block_dim.
3082  *
3083  * The block size specified by the user or set by default
3084  * in read_block_sizes() and applied in thread_tile_schedule(),
3085  * may be too large for the given code in the sense that
3086  * it may contain threads that don't need to execute anything.
3087  * We therefore don't store this block size in kernel->block_dim,
3088  * but instead the smallest block size that ensures that all threads
3089  * that actually execute code are included in the block.
3090  *
3091  * The current implementation eliminates all parameters, ensuring
3092  * that the size is a fixed constant in each dimension.
3093  * In principle we could also compute parametric sizes.
3094  * We would have to make sure to project out all b%d and t%d parameters,
3095  * however.
3096  */
3097 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3098 {
3099         int i;
3100         int nparam;
3101         isl_set *block;
3102         isl_multi_pw_aff *mpa;
3103
3104         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
3105         block = isl_set_from_params(block);
3106         block = isl_set_add_dims(block, isl_dim_set, gen->n_block);
3107         kernel->n_block = gen->n_block;
3108         for (i = 0; i < gen->n_block; ++i) {
3109                 int pos;
3110                 char name[20];
3111
3112                 snprintf(name, sizeof(name), "t%d", i);
3113                 pos = isl_set_find_dim_by_name(block, isl_dim_param, name);
3114                 assert(pos >= 0);
3115                 block = isl_set_equate(block, isl_dim_param, pos,
3116                                         isl_dim_set, i);
3117         }
3118         nparam = isl_set_dim(block, isl_dim_param);
3119         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
3120
3121         extract_fixed_size(block, kernel->block_dim);
3122 }
3123
3124 void ppcg_kernel_free(void *user)
3125 {
3126         struct ppcg_kernel *kernel = user;
3127         int i;
3128
3129         if (!kernel)
3130                 return;
3131
3132         isl_multi_pw_aff_free(kernel->grid_size);
3133         isl_set_free(kernel->context);
3134         isl_union_set_free(kernel->arrays);
3135         isl_space_free(kernel->space);
3136         isl_ast_node_free(kernel->tree);
3137
3138         for (i = 0; i < kernel->n_array; ++i)
3139                 isl_pw_aff_list_free(kernel->array[i].bound);
3140         free(kernel->array);
3141
3142         for (i = 0; i < kernel->n_var; ++i) {
3143                 free(kernel->var[i].name);
3144                 isl_vec_free(kernel->var[i].size);
3145         }
3146         free(kernel->var);
3147
3148         free(kernel);
3149 }
3150
3151 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
3152         struct ppcg_kernel_var *var)
3153 {
3154         int j;
3155         struct gpu_array_tile *tile;
3156         isl_printer *p;
3157         char *name;
3158
3159         var->array = group->array;
3160
3161         tile = group->private_tile;
3162         var->type = ppcg_access_private;
3163         if (!tile) {
3164                 tile = group->shared_tile;
3165                 var->type = ppcg_access_shared;
3166         }
3167
3168         p = isl_printer_to_str(ctx);
3169         p = print_array_name(p, group);
3170         var->name = isl_printer_get_str(p);
3171         isl_printer_free(p);
3172
3173         var->size = isl_vec_alloc(ctx, group->array->n_index);
3174
3175         for (j = 0; j < group->array->n_index; ++j)
3176                 var->size = isl_vec_set_element_val(var->size, j,
3177                                             isl_val_copy(tile->bound[j].size));
3178 }
3179
3180 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
3181 {
3182         int i, j, n;
3183
3184         n = 0;
3185         for (i = 0; i < gen->prog->n_array; ++i) {
3186                 struct gpu_array_info *array = &gen->prog->array[i];
3187
3188                 for (j = 0; j < array->n_group; ++j) {
3189                         struct gpu_array_ref_group *group = array->groups[j];
3190                         if (group->private_tile || group->shared_tile)
3191                                 ++n;
3192                 }
3193         }
3194
3195         kernel->n_var = n;
3196         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
3197         assert(kernel->var);
3198
3199         n = 0;
3200         for (i = 0; i < gen->prog->n_array; ++i) {
3201                 struct gpu_array_info *array = &gen->prog->array[i];
3202
3203                 for (j = 0; j < array->n_group; ++j) {
3204                         struct gpu_array_ref_group *group = array->groups[j];
3205                         if (!group->private_tile && !group->shared_tile)
3206                                 continue;
3207                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
3208                         ++n;
3209                 }
3210         }
3211 }
3212
3213 /* The sizes of the arrays on the host that have been computed by
3214  * extract_array_info may depend on the parameters.  Use the extra
3215  * constraints on the parameters that are valid at "host_domain"
3216  * to simplify these expressions and store the results in kernel->array.
3217  *
3218  * We only need these localized bounds for arrays that are accessed
3219  * by the current kernel.  If we have found at least one reference group
3220  * then the array is accessed by the kernel.  If the array has compound
3221  * elements then we skipped the construction of array reference groups.
3222  */
3223 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
3224         __isl_keep isl_set *host_domain)
3225 {
3226         int i, j;
3227         isl_set *context;
3228
3229         kernel->array = isl_calloc_array(gen->ctx,
3230                             struct gpu_local_array_info, gen->prog->n_array);
3231         assert(kernel->array);
3232         kernel->n_array = gen->prog->n_array;
3233
3234         context = isl_set_copy(host_domain);
3235         context = isl_set_params(context);
3236
3237         for (i = 0; i < gen->prog->n_array; ++i) {
3238                 struct gpu_array_info *array = &gen->prog->array[i];
3239                 isl_pw_aff_list *local;
3240
3241                 if (array->n_group == 0 && !array->has_compound_element)
3242                         continue;
3243
3244                 local = isl_pw_aff_list_alloc(gen->ctx, array->n_index);
3245
3246                 for (j = 0; j < array->n_index; ++j) {
3247                         isl_pw_aff *pwaff;
3248
3249                         pwaff = isl_pw_aff_copy(array->bound[j]);
3250                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
3251                         local = isl_pw_aff_list_add(local, pwaff);
3252                 }
3253
3254                 kernel->array[i].bound = local;
3255         }
3256         isl_set_free(context);
3257 }
3258
3259 /* Find the element in gen->stmt that has the given "id".
3260  * Return NULL if no such gpu_stmt can be found.
3261  */
3262 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
3263 {
3264         int i;
3265
3266         for (i = 0; i < prog->n_stmts; ++i) {
3267                 if (id == prog->stmts[i].id)
3268                         break;
3269         }
3270
3271         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
3272 }
3273
3274 /* Set gen->tile_len and gen->n_parallel to those of the statement
3275  * affected by the first map (part of the schedule)
3276  * on which this function is called.
3277  * Because of the way the schedule is constructed, the other statements
3278  * in the list, if any, should have the same values for these properties.
3279  */
3280 static int extract_tile_len(__isl_take isl_map *map, void *user)
3281 {
3282         struct gpu_gen *gen = (struct gpu_gen *) user;
3283         isl_id *id;
3284         struct gpu_stmt *stmt;
3285
3286         id = isl_map_get_tuple_id(map, isl_dim_in);
3287         stmt = find_stmt(gen->prog, id);
3288         isl_id_free(id);
3289
3290         isl_map_free(map);
3291
3292         if (!stmt)
3293                 isl_die(gen->ctx, isl_error_unknown,
3294                         "statement not found", return -1);
3295
3296         gen->tile_len = stmt->tile_len;
3297         gen->n_parallel = stmt->n_parallel;
3298
3299         return -1;
3300 }
3301
3302 void ppcg_kernel_stmt_free(void *user)
3303 {
3304         int i;
3305         struct ppcg_kernel_stmt *stmt = user;
3306
3307         if (!stmt)
3308                 return;
3309
3310         switch (stmt->type) {
3311         case ppcg_kernel_copy:
3312                 isl_ast_expr_free(stmt->u.c.index);
3313                 isl_ast_expr_free(stmt->u.c.local_index);
3314                 break;
3315         case ppcg_kernel_domain:
3316                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
3317                 break;
3318         case ppcg_kernel_sync:
3319                 break;
3320         }
3321
3322         free(stmt);
3323 }
3324
3325 /* Set the options of "context" to
3326  *
3327  *      { space -> [x] : x >= first }
3328  */
3329 static __isl_give isl_ast_build *set_unroll(
3330         __isl_take isl_ast_build *build, __isl_take isl_space *space,
3331         int first)
3332 {
3333         isl_ctx *ctx;
3334         isl_map *unroll;
3335         isl_union_map *opt;
3336
3337         ctx = isl_ast_build_get_ctx(build);
3338
3339         space = isl_space_from_domain(space);
3340         space = isl_space_add_dims(space, isl_dim_out, 1);
3341         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
3342         unroll = isl_map_universe(space);
3343         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
3344         opt = isl_union_map_from_map(unroll);
3345
3346         build = isl_ast_build_set_options(build, opt);
3347
3348         return build;
3349 }
3350
3351 /* Return a list of isl_ids of the form "prefix%d".
3352  */
3353 static __isl_give isl_id_list *generate_names(isl_ctx *ctx,
3354         int n, const char *prefix)
3355 {
3356         int i;
3357         char name[10];
3358         isl_id_list *names;
3359
3360         names = isl_id_list_alloc(ctx, n);
3361         for (i = 0; i < n; ++i) {
3362                 isl_id *id;
3363
3364                 snprintf(name, sizeof(name), "%s%d", prefix, i);
3365                 id = isl_id_alloc(ctx, name, NULL);
3366                 names = isl_id_list_add(names, id);
3367         }
3368
3369         return names;
3370 }
3371
3372 /* Extend the schedule "schedule" with the part of "extension"
3373  * starting at "first" up to "len".
3374  */
3375 static __isl_give isl_union_map *extend_schedule(
3376         __isl_take isl_union_map *schedule,
3377         __isl_take isl_union_map *extension, int first, int len)
3378 {
3379         isl_space *space;
3380         isl_map *proj;
3381         isl_union_map *umap;
3382         isl_set *set;
3383
3384         space = isl_union_map_get_space(schedule);
3385         space = isl_space_set_from_params(space);
3386         space = isl_space_add_dims(space, isl_dim_set, len);
3387         proj = isl_set_identity(isl_set_universe(space));
3388         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
3389         extension = isl_union_map_apply_range(extension,
3390                                                 isl_union_map_from_map(proj));
3391
3392         schedule = isl_union_map_range_product(schedule, extension);
3393
3394         return schedule;
3395 }
3396
3397 /* Return the gpu_stmt_access in the list "accesses" that corresponds
3398  * to "ref_id".
3399  */
3400 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
3401         __isl_keep isl_id *ref_id)
3402 {
3403         struct gpu_stmt_access *access;
3404
3405         for (access = accesses; access; access = access->next)
3406                 if (access->ref_id == ref_id)
3407                         return access;
3408
3409         return NULL;
3410 }
3411
3412 /* Return the index of the array called "name" in the list of arrays.
3413  */
3414 static int find_array_index(struct gpu_gen *gen, const char *name)
3415 {
3416         int i;
3417
3418         for (i = 0; i < gen->prog->n_array; ++i)
3419                 if (!strcmp(name, gen->prog->array[i].name))
3420                         return i;
3421
3422         return -1;
3423 }
3424
/* Internal data structure for the index and AST expression transformation
 * callbacks for pet_stmt_build_ast_exprs.
 *
 * "gen" is the code generator state; it gives access to the list of
 * arrays of the program and to the current kernel.
 * "accesses" is the list of gpu_stmt_access in the statement.
 * "iterator_map" expresses the statement iterators in terms of
 * the AST loop iterators.
 * "sched2shared" expresses the first shared_len dimensions of
 * the computed schedule in terms of the AST loop iterators.
 *
 * The following fields are set in transform_index and used in transform_expr.
 * "array" is the array that is being accessed.  It is reset to NULL
 * at the start of transform_index and only set if the access refers
 * to a known array.
 * "global" is set if the global array is accessed (rather than
 * shared/private memory).
 * "local_array" refers to information on the array specialized
 * to the current kernel.
 */
struct ppcg_transform_data {
        /* Input to the callbacks. */
        struct gpu_gen *gen;
        struct gpu_stmt_access *accesses;
        isl_pw_multi_aff *iterator_map;
        isl_pw_multi_aff *sched2shared;

        /* Communication from transform_index to transform_expr. */
        struct gpu_array_info *array;
        int global;
        struct gpu_local_array_info *local_array;
};
3451
3452 /* Return the name of the outer array (of structs) accessed by "access".
3453  */
3454 static const char *get_outer_array_name(__isl_keep isl_map *access)
3455 {
3456         isl_space *space;
3457         const char *name;
3458
3459         space = isl_space_range(isl_map_get_space(access));
3460         while (space && isl_space_is_wrapping(space))
3461                 space = isl_space_domain(isl_space_unwrap(space));
3462         name = isl_space_get_tuple_name(space, isl_dim_set);
3463         isl_space_free(space);
3464
3465         return name;
3466 }
3467
/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" expresses the array indices in terms of statement iterators.
 * "ref_id" identifies the access within the statement.
 * "user" points to a struct ppcg_transform_data.
 *
 * We first reformulate "index" in terms of the AST loop iterators.
 * Then we check if we are accessing the global array or
 * a shared/private copy.  In the former case, we simply return
 * the updated index.  If "index" is an affine expression rather
 * than an array access, then we also return the updated index here.
 *
 * If no reference groups have been computed for the array,
 * then we can only be accessing the global array.
 *
 * Otherwise, we apply the tiling to the index.
 * This tiling is of the form
 *
 *      [D -> A] -> T
 *
 * The index is of the form
 *
 *      L -> A
 *
 * We update the tiling to refer to the AST loop iterators
 *
 *      [L -> A] -> T
 *
 * and modify index to keep track of those iterators
 *
 *      L -> [L -> A]
 *
 * Combining these two yields a tiled index expression in terms
 * of the AST loop iterators
 *
 *      L -> T
 *
 * As a side effect, data->array, data->local_array and data->global
 * are set for use by transform_expr.
 */
static __isl_give isl_multi_pw_aff *transform_index(
        __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
        void *user)
{
        struct ppcg_transform_data *data = user;
        struct gpu_stmt_access *access;
        struct gpu_array_ref_group *group;
        struct gpu_array_tile *tile;
        isl_pw_multi_aff *iterator_map;
        int i;
        const char *name;
        isl_space *space;
        isl_multi_pw_aff *tiling;
        isl_pw_multi_aff *pma;
        isl_multi_pw_aff *mpa;

        /* transform_expr leaves the expression untouched as long as
         * no array has been identified.
         */
        data->array = NULL;

        /* Express the index in terms of the AST loop iterators. */
        iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
        index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);

        access = find_access(data->accesses, ref_id);
        if (!access)
                return index;
        /* No array name: nothing to look up. */
        if (!isl_map_has_tuple_name(access->access, isl_dim_out))
                return index;

        name = get_outer_array_name(access->access);
        i = find_array_index(data->gen, name);
        if (i < 0)
                isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
                        "cannot find array",
                        return isl_multi_pw_aff_free(index));
        data->array = &data->gen->prog->array[i];
        data->local_array = &data->gen->kernel->array[i];

        /* No reference group: the access can only be to the global array. */
        if (access->group < 0) {
                data->global = 1;
                return index;
        }

        /* Prefer the private tile over the shared tile. */
        group = data->array->groups[access->group];
        tile = group->private_tile;
        if (!tile)
                tile = group->shared_tile;
        data->global = !tile;
        if (!tile)
                return index;

        /* Turn the tiling [D -> A] -> T into [L -> A] -> T
         * by plugging in the product of sched2shared and
         * the identity on A.
         */
        space = isl_space_range(isl_multi_pw_aff_get_space(index));
        space = isl_space_map_from_set(space);
        pma = isl_pw_multi_aff_identity(space);
        pma = isl_pw_multi_aff_product(
                        isl_pw_multi_aff_copy(data->sched2shared), pma);
        tiling = isl_multi_pw_aff_from_multi_aff(
                                    isl_multi_aff_copy(tile->tiling));
        tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);

        /* Turn the index L -> A into L -> [L -> A] and
         * combine it with the tiling to obtain L -> T.
         */
        space = isl_space_domain(isl_multi_pw_aff_get_space(index));
        space = isl_space_map_from_set(space);
        mpa = isl_multi_pw_aff_identity(space);
        index = isl_multi_pw_aff_range_product(mpa, index);
        index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);

        return index;
}
3569
3570 /* Dereference "expr" by adding an index [0].
3571  * The original "expr" is assumed not to have any indices.
3572  *
3573  * If "expr" is a member access, then the dereferencing needs
3574  * to be applied to the structure argument of this member access.
3575  */
3576 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
3577 {
3578         isl_ctx *ctx;
3579         isl_ast_expr *res;
3580         isl_ast_expr_list *list;
3581
3582         if (isl_ast_expr_get_op_type(expr) == isl_ast_op_member) {
3583                 isl_ast_expr *arg;
3584
3585                 arg = isl_ast_expr_get_op_arg(expr, 0);
3586                 arg = dereference(arg);
3587                 expr = isl_ast_expr_set_op_arg(expr, 0, arg);
3588
3589                 return expr;
3590         }
3591
3592         ctx = isl_ast_expr_get_ctx(expr);
3593         res = isl_ast_expr_from_val(isl_val_zero(ctx));
3594         list = isl_ast_expr_list_from_ast_expr(res);
3595         res = isl_ast_expr_get_op_arg(expr, 0);
3596         res = isl_ast_expr_access(res, list);
3597         isl_ast_expr_free(expr);
3598
3599         return res;
3600 }
3601
/* Linearize the index expression "expr" based on the array bounds
 * of "array".
 *
 * That is, transform expression
 *
 *      A[i_0][i_1]...[i_n]
 *
 * to
 *
 *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
 *
 * where b_0, b_1, ..., b_n are the bounds on the array.
 * Note that b_0 does not appear in the result.
 *
 * If the base of "expr" is a member access, then the linearization needs
 * to be applied to the structure argument of this member access.
 *
 * "expr" is consumed; "array" is only read.
 */
__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
        struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
{
        int i, n;
        isl_ctx *ctx;
        isl_set *context;
        isl_ast_expr *arg0;
        isl_ast_expr *res;
        isl_ast_expr_list *list;
        isl_ast_build *build;

        /* Argument 0 of the access is the base that is being indexed. */
        arg0 = isl_ast_expr_get_op_arg(expr, 0);
        if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
            isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
                isl_ast_expr *arg;

                /* Linearize the structure argument of the member access
                 * and put the pieces back together.
                 */
                arg = isl_ast_expr_get_op_arg(arg0, 0);
                arg = gpu_local_array_info_linearize_index(array, arg);
                arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
                expr = isl_ast_expr_set_op_arg(expr, 0, arg0);

                return expr;
        }
        isl_ast_expr_free(arg0);

        /* A build with a parameter-free universe context is used
         * to convert the bounds into AST expressions.
         */
        ctx = isl_ast_expr_get_ctx(expr);
        context = isl_set_universe(isl_space_params_alloc(ctx, 0));
        build = isl_ast_build_from_context(context);

        /* Arguments 1 up to n - 1 are the indices i_0, i_1, ...;
         * the index in argument i is paired with bound i - 1.
         */
        n = isl_ast_expr_get_op_n_arg(expr);
        res = isl_ast_expr_get_op_arg(expr, 1);
        for (i = 2; i < n; ++i) {
                isl_pw_aff *bound_i;
                isl_ast_expr *expr_i;

                bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i - 1);
                expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
                res = isl_ast_expr_mul(res, expr_i);
                expr_i = isl_ast_expr_get_op_arg(expr, i);
                res = isl_ast_expr_add(res, expr_i);
        }

        isl_ast_build_free(build);

        /* Rebuild the access with the single linearized index. */
        list = isl_ast_expr_list_from_ast_expr(res);
        res = isl_ast_expr_get_op_arg(expr, 0);
        res = isl_ast_expr_access(res, list);

        isl_ast_expr_free(expr);

        return res;
}
3670
3671 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
3672  *
3673  * If the AST expression refers to a global scalar that is not
3674  * a read-only scalar, then its address was passed to the kernel and
3675  * we need to dereference it.
3676  *
3677  * If the AST expression refers to an access to a global array,
3678  * then we linearize the access exploiting the bounds in data->local_array.
3679  */
3680 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
3681         __isl_keep isl_id *id, void *user)
3682 {
3683         struct ppcg_transform_data *data = user;
3684
3685         if (!data->array)
3686                 return expr;
3687         if (gpu_array_is_read_only_scalar(data->array))
3688                 return expr;
3689         if (!data->global)
3690                 return expr;
3691         if (data->array->n_index == 0)
3692                 return dereference(expr);
3693         if (!data->array->linearize)
3694                 return expr;
3695
3696         return gpu_local_array_info_linearize_index(data->local_array, expr);
3697 }
3698
3699 /* This function is called for each instance of a user statement
3700  * in the kernel.
3701  *
3702  * We attach a struct ppcg_kernel_stmt to the "node", containing
3703  * a computed AST expression for each access.
3704  * These AST expressions are computed from iterator_map,
3705  * which expresses the domain
3706  * elements in terms of the generated loops, and sched2shared,
3707  * which expresses the first shared_len dimensions of the schedule
3708  * computed by PPCG in terms of the generated loops.
3709  */
3710 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
3711         __isl_keep isl_ast_build *build, void *user)
3712 {
3713         struct ppcg_transform_data data;
3714         struct gpu_gen *gen = (struct gpu_gen *) user;
3715         struct ppcg_kernel_stmt *stmt;
3716         isl_id *id;
3717         isl_pw_multi_aff *sched2shared;
3718         isl_map *map;
3719         isl_pw_multi_aff *iterator_map;
3720         isl_ast_expr *expr, *arg;
3721         isl_union_map *schedule;
3722         int i, n;
3723         struct gpu_stmt_access *access;
3724
3725         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
3726         if (!stmt)
3727                 return isl_ast_node_free(node);
3728
3729         expr = isl_ast_node_user_get_expr(node);
3730         arg = isl_ast_expr_get_op_arg(expr, 0);
3731         id = isl_ast_expr_get_id(arg);
3732
3733         schedule = isl_ast_build_get_schedule(build);
3734         map = isl_map_reverse(isl_map_from_union_map(schedule));
3735         iterator_map = isl_pw_multi_aff_from_map(map);
3736         sched2shared = compute_sched_to_shared(gen,
3737                                         isl_pw_multi_aff_copy(iterator_map));
3738
3739         stmt->type = ppcg_kernel_domain;
3740         stmt->u.d.stmt = find_stmt(gen->prog, id);
3741         if (!stmt->u.d.stmt)
3742                 goto error;
3743
3744         data.gen = gen;
3745         data.accesses = stmt->u.d.stmt->accesses;
3746         data.iterator_map = iterator_map;
3747         data.sched2shared = sched2shared;
3748         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
3749                                             build, &transform_index, &data,
3750                                             &transform_expr, &data);
3751
3752         isl_id_free(id);
3753         isl_pw_multi_aff_free(iterator_map);
3754         isl_pw_multi_aff_free(sched2shared);
3755         isl_ast_expr_free(arg);
3756         isl_ast_expr_free(expr);
3757
3758         id = isl_id_alloc(gen->ctx, NULL, stmt);
3759         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
3760         return isl_ast_node_set_annotation(node, id);
3761 error:
3762         isl_id_free(id);
3763         isl_pw_multi_aff_free(iterator_map);
3764         ppcg_kernel_stmt_free(stmt);
3765         isl_pw_multi_aff_free(sched2shared);
3766         return isl_ast_node_free(node);
3767 }
3768
3769 /* This function is called when code has been generated for the shared
3770  * tile loops.  The "schedule" refers only to the original statements.
3771  *
3772  * We extend the schedule with that part of gen->local_sched that hasn't
3773  * been taken into account yet.  This introduces parameters referring
3774  * to thread ids in the schedule, so we add them (with the appropriate
3775  * bounds to the context as well).
3776  * Finally, we set the appropriate unrolling options
3777  * if gen->first_unroll is set.
3778  */
3779 static __isl_give isl_ast_node *create_domain_leaf(
3780         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
3781         void *user)
3782 {
3783         struct gpu_gen *gen = (struct gpu_gen *) user;
3784         isl_space *space;
3785         isl_union_map *sched;
3786         isl_ast_node *tree;
3787         isl_set *set;
3788         isl_id_list *iterators;
3789         int n;
3790
3791         schedule = extend_schedule(schedule,
3792                         isl_union_map_copy(gen->local_sched),
3793                         gen->shared_len, gen->thread_tiled_len);
3794
3795         space = isl_ast_build_get_schedule_space(build);
3796         set = isl_set_universe(space);
3797         set = add_bounded_parameters(set, gen->kernel->n_block,
3798                                         gen->kernel->block_dim, "t");
3799         build = isl_ast_build_restrict(build, set);
3800
3801         n = gen->thread_tiled_len - gen->shared_len;
3802
3803         if (gen->first_unroll >= 0) {
3804                 space = isl_space_set_alloc(gen->ctx, 0, n);
3805                 build = set_unroll(build, space, gen->first_unroll);
3806         }
3807         iterators = generate_names(gen->ctx, n, "c");
3808         build = isl_ast_build_set_iterators(build, iterators);
3809         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
3810         tree = isl_ast_build_ast_from_schedule(build, schedule);
3811         isl_ast_build_free(build);
3812
3813         return tree;
3814 }
3815
/* This function is called for each statement node in the AST of the code
 * for copying to or from shared/private memory.
 * Attach a pointer to a ppcg_kernel_stmt representing the copy
 * statement to the node.
 * The statement name is "read" or "write", depending on whether we are
 * reading from global memory or writing to global memory.
 * The name of the T space is {shared,private}_<array>.
 *
 * The schedule is of the form
 *
 *      type[A -> T] -> L
 *
 * where A refers to a piece of an array and T to the corresponding
 * shifted tile.  We split this schedule into mappings L -> A and L -> T
 * and store the corresponding expressions in stmt->index and stmt->local_index,
 * where stmt points to the ppcg_kernel_stmt that is attached to the node.
 *
 * "user" points to the gpu_gen structure; the array that is being copied
 * is taken from gen->copy_group.
 * If the ppcg_kernel_stmt cannot be allocated, "node" is freed and
 * NULL is returned.
 */
static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
        __isl_keep isl_ast_build *build, void *user)
{
        struct gpu_gen *gen = (struct gpu_gen *) user;
        struct ppcg_kernel_stmt *stmt;
        isl_id *id;
        isl_ast_expr *expr;
        isl_space *space;
        isl_map *access, *local_access, *map;
        isl_pw_multi_aff *pma;
        const char *type;
        int array_index;

        stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
        if (!stmt)
                return isl_ast_node_free(node);

        /* Reverse the schedule to obtain L -> type[A -> T];
         * "space" is the unwrapped space A -> T.
         */
        access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
        type = isl_map_get_tuple_name(access, isl_dim_in);
        stmt->u.c.read = !strcmp(type, "read");
        access = isl_map_reverse(access);
        space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
        local_access = isl_map_copy(access);

        /* L -> A: compose with the projection [A -> T] -> A.
         * The tuple id of the range of "access" is copied to the domain
         * of the projection so that the two can be composed.
         */
        map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
        id = isl_map_get_tuple_id(access, isl_dim_out);
        map = isl_map_set_tuple_id(map, isl_dim_in, id);
        access = isl_map_apply_range(access, map);
        pma = isl_pw_multi_aff_from_map(access);
        expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
        stmt->u.c.index = expr;

        /* L -> T: same construction with the projection [A -> T] -> T. */
        map = isl_map_range_map(isl_map_universe(space));
        id = isl_map_get_tuple_id(local_access, isl_dim_out);
        map = isl_map_set_tuple_id(map, isl_dim_in, id);
        local_access = isl_map_apply_range(local_access, map);
        pma = isl_pw_multi_aff_from_map(local_access);
        expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
        stmt->u.c.local_index = expr;

        /* The position of the array within gen->prog->array also
         * identifies the kernel-local information on the array.
         */
        stmt->u.c.array = gen->copy_group->array;
        array_index = stmt->u.c.array - gen->prog->array;
        stmt->u.c.local_array = &gen->kernel->array[array_index];
        stmt->type = ppcg_kernel_copy;

        /* Hand over ownership of "stmt" to the annotation. */
        id = isl_id_alloc(gen->ctx, NULL, stmt);
        id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
        return isl_ast_node_set_annotation(node, id);
}
3882
/* Given a schedule of the form
 *
 *      [S -> A] -> L
 *
 * (with S the first shared_len dimensions of the computed schedule,
 * A the array and L the schedule corresponding to the generated loops),
 * indicating where to copy the array elements that need to be copied,
 * construct code for performing the copying.
 *
 * "group" is the array reference group that is being copied
 * "type" is either "read" or "write"
 * private is set if copying needs to be performed to/from registers
 *
 * We first construct a mapping to a shifted tile of the array,
 *
 *      [S -> A] -> T(S,A)                                      (1)
 *
 * If private is set, then we also use this mapping as a schedule
 * (which is already thread-specific and will be completely unrolled).
 * Otherwise, we wrap/tile the range over the threads.
 * The result is
 *
 *      [S -> A] -> T'(S,A)
 *
 * Combined with the given schedule, we have
 *
 *      [S -> A] -> [L -> T'(S,A)]                              (2)
 *
 * From the shifted tile mapping, we construct a mapping
 *
 *      [S -> A] -> [A -> T(S,A)]
 *
 * and apply it to the schedule (2), obtaining
 *
 *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
 *
 * Note that we can project out S because it is uniquely defined by L.
 *
 * Both "sched" and "build" are consumed.
 */
static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
        __isl_take isl_map *sched,
        const char *type, struct gpu_array_ref_group *group,
        __isl_take isl_ast_build *build, int private)
{
        isl_space *space;
        isl_ast_node *tree;
        isl_map *schedule, *shift, *map;
        isl_set *set;
        isl_id_list *iterators;
        int n;

        /* Mapping (1). */
        shift = shift_access(group);

        /* T'(S,A): the shifted tile, tiled over the threads
         * unless the copy is private.
         */
        schedule = isl_map_copy(shift);
        schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
        if (!private)
                schedule = tile_access_schedule(gen, schedule);

        /* "n" is the number of dimensions of T'; that many iterator names
         * are generated below.  "set" holds the bounded "t" parameters
         * with which the build is restricted.
         */
        n = isl_map_dim(schedule, isl_dim_out);
        set = isl_set_universe(isl_ast_build_get_schedule_space(build));
        set = add_bounded_parameters(set, gen->kernel->n_block,
                                        gen->kernel->block_dim, "t");

        /* Schedule (2). */
        schedule = isl_map_range_product(sched, schedule);

        /* [S -> A] -> [A -> T(S,A)] */
        space = isl_space_domain(isl_map_get_space(shift));
        map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
        map = isl_map_range_product(map, shift);

        schedule = isl_map_apply_domain(schedule, map);

        schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);

        build = isl_ast_build_restrict(build, set);

        /* attach_copy_stmt retrieves the array from gen->copy_group. */
        gen->copy_group = group;

        /* Private copies are unrolled starting from the first dimension
         * of T'.
         */
        if (private) {
                space = isl_space_range(isl_map_get_space(schedule));
                space = isl_space_range(isl_space_unwrap(space));
                build = set_unroll(build, space, 0);
        }
        iterators = generate_names(gen->ctx, n, "c");
        build = isl_ast_build_set_iterators(build, iterators);
        build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
        tree = isl_ast_build_ast_from_schedule(build,
                                            isl_union_map_from_map(schedule));
        isl_ast_build_free(build);

        return tree;
}
3973
3974 /* Return code for reading into or writing from shared memory
3975  * the given array reference group.
3976  *
3977  * If we are performing a read from global memory to shared memory and
3978  * if the array involved is not a scalar, then we copy
3979  * the entire tile to shared memory.  This may result in some extra
3980  * elements getting copied, but it should lead to simpler code
3981  * (which means that fewer registers may be needed) and less divergence.
3982  *
3983  * Otherwise, we only copy the elements that will be read or have been written
3984  * in the kernel.
3985  *
3986  *
3987  * The input "sched" is of the form.
3988  *
3989  *      type[S -> A] -> L
3990  *
3991  * with S the first shared_len dimensions of the computed schedule,
3992  * A the array and L the schedule correponding to the generated loops.
3993  *
3994  * We first drop "type",
3995  *
3996  *      [S -> A] -> L
3997  *
3998  * If the above conditions are satisfied, we project out A,
3999  * resulting in
4000  *
4001  *      S -> L
4002  *
4003  * and then introduce the group tile [S -> T], resulting in
4004  *
4005  *      [S -> T] -> L
4006  */
4007 static __isl_give isl_ast_node *copy_group_shared_accesses(
4008         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4009         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4010 {
4011         const char *type;
4012         int read;
4013         isl_union_map *access;
4014
4015         type = isl_map_get_tuple_name(sched, isl_dim_in);
4016         read = !strcmp(type, "read");
4017
4018         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4019
4020         if (read && !gpu_array_is_scalar(group->array)) {
4021                 isl_space *space;
4022                 isl_map *map;
4023
4024                 space = isl_space_domain(isl_map_get_space(sched));
4025                 space = isl_space_unwrap(space);
4026                 map = isl_map_domain_map(isl_map_universe(space));
4027                 sched = isl_map_apply_domain(sched, map);
4028
4029                 map = group_tile(group);
4030                 map = isl_map_reverse(isl_map_domain_map(map));
4031                 sched = isl_map_apply_domain(sched, map);
4032         }
4033
4034         return copy_access(gen, sched, type, group, build, 0);
4035 }
4036
4037 /* Return code for reading into or writing from private memory
4038  * the given array reference group.
4039  *
4040  * Let S be the first shared_len dimensions of the computed schedule,
4041  * D the iteration domains, A the array and L the schedule correponding
4042  * to the generated loops.
4043  * "sched" is of the form
4044  *
4045  *      type[S -> A] -> L
4046  *
4047  * where type is either "read" or "write".
4048  * We apply the privatization D -> S(t), with t the thread ids,
4049  * to the access relation D -> A to obtain the privatized access relation
4050  *
4051  *      S(t) -> A
4052  *
4053  * We drop the type from "sched" and intersect with the privatized access
4054  * relation to obtain
4055  *
4056  *      [S(t) -> A] -> L
4057  */
4058 static __isl_give isl_ast_node *copy_group_private_accesses(
4059         struct gpu_gen *gen, struct gpu_array_ref_group *group,
4060         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
4061 {
4062         const char *type;
4063         int read;
4064         isl_union_map *priv;
4065         isl_union_map *access;
4066         isl_map *access_map;
4067
4068         type = isl_map_get_tuple_name(sched, isl_dim_in);
4069         read = !strcmp(type, "read");
4070
4071         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
4072         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
4073                                         priv);
4074
4075         access = group_access_relation(group, read, !read);
4076         access = isl_union_map_apply_domain(access, priv);
4077         access_map = isl_map_from_union_map(access);
4078
4079         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
4080         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
4081
4082         return copy_access(gen, sched, type, group, build, 1);
4083 }
4084
4085 /* Return code for reading into or writing from shared or private memory.
4086  *
4087  * "schedule" is of the form
4088  *
4089  *      type[S -> A] -> L
4090  *
4091  * with S be the first shared_len dimensions of the computed schedule,
4092  * A the array and L the schedule correponding to the generated loops.
4093  * The array reference group is attached to "type".
4094  */
4095 static __isl_give isl_ast_node *create_access_leaf(
4096         struct gpu_gen *gen, __isl_take isl_map *schedule,
4097         __isl_take isl_ast_build *build)
4098 {
4099         struct gpu_array_ref_group *group;
4100         isl_id *id;
4101
4102         id = isl_map_get_tuple_id(schedule, isl_dim_in);
4103         group = isl_id_get_user(id);
4104         isl_id_free(id);
4105
4106         if (group->private_tile)
4107                 return copy_group_private_accesses(gen, group, schedule,
4108                                                         build);
4109         else
4110                 return copy_group_shared_accesses(gen, group, schedule,
4111                                                         build);
4112 }
4113
4114 /* Create a domain node representing a synchronization.
4115  */
4116 static __isl_give isl_ast_node *create_sync_leaf(
4117         struct gpu_gen *gen, __isl_take isl_map *schedule,
4118         __isl_take isl_ast_build *build)
4119 {
4120         struct ppcg_kernel_stmt *stmt;
4121         isl_id *id;
4122         isl_space *space;
4123         isl_ast_node *node;
4124         isl_ast_expr *expr;
4125
4126         isl_map_free(schedule);
4127
4128         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
4129         if (!stmt)
4130                 return NULL;
4131
4132         stmt->type = ppcg_kernel_sync;
4133
4134         space = isl_ast_build_get_schedule_space(build);
4135         space = isl_space_from_domain(space);
4136         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
4137         expr = isl_ast_build_call_from_pw_multi_aff(build,
4138                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
4139         node = isl_ast_node_alloc_user(expr);
4140         isl_ast_build_free(build);
4141
4142         id = isl_id_alloc(gen->ctx, NULL, stmt);
4143         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
4144         return isl_ast_node_set_annotation(node, id);
4145 }
4146
4147 /* This function is called during the code generation at the point
4148  * where the schedule domain element is completely determined by
4149  * the generated code.  The input schedule contains the original
4150  * statements as well as synchronization and copy "statements".
4151  * The latter are scheduled at different points than any of the original
4152  * statements, so they will only arrive here in isolation.
4153  *
4154  * If the current schedule only refers to a single statement,
4155  * we check if it is a copy or synchronization statement and
4156  * call the appropriate functions.
4157  * Otherwise, we assume we are dealing with the original statements
4158  * and we call create_domain_leaf.
4159  */
4160 static __isl_give isl_ast_node *create_kernel_leaf(
4161         __isl_take isl_ast_build *build, void *user)
4162 {
4163         struct gpu_gen *gen = (struct gpu_gen *) user;
4164         isl_map *map;
4165         isl_union_map *schedule;
4166         const char *name;
4167
4168         schedule = isl_ast_build_get_schedule(build);
4169
4170         if (isl_union_map_n_map(schedule) != 1)
4171                 return create_domain_leaf(schedule, build, user);
4172
4173         map = isl_map_from_union_map(schedule);
4174         name = isl_map_get_tuple_name(map, isl_dim_in);
4175         if (!strcmp(name, "read") || !strcmp(name, "write"))
4176                 return create_access_leaf(gen, map, build);
4177         if (!strcmp(name, "sync"))
4178                 return create_sync_leaf(gen, map, build);
4179
4180         return create_domain_leaf(isl_union_map_from_map(map), build, user);
4181 }
4182
4183 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
4184  * have value 0) and all even schedule dimensions as "unroll".
4185  *
4186  * That is, the options look as follows
4187  *
4188  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
4189  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
4190  *
4191  * The even positions are used to be able to schedule copying blocks
4192  * and synchronization before or after each level of the shared memory
4193  * tile loops and we want to make sure that code for these is generated
4194  * separately (within each level).
4195  */
4196 static __isl_give isl_ast_build *set_atomic_and_unroll(
4197         __isl_take isl_ast_build *build,
4198         __isl_take isl_space *space, int sched_len)
4199 {
4200         isl_ctx *ctx;
4201         isl_map *map;
4202         isl_constraint *c;
4203         isl_union_map *opt;
4204         isl_local_space *ls;
4205         int i, n;
4206
4207         ctx = isl_ast_build_get_ctx(build);
4208
4209         space = isl_space_params(space);
4210         space = isl_space_add_dims(space, isl_dim_set, sched_len);
4211         space = isl_space_from_domain(space);
4212         space = isl_space_add_dims(space, isl_dim_out, 2);
4213         map = isl_map_universe(isl_space_copy(space));
4214         for (i = 0; i < sched_len; i += 2)
4215                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
4216         ls = isl_local_space_from_space(isl_map_get_space(map));
4217         c = isl_equality_alloc(ls);
4218         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4219         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4220         c = isl_constraint_set_constant_si(c, 1);
4221         map = isl_map_add_constraint(map, c);
4222         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4223         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
4224         opt = isl_union_map_from_map(map);
4225
4226         map = isl_map_universe(space);
4227         ls = isl_local_space_from_space(isl_map_get_space(map));
4228         c = isl_equality_alloc(ls);
4229         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
4230         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
4231         map = isl_map_add_constraint(map, c);
4232         map = isl_map_project_out(map, isl_dim_out, 1, 1);
4233         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
4234         opt = isl_union_map_add_map(opt, map);
4235
4236         build = isl_ast_build_set_options(build, opt);
4237
4238         return build;
4239 }
4240
4241 /* Return a map that maps a space of dimension gen->shared_len
4242  * to its last dimensions starting at gen->tile_first.
4243  * The range is of dimension
4244  *
4245  *      2 * (gen->shared_len - gen->tile_first) + 1
4246  *
4247  * The input dimensions are mapped to the odd dimensions in the output,
4248  * while the even dimensions (except 2*pos) are fixed to 0.
4249  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
4250  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
4251  * are mapped to the output.  The remaining input dimensions are projected
4252  * out and the corresponding output dimensions are fixed to 0.
4253  */
4254 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
4255         __isl_take isl_space *space, int pos, int val)
4256 {
4257         int i, n;
4258         isl_map *proj;
4259
4260         space = isl_space_set_from_params(space);
4261         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
4262         space = isl_space_map_from_set(space);
4263         proj = isl_map_identity(space);
4264         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
4265         n = gen->shared_len - gen->tile_first;
4266         for (i = 0; i <= n; ++i) {
4267                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
4268                 if (i == pos)
4269                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
4270                 else
4271                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
4272         }
4273
4274         if (pos < 0)
4275                 return proj;
4276
4277         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
4278                                 gen->shared_len - (gen->tile_first + pos));
4279         for (i = pos; i < n; ++i)
4280                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
4281
4282         return proj;
4283 }
4284
4285 /* Given the AST context schedule "schedule" and the mapping from
4286  * domains to the shared tile loops "shared_sched", add a schedule
4287  * for a synchronization operation at position "val" of loop level "pos".
4288  *
4289  * schedule is of the form
4290  *
4291  *      D -> L
4292  *
4293  * (with D the iteration domains and L the already generated loops),
4294  * while shared_sched is of the form
4295  *
4296  *      D -> S
4297  *
4298  * We combine them into
4299  *
4300  *      L -> S
4301  *
4302  * apply a mapping
4303  *
4304  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4305  *
4306  * and use the result as a schedule for "sync".
4307  */
4308 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
4309         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4310         __isl_keep isl_union_map *shared_sched, int pos, int val)
4311 {
4312         isl_space *space;
4313         isl_map *proj, *map;
4314
4315         shared_sched = isl_union_map_copy(shared_sched);
4316         schedule = isl_union_map_copy(schedule);
4317
4318         space = isl_union_map_get_space(shared_sched);
4319         schedule = isl_union_map_apply_domain(shared_sched, schedule);
4320         map = isl_map_from_union_map(schedule);
4321
4322         proj = insert_even(gen, space, pos, val);
4323         map = isl_map_apply_range(map, proj);
4324         map = isl_map_from_range(isl_map_wrap(map));
4325         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
4326
4327         res = isl_union_map_add_map(res, map);
4328
4329         return res;
4330 }
4331
4332 /* Given the AST context schedule "schedule" and the mapping from
4333  * domains to the shared tile loops "shared_sched", add a schedule
4334  * for copying an array reference group to/from shared/private memory.
4335  * "read" is set if data should be copied from global memory
4336  * to shared/private memory.
4337  * "k" represents the current group
4338  * "s" is the total number of groups
4339  *
4340  * We schedule an operation before or after the innermost loop
4341  * of "shared_sched" that affects the tile of the array reference group.
4342  *
4343  * schedule is of the form
4344  *
4345  *      D -> L
4346  *
4347  * (with D the iteration domains and L the already generated loops),
4348  * while shared_sched is of the form
4349  *
4350  *      D -> S
4351  *
4352  * We first compute the access relation for the reference group
4353  *
4354  *      D -> A
4355  *
4356  * and combine it with shared_sched into
4357  *
4358  *      D -> [S -> A]
4359  *
4360  * If this results in an empty relation, no copying needs to be performed
4361  * at this point.
4362  * Otherwise, we invert the relation and combine it with "schedule" into
4363  *
4364  *      [S -> A] -> L
4365  *
4366  * The actual additional piece of the schedule is obtained from combining
4367  *
4368  *      [S -> A] -> S
4369  *
4370  * with a mapping
4371  *
4372  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
4373  *
4374  * The position of "val" corresponds to the innermost loop that affects
4375  * the tile and the value indicates where the copying is scheduled
4376  * with respect to the actual kernel code (at value 0).
4377  * Reads are schedule before the code, writes to global memory from
4378  * private memory are scheduled at values 1 to s, writes to global
4379  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
4380  *
4381  * If we are scheduling a read from global memory to shared memory,
4382  * we insert a synchronization before the kernel code (at the innermost
4383  * level).
4384  * If we are scheduling a write to global memory, then we add
4385  * a synchronization after all writes (at value 2 *s + 2).
4386  * However, there is no need for a synchronization after the outermost loop.
4387  * A write to global memory from private memory at the innermost level
4388  * does not require a synchronization, because it is covered by
4389  * the synchronization after the kernel inserted by body_schedule.
4390  */
4391 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
4392         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
4393         __isl_keep isl_union_map *shared_sched,
4394         struct gpu_array_ref_group *group, int read, int k, int s)
4395 {
4396         int n;
4397         int pos, val;
4398         isl_space *space;
4399         isl_union_map *access;
4400         isl_map *map, *proj, *access_map;
4401         isl_id *id;
4402
4403         access = group_access_relation(group, read, !read);
4404         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
4405                                                 access);
4406
4407         if (isl_union_map_is_empty(access)) {
4408                 isl_union_map_free(access);
4409                 return res;
4410         }
4411
4412         access = isl_union_map_reverse(access);
4413         access = isl_union_map_apply_range(access,
4414                                             isl_union_map_copy(schedule));
4415         access_map = isl_map_from_union_map(access);
4416
4417         space = isl_space_copy(group->array->space);
4418         space = isl_space_from_range(space);
4419         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
4420         map = isl_map_domain_map(isl_map_universe(space));
4421
4422         space = isl_union_map_get_space(schedule);
4423         pos = group->last_shared + 1 - gen->tile_first;
4424         assert(pos >= 0);
4425         if (read)
4426                 val = -2 - k;
4427         else if (group->private_tile)
4428                 val = 1 + k;
4429         else
4430                 val = 1 + s + 1 + k;
4431         proj = insert_even(gen, space, pos, val);
4432         map = isl_map_apply_range(map, proj);
4433
4434         access_map = isl_map_range_product(access_map, map);
4435
4436         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
4437         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
4438
4439         res = isl_union_map_add_map(res, access_map);
4440
4441         n = gen->shared_len - gen->tile_first;
4442         if (read) {
4443                 if (!group->private_tile)
4444                         res = add_sync_schedule(gen, res, schedule,
4445                                                 shared_sched, n, -1);
4446         } else {
4447                 if (pos == 0)
4448                         return res;
4449                 if (pos == n && group->private_tile)
4450                         return res;
4451                 res = add_sync_schedule(gen, res, schedule, shared_sched,
4452                                         pos, 2 * s + 2);
4453         }
4454
4455         return res;
4456 }
4457
4458 /* Return a schedule for the shared tile loops based on the current
4459  * AST context schedule.
4460  *
4461  * We create a "shared_sched" that maps the domains to the first
4462  * shared_len dimensions of the computed schedule, project out the
4463  * first tile_first dimensions (as these are already covered by
4464  * the host code) and insert "statement-level" dimensions at even
4465  * positions so that we can schedule copy blocks and synchronization
4466  * before/after each level.
4467  *
4468  * In particular, copy blocks are inserted inside the innermost
4469  * level that affect the tile.  For the copying to global memory,
4470  * those from private memory are scheduled before those from shared
4471  * memory such that synchronization can be inserted between the two
4472  * at the innermost level.
4473  * Synchronization is inserted at the innermost level before the
4474  * actual kernel code if there is any copying from global memory
4475  * to shared memory.  It is inserted unconditionally at the innermost
4476  * level after the actual kernel code and the copying to global memory
4477  * from private memory (if any).  Finally, it is inserted after
4478  * any copying to global memory, except at the outermost level
4479  * and at the innermost level if there is no copying from shared
4480  * memory.  The copying from private memory is covered by the unconditional
4481  * synchronization at the innermost level.
4482  */
4483 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
4484         __isl_take isl_union_map *schedule)
4485 {
4486         isl_space *space;
4487         isl_union_map *res;
4488         isl_union_map *shared_sched;
4489         isl_union_map *sched;
4490         isl_map *proj, *map;
4491         int i, j, k, s;
4492
4493         shared_sched = isl_union_map_copy(gen->tiled_sched);
4494         proj = projection(isl_union_map_get_space(shared_sched),
4495                                 gen->tiled_len, gen->shared_len);
4496         shared_sched = isl_union_map_apply_range(shared_sched,
4497                                 isl_union_map_from_map(proj));
4498         space = isl_union_map_get_space(shared_sched);
4499         proj = insert_even(gen, space, -1, 0);
4500         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
4501                                 isl_union_map_from_map(proj));
4502
4503         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
4504
4505         s = 0;
4506         for (i = 0; i < gen->prog->n_array; ++i)
4507                 s += gen->prog->array[i].n_group;
4508
4509         k = 0;
4510         for (i = 0; i < gen->prog->n_array; ++i) {
4511                 struct gpu_array_info *array = &gen->prog->array[i];
4512
4513                 for (j = 0; j < array->n_group; ++j) {
4514                         struct gpu_array_ref_group *group;
4515
4516                         group = array->groups[j];
4517                         if (!group->private_tile && !group->shared_tile)
4518                                 continue;
4519                         res = add_group_schedule(gen, res, schedule,
4520                                                 shared_sched, group, 0, k, s);
4521                         res = add_group_schedule(gen, res, schedule,
4522                                                 shared_sched, group, 1, k, s);
4523                         ++k;
4524                 }
4525         }
4526
4527         res = add_sync_schedule(gen, res, schedule, shared_sched,
4528                             gen->shared_len - gen->tile_first, 1 + s);
4529
4530         isl_union_map_free(shared_sched);
4531         isl_union_map_free(schedule);
4532
4533         return res;
4534 }
4535
4536 /* Generate code for "kernel" in the given "context".
4537  *
4538  * We first generate code for the shared tile loops (T1T, T1P and T2)
4539  * in a context that includes the block ids.
4540  * Within each iteration of these loops an additional code generation
4541  * is performed (within create_kernel_leaf) for the rest of the schedule
4542  * in a context that includes the thread ids.
4543  */
4544 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
4545         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
4546         __isl_keep isl_multi_pw_aff *grid_size)
4547 {
4548         isl_space *space;
4549         isl_set *set;
4550         isl_id_list *iterators;
4551         isl_union_map *schedule;
4552         isl_ast_node *tree;
4553         int sched_len;
4554
4555         schedule = isl_ast_build_get_schedule(build);
4556
4557         build = isl_ast_build_copy(build);
4558         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
4559         space = isl_ast_build_get_schedule_space(build);
4560         set = isl_set_universe(isl_space_copy(space));
4561         set = add_bounded_parameters_dynamic(set, grid_size, "b");
4562         build = isl_ast_build_restrict(build, set);
4563
4564         schedule = body_schedule(gen, schedule);
4565
4566         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
4567
4568         build = set_atomic_and_unroll(build, space, sched_len);
4569         iterators = generate_names(gen->ctx, sched_len, "g");
4570         build = isl_ast_build_set_iterators(build, iterators);
4571         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
4572         tree = isl_ast_build_ast_from_schedule(build, schedule);
4573         isl_ast_build_free(build);
4574
4575         return tree;
4576 }
4577
4578 /* Attach "id" to the given node.
4579  */
4580 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
4581         __isl_keep isl_ast_build *build, void *user)
4582 {
4583         isl_id *id = user;
4584
4585         node = isl_ast_node_set_annotation(node, id);
4586
4587         return node;
4588 }
4589
4590 /* Construct an AST node for performing a kernel launch and attach
4591  * the information about the kernel to that node.
4592  *
4593  * The kernel AST has been constructed in the context of the range
4594  * of "schedule".  In particular, the grid size has been computed
4595  * in the context.  We therefore still need to make sure that these
4596  * constraints are expressed in the code.  We do this by creating a schedule
4597  *
4598  *      kernel[] -> [S -> []]
4599  *
4600  * where S is the schedule domain, i.e., the range of "schedule".
4601  * The AST generation will then create a single call surrounded by
4602  * all the condition in "S" that have not been expressed yet.
4603  *
4604  * The kernel information is attached to this node in attach_id.
4605  */
4606 static __isl_give isl_ast_node *construct_launch(
4607         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
4608         __isl_take struct ppcg_kernel *kernel)
4609 {
4610         isl_id *id;
4611         isl_ctx *ctx;
4612         isl_union_set *domain;
4613         isl_set *set;
4614         isl_map *map;
4615         isl_ast_node *node;
4616
4617         ctx = isl_ast_build_get_ctx(build);
4618
4619         id = isl_id_alloc(ctx, NULL, kernel);
4620         id = isl_id_set_free_user(id, &ppcg_kernel_free);
4621
4622         domain = isl_union_map_range(schedule);
4623         set = isl_set_from_union_set(domain);
4624         map = isl_map_from_domain(set);
4625         map = isl_map_from_range(isl_map_wrap(map));
4626         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
4627         schedule = isl_union_map_from_map(map);
4628
4629         build = isl_ast_build_set_at_each_domain(build, &attach_id, id);
4630         node = isl_ast_build_ast_from_schedule(build, schedule);
4631         isl_ast_build_free(build);
4632
4633         return node;
4634 }
4635
4636 /* This function is called for each leaf in the AST of the host code.
4637  * We first specialize the schedule to the site of the leaf, compute
4638  * the size of shared memory and then construct the body of the host code
4639  * and the associated kernel.
4640  *
4641  * The necessary information for printing the kernel launch is
4642  * stored in a struct ppcg_kernel and attached to the leaf node
4643  * created to represent the launch.
4644  */
4645 static __isl_give isl_ast_node *create_host_leaf(
4646         __isl_take isl_ast_build *build, void *user)
4647 {
4648         struct gpu_gen *gen = (struct gpu_gen *) user;
4649         isl_id *id;
4650         isl_ast_node *node;
4651         struct ppcg_kernel *kernel;
4652         isl_set *host_domain;
4653         isl_union_map *schedule;
4654         isl_union_map *local_sched;
4655         isl_union_map *access;
4656         isl_union_set *domain;
4657         int i;
4658
4659         schedule = isl_ast_build_get_schedule(build);
4660
4661         isl_union_map_foreach_map(schedule, &extract_tile_len, gen);
4662         read_sizes(gen);
4663
4664         domain = isl_union_map_domain(isl_union_map_copy(schedule));
4665
4666         local_sched = isl_union_map_copy(gen->sched);
4667         local_sched = isl_union_map_intersect_domain(local_sched, domain);
4668         access = isl_union_map_union(isl_union_map_copy(gen->prog->read),
4669                                      isl_union_map_copy(gen->prog->may_write));
4670         access = isl_union_map_apply_domain(access,
4671                                             isl_union_map_copy(local_sched));
4672
4673         gen->tiled_sched = tile_schedule(gen, local_sched);
4674         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
4675         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
4676
4677         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
4678         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
4679         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
4680
4681         kernel = gen->kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
4682         if (!kernel)
4683                 goto error;
4684
4685         kernel->id = gen->kernel_id++;
4686         kernel->context = isl_union_map_params(isl_union_map_copy(schedule));
4687         kernel->grid_size = extract_grid_size(gen, kernel);
4688         extract_block_size(gen, kernel);
4689         kernel->arrays = isl_union_map_range(access);
4690         kernel->arrays = isl_union_set_apply(kernel->arrays,
4691                                 isl_union_map_copy(gen->prog->to_outer));
4692         kernel->space = isl_ast_build_get_schedule_space(build);
4693
4694         gen->private_access = NULL;
4695         compute_shared_sched(gen);
4696         gen->privatization = compute_privatization(gen);
4697         group_references(gen);
4698         compute_private_access(gen);
4699         check_shared_memory_bound(gen);
4700         compute_group_tilings(gen);
4701         host_domain = isl_set_from_union_set(isl_union_map_range(
4702                                                 isl_union_map_copy(schedule)));
4703         localize_bounds(gen, kernel, host_domain);
4704
4705         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
4706
4707         kernel->tree = generate_kernel(gen, build, host_domain,
4708                                         kernel->grid_size);
4709         create_kernel_vars(gen, kernel);
4710
4711         free_local_array_info(gen);
4712         isl_map_free(gen->privatization);
4713         isl_union_map_free(gen->private_access);
4714         isl_union_map_free(gen->local_sched);
4715         isl_union_map_free(gen->tiled_sched);
4716         isl_union_map_free(gen->shared_sched);
4717         isl_union_map_free(gen->shared_proj);
4718         isl_set_free(host_domain);
4719         free(gen->tile_size);
4720
4721         node = construct_launch(build, schedule, kernel);
4722
4723         return node;
4724 error:
4725         isl_union_map_free(schedule);
4726         return NULL;
4727 }
4728
4729 /* Use isl to generate code for the outer gen->tile_first loops
4730  * of the global schedule in gen->sched, resulting in the host code.
4731  * Within each iteration of this partial schedule, i.e., for each kernel
4732  * launch, create_host_leaf takes care of generating the kernel code.
4733  */
4734 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
4735 {
4736         isl_ast_build *build;
4737         isl_ast_node *tree;
4738         isl_union_map *sched;
4739         isl_map *proj;
4740         isl_id_list *iterators;
4741
4742         sched = isl_union_map_copy(gen->sched);
4743         proj = projection(isl_union_map_get_space(sched),
4744                             gen->untiled_len, gen->tile_first);
4745         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
4746
4747         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
4748         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
4749         iterators = generate_names(gen->ctx, gen->tile_first, "h");
4750         build = isl_ast_build_set_iterators(build, iterators);
4751         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
4752         tree = isl_ast_build_ast_from_schedule(build, sched);
4753         isl_ast_build_free(build);
4754
4755         return tree;
4756 }
4757
4758 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
4759 {
4760         if (!str)
4761                 return NULL;
4762         return isl_union_map_read_from_str(ctx, str);
4763 }
4764
4765 /* Information about the outermost tilable bands in the forest of bands.
4766  *
4767  * tile_len and n_parallel are only sets on band_info structures
4768  * that correspond to outermost bands.  For other bands (in particular,
4769  * ancestors of the outermost bands), n_parallal is set to 0.
4770  *
4771  * prefix is the (padded) schedule leading up to the outermost tilable bands.
4772  *
4773  * tile_first is the number of schedule dimensions in prefix.
4774  *
4775  * suffix is the schedule of the outermost tilable bands and their descendants.
4776  */
4777 struct band_info {
4778         struct gpu_gen *gen;
4779         int tile_first;
4780         int tile_len;
4781         int n_parallel;
4782         isl_union_map *prefix;
4783         isl_union_map *suffix;
4784 };
4785
4786 /* Set tile_len and n_parallel of the statement to that of
4787  * their outermost band, recorded in the band_info.
4788  */
4789 static int set_stmt_tile_len(__isl_take isl_map *map, void *user)
4790 {
4791         struct band_info *info = user;
4792         struct gpu_stmt *stmt;
4793         isl_id *id;
4794
4795         id = isl_map_get_tuple_id(map, isl_dim_in);
4796         stmt = find_stmt(info->gen->prog, id);
4797         isl_id_free(id);
4798
4799         stmt->tile_len = info->tile_len;
4800         stmt->n_parallel = info->n_parallel;
4801
4802         isl_map_free(map);
4803
4804         return 0;
4805 }
4806
4807 static void list_select_outer_band(struct gpu_gen *gen,
4808         __isl_take isl_band_list *list, int pos, struct band_info *list_info);
4809
4810 /* Check if this band has any parallel loops.  If so, take it as
4811  * the outermost tilable band.  If not, continue looking for the
4812  * outermost tilable band in the children of the current band.
4813  */
4814 static void band_select_outer_band(struct gpu_gen *gen,
4815         __isl_take isl_band *band, int pos, struct band_info *info)
4816 {
4817         int n = isl_band_n_member(band);
4818         int n_parallel;
4819
4820         for (n_parallel = 0; n_parallel < n; ++n_parallel)
4821                 if (!isl_band_member_is_coincident(band, n_parallel))
4822                         break;
4823
4824         info->n_parallel = n_parallel;
4825         if (n_parallel) {
4826                 gen->any_parallelism = 1;
4827                 info->gen = gen;
4828                 info->tile_first = pos;
4829                 info->tile_len = n;
4830                 info->prefix = isl_band_get_prefix_schedule(band);
4831                 info->suffix = isl_union_map_flat_range_product(
4832                                 isl_band_get_partial_schedule(band),
4833                                 isl_band_get_suffix_schedule(band));
4834                 isl_union_map_foreach_map(info->prefix,
4835                                             &set_stmt_tile_len, info);
4836         } else if (isl_band_has_children(band)) {
4837                 isl_band_list *children;
4838                 children = isl_band_get_children(band);
4839                 list_select_outer_band(gen, children, pos + n, info);
4840         } else {
4841                 info->gen = gen;
4842                 info->tile_first = pos + n;
4843                 info->tile_len = 0;
4844                 info->prefix = isl_union_map_flat_range_product(
4845                                 isl_band_get_prefix_schedule(band),
4846                                 isl_band_get_partial_schedule(band));
4847                 info->suffix = isl_band_get_suffix_schedule(band);
4848                 isl_union_map_foreach_map(info->prefix,
4849                                             &set_stmt_tile_len, info);
4850         }
4851
4852         isl_band_free(band);
4853 }
4854
4855 /* Comparison function that returns a non-zero value for band_infos
4856  * with different tile_len fields or different n_parallel fields.
4857  */
4858 static int cmp_band(const void *p1, const void *p2)
4859 {
4860         const struct band_info *info1 = p1;
4861         const struct band_info *info2 = p2;
4862
4863         if (info1->tile_len != info2->tile_len)
4864                 return info1->tile_len - info2->tile_len;
4865
4866         return info1->n_parallel - info2->n_parallel;
4867 }
4868
4869 /* Extend "umap" with coordinates with fixed value "val"
4870  * to a total length of "dst_len", assuming the original dimension is "src_len".
4871  */
4872 static __isl_give isl_union_map *extend_range(
4873         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
4874 {
4875         isl_space *dim;
4876         isl_map *map;
4877         int i;
4878
4879         dim = isl_union_map_get_space(umap);
4880         map = isl_map_reverse(projection(dim, dst_len, src_len));
4881         for (i = src_len; i < dst_len; ++i)
4882                 map = isl_map_fix_si(map, isl_dim_out, i, val);
4883
4884         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
4885
4886         return umap;
4887 }
4888
4889 /* Group bands with the same values for tile_len and n_parallel.
4890  * The prefix schedule is then extended with a fixed coordinate that
4891  * is different for each such group.
4892  * Note that the actual values for this coordinate are not important.
4893  * The bands have already been effectively separated at a higher level
4894  * or they are independent and may be executed in parallel.
4895  * The list of band_info has been sorted before this functions is called.
4896  */
4897 static void separate_bands(struct band_info *info, int n)
4898 {
4899         int i;
4900         int j = 0;
4901
4902         for (i = 0; i < n; ++i) {
4903                 int l = info[i].tile_first;
4904
4905                 if (i &&
4906                     (info[i].tile_len != info[i - 1].tile_len ||
4907                      info[i].n_parallel != info[i - 1].n_parallel))
4908                         j++;
4909
4910                 info[i].prefix = extend_range(info[i].prefix,
4911                                                 l, l + 1, j);
4912                 info[i].tile_first = l + 1;
4913         }
4914 }
4915
4916 /* Select the outermost bands in the elements of the list, align
4917  * their prefix schedules, separate bands with different values
4918  * for tile_len and/or n_parallel and then combine the resulting
4919  * prefix and suffix schedules into a single pair of prefix and
4920  * suffix schedules for the entire list.
4921  */
4922 static void list_select_outer_band(struct gpu_gen *gen,
4923         __isl_take isl_band_list *list, int pos, struct band_info *list_info)
4924 {
4925         isl_band *band;
4926         int i;
4927         int n = isl_band_list_n_band(list);
4928         isl_ctx *ctx = isl_band_list_get_ctx(list);
4929         struct band_info *info;
4930         int max_tile_first;
4931         isl_union_map *prefix;
4932         isl_union_map *suffix;
4933
4934         assert(n >= 1);
4935         info = isl_calloc_array(ctx, struct band_info, n);
4936         assert(info);
4937
4938         max_tile_first = 0;
4939         for (i = 0; i < n; ++i) {
4940                 band = isl_band_list_get_band(list, i);
4941                 band_select_outer_band(gen, band, pos, &info[i]);
4942                 if (info[i].tile_first > max_tile_first)
4943                         max_tile_first = info[i].tile_first;
4944         }
4945
4946         for (i = 0; i < n; ++i) {
4947                 if (info[i].tile_first == max_tile_first)
4948                         continue;
4949                 info[i].prefix = extend_range(info[i].prefix,
4950                                         info[i].tile_first, max_tile_first, 0);
4951                 info[i].tile_first = max_tile_first;
4952         }
4953
4954         qsort(info, n, sizeof(struct band_info), &cmp_band);
4955
4956         for (i = 0; i < n - 1; ++i)
4957                 if (info[i].tile_len != info[i + 1].tile_len ||
4958                     info[i].n_parallel != info[i + 1].n_parallel)
4959                         break;
4960
4961         if (i < n -1)
4962                 separate_bands(info, n);
4963
4964         prefix = info[0].prefix;
4965         suffix = info[0].suffix;
4966
4967         for (i = 1; i < n; ++i) {
4968                 prefix = isl_union_map_union(prefix, info[i].prefix);
4969                 suffix = isl_union_map_union(suffix, info[i].suffix);
4970         }
4971
4972         list_info->tile_first = info[0].tile_first;
4973         list_info->tile_len = -1;
4974         list_info->prefix = prefix;
4975         list_info->suffix = suffix;
4976
4977         isl_band_list_free(list);
4978         free(info);
4979 }
4980
4981 /* Select the outermost tilable band that (by construction)
4982  * has at least one parallel loop.
4983  * The starting position of the aligned band is stored in the pair
4984  * gen->tile_first.
4985  * The sizes and number of parallel loops may be different in different
4986  * parts of the band forest and are therefore stored in the gpu_stmts.
4987  *
4988  * Return the complete schedule, with the tilable bands aligned
4989  * at gen->tile_first and padded with zero, if needed.
4990  */
4991 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
4992         __isl_keep isl_schedule *schedule)
4993 {
4994         isl_band_list *list;
4995         struct band_info info;
4996
4997         gen->n_parallel = 0;
4998         gen->tile_len = -1;
4999
5000         list = isl_schedule_get_band_forest(schedule);
5001
5002         if (isl_band_list_n_band(list) == 0) {
5003                 isl_band_list_free(list);
5004                 return isl_schedule_get_map(schedule);
5005         }
5006
5007         list_select_outer_band(gen, list, 0, &info);
5008
5009         gen->tile_first = info.tile_first;
5010         info.suffix = align_range(info.suffix);
5011
5012         return isl_union_map_flat_range_product(info.prefix, info.suffix);
5013 }
5014
5015 /* Set gen->untiled_len to the number of scheduling dimensions
5016  * for the schedule of the first domain.
5017  * We assume here that this number is the same for all domains.
5018  */
5019 static int set_untiled_len(__isl_take isl_map *map, void *user)
5020 {
5021         unsigned *untiled_len = user;
5022
5023         *untiled_len = isl_map_dim(map, isl_dim_out);
5024
5025         isl_map_free(map);
5026         return -1;
5027 }
5028
5029 /* Compute an appropriate schedule based on the accesses in
5030  * gen->read and gen->write.
5031  *
5032  * We use the dependences in gen->prog->scop to compute
5033  * a schedule that has a parallel loop in each tilable band.
5034  * Finally, we select the outermost tilable band.
5035  */
5036 static void compute_schedule(struct gpu_gen *gen)
5037 {
5038         isl_union_set *domain;
5039         isl_union_map *dep_raw, *dep;
5040         isl_union_map *sched;
5041         isl_schedule_constraints *sc;
5042         isl_schedule *schedule;
5043
5044         dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
5045
5046         dep = isl_union_map_copy(gen->prog->scop->dep_false);
5047         dep = isl_union_map_union(dep, dep_raw);
5048         dep = isl_union_map_coalesce(dep);
5049
5050         domain = isl_union_set_copy(gen->prog->scop->domain);
5051         domain = isl_union_set_intersect_params(domain,
5052                                 isl_set_copy(gen->prog->scop->context));
5053         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
5054         sc = isl_schedule_constraints_set_validity(sc, isl_union_map_copy(dep));
5055         sc = isl_schedule_constraints_set_coincidence(sc,
5056                                                     isl_union_map_copy(dep));
5057         sc = isl_schedule_constraints_set_proximity(sc, dep);
5058
5059         if (gen->options->debug->dump_schedule_constraints)
5060                 isl_schedule_constraints_dump(sc);
5061         schedule = isl_schedule_constraints_compute_schedule(sc);
5062         if (gen->options->debug->dump_schedule)
5063                 isl_schedule_dump(schedule);
5064
5065         sched = select_outer_tilable_band(gen, schedule);
5066
5067         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
5068         sched = isl_union_map_intersect_domain(sched, domain);
5069         gen->sched = sched;
5070
5071         isl_schedule_free(schedule);
5072 }
5073
5074 /* Compute the sets of outer array elements that need to be copied in and out.
5075  *
5076  * In particular, for each array that is possibly written anywhere in
5077  * gen->prog and that is visible outside the corresponding scop,
5078  * we copy out its entire extent.
5079  *
5080  * Any array elements that is read without first being written needs
5081  * to be copied in. Furthermore, if there are any array elements that
5082  * are copied out, but that may not be written inside gen->prog, then
5083  * they also need to be copied in to ensure that the value after execution
5084  * is the same as the value before execution.
5085  * In case the array elements are structures, we need to take into
5086  * account that all members of the structures need to be written
5087  * by gen->prog before we can avoid copying the data structure in.
5088  *
5089  * While computing the set of array elements that are copied out but
5090  * not necessarily written, we intersect both sets with the context.
5091  * This helps in those cases where the arrays are declared with a fixed size,
5092  * while the accesses are parametric and the context assigns a fixed value
5093  * to the parameters.
5094  *
5095  * If an element from a local array is read without first being written,
5096  * then there is no point in copying it in since it cannot have been
5097  * written prior to the scop.  Warn about the uninitialized read instead.
5098  */
5099 static void compute_copy_in_and_out(struct gpu_gen *gen)
5100 {
5101         int i;
5102         isl_union_set *local;
5103         isl_union_set *may_write, *must_write;
5104         isl_union_set *copy_in, *copy_out;
5105         isl_union_set *not_written;
5106         isl_union_map *uninitialized;
5107         isl_union_map *local_uninitialized;
5108
5109         must_write = isl_union_map_range(
5110                                 isl_union_map_copy(gen->prog->must_write));
5111         must_write = isl_union_set_intersect_params(must_write,
5112                                             isl_set_copy(gen->prog->context));
5113         may_write = isl_union_map_range(
5114                                 isl_union_map_copy(gen->prog->may_write));
5115         may_write = isl_union_set_intersect_params(may_write,
5116                                             isl_set_copy(gen->prog->context));
5117         may_write = isl_union_set_universe(may_write);
5118         may_write = isl_union_set_apply(may_write,
5119                                     isl_union_map_copy(gen->prog->to_outer));
5120         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
5121         local = isl_union_set_copy(copy_out);
5122
5123         for (i = 0; i < gen->prog->n_array; ++i) {
5124                 isl_space *space;
5125                 isl_set *write_i;
5126                 int empty;
5127
5128                 space = isl_space_copy(gen->prog->array[i].space);
5129
5130                 if (gen->prog->array[i].local) {
5131                         isl_set *set;
5132
5133                         set = isl_set_universe(space);
5134                         local = isl_union_set_add_set(local, set);
5135                         continue;
5136                 }
5137
5138                 write_i = isl_union_set_extract_set(may_write, space);
5139                 empty = isl_set_fast_is_empty(write_i);
5140                 isl_set_free(write_i);
5141                 if (empty)
5142                         continue;
5143
5144                 write_i = isl_set_copy(gen->prog->array[i].extent);
5145                 copy_out = isl_union_set_add_set(copy_out, write_i);
5146         }
5147         isl_union_set_free(may_write);
5148
5149         copy_out = isl_union_set_intersect_params(copy_out,
5150                                             isl_set_copy(gen->prog->context));
5151
5152         gen->prog->copy_out = isl_union_set_copy(copy_out);
5153
5154         copy_out = isl_union_set_apply(copy_out,
5155                                     isl_union_map_copy(gen->prog->to_inner));
5156         not_written = isl_union_set_subtract(copy_out, must_write);
5157
5158         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
5159         local_uninitialized = isl_union_map_copy(uninitialized);
5160
5161         local = isl_union_set_apply(local,
5162                                     isl_union_map_copy(gen->prog->to_inner));
5163         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
5164                                                             local);
5165         if (!isl_union_map_is_empty(local_uninitialized)) {
5166                 fprintf(stderr,
5167                         "possibly uninitialized reads (not copied in):\n");
5168                 isl_union_map_dump(local_uninitialized);
5169         }
5170         uninitialized = isl_union_map_subtract(uninitialized,
5171                                                 local_uninitialized);
5172         copy_in = isl_union_map_range(uninitialized);
5173         copy_in = isl_union_set_union(copy_in, not_written);
5174         copy_in = isl_union_set_apply(copy_in,
5175                                     isl_union_map_copy(gen->prog->to_outer));
5176
5177         gen->prog->copy_in = copy_in;
5178 }
5179
5180 static struct gpu_stmt_access **expr_extract_access(struct pet_expr *expr,
5181         struct gpu_stmt_access **next_access)
5182 {
5183         struct gpu_stmt_access *access;
5184         isl_ctx *ctx = isl_map_get_ctx(expr->acc.access);
5185
5186         access = isl_alloc_type(ctx, struct gpu_stmt_access);
5187         assert(access);
5188         access->next = NULL;
5189         access->read = expr->acc.read;
5190         access->write = expr->acc.write;
5191         access->access = pet_expr_access_get_may_access(expr);
5192         access->exact_write = !expr->acc.write ||
5193                 isl_map_is_equal(expr->acc.access, access->access);
5194         access->ref_id = isl_id_copy(expr->acc.ref_id);
5195         access->group = -1;
5196
5197         *next_access = access;
5198         next_access = &(*next_access)->next;
5199         return next_access;
5200 }
5201
5202 static struct gpu_stmt_access **expr_extract_accesses(struct pet_expr *expr,
5203         struct gpu_stmt_access **next_access)
5204 {
5205         int i;
5206
5207         for (i = 0; i < expr->n_arg; ++i)
5208                 next_access = expr_extract_accesses(expr->args[i],
5209                                                         next_access);
5210
5211         if (expr->type == pet_expr_access)
5212                 next_access = expr_extract_access(expr, next_access);
5213
5214         return next_access;
5215 }
5216
5217 static void pet_stmt_extract_accesses(struct gpu_stmt *stmt)
5218 {
5219         struct gpu_stmt_access **next_access = &stmt->accesses;
5220
5221         stmt->accesses = NULL;
5222         expr_extract_accesses(stmt->stmt->body, next_access);
5223 }
5224
5225 /* Return an array of gpu_stmt representing the statements in "scop".
5226  */
5227 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
5228         __isl_keep isl_set *context)
5229 {
5230         int i;
5231         struct gpu_stmt *stmts;
5232
5233         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->n_stmt);
5234         if (!stmts)
5235                 return NULL;
5236
5237         for (i = 0; i < scop->n_stmt; ++i) {
5238                 struct gpu_stmt *s = &stmts[i];
5239
5240                 s->id = isl_set_get_tuple_id(scop->stmts[i]->domain);
5241                 s->stmt = scop->stmts[i];
5242                 pet_stmt_extract_accesses(s);
5243         }
5244
5245         return stmts;
5246 }
5247
5248 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
5249  */
5250 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
5251 {
5252         struct gpu_gen *gen = user;
5253
5254         return gen->print(p, gen->prog, gen->tree, &gen->types,
5255                             gen->print_user);
5256 }
5257
5258 /* Generate CUDA code for "scop" and print it to "p".
5259  * After generating an AST for the transformed scop as explained below,
5260  * we call "gen->print" to print the AST in the desired output format
5261  * to "p".
5262  *
5263  * If it turns out that it does not make sense to generate GPU code,
5264  * then we generate CPU code instead.
5265  *
5266  * The GPU code is generated in a context where at least one
5267  * statement instance is executed.  The corresponding guard (if any) is printed
5268  * around the entire generated GPU code, except for the declaration
5269  * of the arrays that are visible outside of the scop and that therefore
5270  * cannot be declared inside the body of any possible guard.
5271  *
5272  * We first compute a schedule that respects the dependences
5273  * of the original program and select the outermost band
5274  * of tilable dimensions that has at least one parallel loop.
5275  * We then have three blocks of dimensions
5276  *
5277  *      H               B                       G
5278  *
5279  * The tilable band "B" is first tiled according to "tile" sizes, resulting
5280  * in
5281  *
5282  *      H       T               P               G
5283  *
5284  * For each iteration of the T loop and for each array, we compute
5285  * the array elements accessed by that iteration, construct a rectangular
5286  * box around it and shift it to the origin.  The result is used
5287  * as shared memory for the array.
5288  *
5289  * We then split off at most 2 parallel loops from the T loops and
5290  * at most 3 parallel loops from the P loops
5291  *
5292  *      H       T1      T2      P1      P2      G
5293  *
5294  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
5295  * according to "grid"/"block" sizes.
5296  *
5297  *      H       T1T T1P T2      P1T P1P P2      G
5298  *
5299  * Finally, the T1P and P1P iterators are equated to the block and
5300  * thread dimensions respectively and so are effectively removed.
5301  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
5302  * are run on the GPU.
5303  *
5304  * Code is generated in three stages.  We first generate code for the
5305  * host (the H loops), with iterators h%d.  Then, for each leaf node
5306  * of the resulting AST, we generate code for the shared loops (up to
5307  * and including T2), with iterators g%d and after equating the H loops
5308  * to h%d parameters and the T1P loops to the block dimensions.
5309  * Finally, we generate code for the remaining loops in a similar fashion.
5310  */
5311 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
5312         struct gpu_gen *gen, struct ppcg_scop *scop,
5313         struct ppcg_options *options)
5314 {
5315         struct gpu_prog *prog;
5316         isl_ctx *ctx;
5317         isl_set *context, *guard;
5318
5319         if (!scop)
5320                 return isl_printer_free(p);
5321
5322         ctx = isl_printer_get_ctx(p);
5323         prog = gpu_prog_alloc(ctx, scop);
5324         if (!prog)
5325                 return isl_printer_free(p);
5326
5327         context = isl_set_copy(prog->context);
5328         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
5329         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
5330
5331         gen->prog = prog;
5332         gen->any_parallelism = 0;
5333         compute_schedule(gen);
5334
5335         if (!gen->any_parallelism) {
5336                 isl_set_free(context);
5337                 isl_set_free(guard);
5338                 p = print_cpu(p, scop, options);
5339         } else {
5340                 compute_copy_in_and_out(gen);
5341                 gen->tree = generate_host_code(gen);
5342                 p = ppcg_print_exposed_declarations(p, prog->scop);
5343                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
5344                 isl_ast_node_free(gen->tree);
5345         }
5346
5347         isl_union_map_free(gen->sched);
5348
5349         gpu_prog_free(prog);
5350
5351         return p;
5352 }
5353
5354 /* Wrapper around generate for use as a ppcg_transform callback.
5355  */
5356 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
5357         struct ppcg_scop *scop, void *user)
5358 {
5359         struct gpu_gen *gen = user;
5360
5361         return generate(p, gen, scop, gen->options);
5362 }
5363
5364 /* Transform the code in the file called "input" by replacing
5365  * all scops by corresponding GPU code and write the results to "out".
5366  */
5367 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
5368         struct ppcg_options *options,
5369         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
5370                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
5371                 struct gpu_types *types, void *user), void *user)
5372 {
5373         struct gpu_gen gen;
5374         int r;
5375         int i;
5376
5377         gen.ctx = ctx;
5378         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
5379         gen.options = options;
5380         gen.kernel_id = 0;
5381         gen.print = print;
5382         gen.print_user = user;
5383         gen.types.n = 0;
5384         gen.types.name = NULL;
5385
5386         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
5387
5388         isl_union_map_free(gen.sizes);
5389         for (i = 0; i < gen.types.n; ++i)
5390                 free(gen.types.name[i]);
5391         free(gen.types.name);
5392
5393         return r;
5394 }
5395
5396 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
5397 {
5398         struct gpu_prog *prog;
5399
5400         if (!scop)
5401                 return NULL;
5402
5403         prog = isl_calloc_type(ctx, struct gpu_prog);
5404         assert(prog);
5405
5406         prog->ctx = ctx;
5407         prog->scop = scop;
5408         prog->context = isl_set_copy(scop->context);
5409         prog->n_stmts = scop->n_stmt;
5410         prog->stmts = extract_stmts(ctx, scop, prog->context);
5411         prog->read = isl_union_map_copy(scop->reads);
5412         prog->may_write = isl_union_map_copy(scop->may_writes);
5413         prog->must_write = isl_union_map_copy(scop->must_writes);
5414         prog->to_inner = compute_to_inner(scop);
5415         prog->to_outer = isl_union_map_copy(prog->to_inner);
5416         prog->to_outer = isl_union_map_reverse(prog->to_outer);
5417
5418         if (!prog->stmts)
5419                 return gpu_prog_free(prog);
5420
5421         if (collect_array_info(prog) < 0)
5422                 return gpu_prog_free(prog);
5423
5424         return prog;
5425 }
5426
5427 void *gpu_prog_free(struct gpu_prog *prog)
5428 {
5429         if (!prog)
5430                 return NULL;
5431         free_array_info(prog);
5432         free_stmts(prog->stmts, prog->n_stmts);
5433         isl_union_map_free(prog->to_outer);
5434         isl_union_map_free(prog->to_inner);
5435         isl_union_set_free(prog->copy_in);
5436         isl_union_set_free(prog->copy_out);
5437         isl_union_map_free(prog->read);
5438         isl_union_map_free(prog->may_write);
5439         isl_union_map_free(prog->must_write);
5440         isl_set_free(prog->context);
5441         free(prog);
5442         return NULL;
5443 }