gpu.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2013 Ecole Normale Superieure
   4  *
   5  * Use of this software is governed by the MIT license
   6  *
   7  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   8  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
   9  * 91893 Orsay, France
  10  * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
  11  */
  12
  13 #include <assert.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16
  17 #include <isl/polynomial.h>
  18 #include <isl/union_set.h>
  19 #include <isl/aff.h>
  20 #include <isl/ilp.h>
  21 #include <isl/flow.h>
  22 #include <isl/schedule.h>
  23 #include <isl/schedule_node.h>
  24 #include <isl/options.h>
  25 #include <isl/ast_build.h>
  26
  27 #include "cpu.h"
  28 #include "gpu.h"
  29 #include "gpu_array_tile.h"
  30 #include "gpu_group.h"
  31 #include "schedule.h"
  32 #include "ppcg_options.h"
  33 #include "print.h"
  34
  35 struct gpu_array_info;
  36
  37 /* Collect all references to the given array and store pointers to them
  38  * in array->refs.
  39  *
  40  * If the array contains structures, then there is no need to collect
  41  * the references since we will not be computing any reference groups.
  42  */
  43 static void collect_references(struct gpu_prog *prog,
  44         struct gpu_array_info *array)
  45 {
  46         int i;
  47         int n;
  48
  49         if (array->has_compound_element)
  50                 return;
  51
  52         n = 0;
  53         for (i = 0; i < prog->n_stmts; ++i) {
  54                 struct gpu_stmt *stmt = &prog->stmts[i];
  55                 struct gpu_stmt_access *access;
  56
  57                 for (access = stmt->accesses; access; access = access->next) {
  58                         const char *name;
  59                         name = isl_map_get_tuple_name(access->access,
  60                                                       isl_dim_out);
  61                         if (name && !strcmp(array->name, name))
  62                                 n++;
  63                 }
  64         }
  65
  66         array->n_ref = n;
  67         array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
  68         assert(array->refs);
  69
  70         n = 0;
  71         for (i = 0; i < prog->n_stmts; ++i) {
  72                 struct gpu_stmt *stmt = &prog->stmts[i];
  73                 struct gpu_stmt_access *access;
  74
  75                 for (access = stmt->accesses; access; access = access->next) {
  76                         const char *name;
  77                         name = isl_map_get_tuple_name(access->access,
  78                                                       isl_dim_out);
  79                         if (!name || strcmp(array->name, name))
  80                                 continue;
  81
  82                         array->refs[n++] = access;
  83                 }
  84         }
  85 }
  86
  87 /* Compute and return the extent of "array", taking into account the set of
  88  * accessed elements.
  89  *
  90  * In particular, the extent in the outer dimension is taken
  91  * from "accessed", while the extents in the remaining dimensions
  92  * are taken from array->extent.
  93  *
  94  * The extent in the outer dimension cannot be taken from array->extent
  95  * because that may be unbounded.  Furthermore, even if it is bounded,
  96  * it may be larger than the piece of the array that is being accessed.
  97  */
  98 static __isl_give isl_set *compute_extent(struct pet_array *array,
  99         __isl_keep isl_set *accessed)
 100 {
 101         int n_index;
 102         isl_id *id;
 103         isl_set *outer;
 104         isl_set *extent;
 105
 106         extent = isl_set_copy(array->extent);
 107
 108         n_index = isl_set_dim(accessed, isl_dim_set);
 109         if (n_index == 0)
 110                 return extent;
 111
 112         extent = isl_set_project_out(extent, isl_dim_set, 0, 1);
 113         outer = isl_set_copy(accessed);
 114         outer = isl_set_project_out(outer, isl_dim_set, 1, n_index - 1);
 115         extent = isl_set_flat_product(outer, extent);
 116         id = isl_set_get_tuple_id(accessed);
 117         extent = isl_set_set_tuple_id(extent, id);
 118
 119         return extent;
 120 }
 121
 122 /* Is the array "array" being extracted a read-only scalar?
 123  *
 124  * That is, is "array" a scalar that is never possibly written to.
 125  * An array containing structures is never considered to be a scalar.
 126  */
 127 static int is_read_only_scalar(struct gpu_array_info *array,
 128         struct gpu_prog *prog)
 129 {
 130         isl_set *space;
 131         isl_union_map *write;
 132         int empty;
 133
 134         if (array->has_compound_element)
 135                 return 0;
 136         if (array->n_index != 0)
 137                 return 0;
 138
 139         write = isl_union_map_copy(prog->may_write);
 140         space = isl_set_universe(isl_space_copy(array->space));
 141         write = isl_union_map_intersect_range(write,
 142                                                 isl_union_set_from_set(space));
 143         empty = isl_union_map_is_empty(write);
 144         isl_union_map_free(write);
 145
 146         return empty;
 147 }
 148
 149 /* Compute bounds on the host array "pa" based on the corresponding
 150  * accessed elements in "arrays"
 151  * and collect all references to the array.
 152  * Store the results in "info".
 153  *
 154  * If the array is zero-dimensional and does not contain structures,
 155  * i.e., if the array is a scalar, we check whether it is read-only.
 156  * We also check whether the array is accessed at all.
 157  */
 158 static int extract_array_info(struct gpu_prog *prog,
 159         struct gpu_array_info *info, struct pet_array *pa,
 160         __isl_keep isl_union_set *arrays)
 161 {
 162         int i, empty;
 163         const char *name;
 164         int n_index;
 165         isl_pw_aff **bounds;
 166         isl_set *accessed, *extent;
 167
 168         n_index = isl_set_dim(pa->extent, isl_dim_set);
 169         name = isl_set_get_tuple_name(pa->extent);
 170         bounds = isl_alloc_array(prog->ctx, isl_pw_aff *, n_index);
 171         if (!bounds)
 172                 return -1;
 173
 174         info->space = isl_set_get_space(pa->extent);
 175         info->name = strdup(name);
 176         info->n_index = n_index;
 177         info->bound = bounds;
 178         info->linearize = prog->scop->options->linearize_device_arrays;
 179
 180         info->type = strdup(pa->element_type);
 181         info->size = pa->element_size;
 182         info->local = pa->declared && !pa->exposed;
 183         info->has_compound_element = pa->element_is_record;
 184         info->read_only_scalar = is_read_only_scalar(info, prog);
 185
 186         accessed = isl_union_set_extract_set(arrays,
 187                                             isl_space_copy(info->space));
 188         empty = isl_set_is_empty(accessed);
 189         extent = compute_extent(pa, accessed);
 190         isl_set_free(accessed);
 191         info->extent = extent;
 192         if (empty < 0)
 193                 return -1;
 194         info->accessed = !empty;
 195         for (i = 0; i < n_index; ++i) {
 196                 isl_set *dom;
 197                 isl_local_space *ls;
 198                 isl_aff *one;
 199                 isl_pw_aff *bound;
 200
 201                 dom = isl_set_copy(extent);
 202                 dom = isl_set_project_out(dom, isl_dim_set, i + 1,
 203                                             n_index - (i + 1));
 204                 dom = isl_set_project_out(dom, isl_dim_set, 0, i);
 205                 if (!isl_set_dim_has_upper_bound(dom, isl_dim_set, 0)) {
 206                         fprintf(stderr, "unable to determine extent of '%s' "
 207                                 "in dimension %d\n", info->name, i);
 208                         dom = isl_set_free(dom);
 209                 }
 210                 bound = isl_set_dim_max(dom, 0);
 211                 dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
 212                 ls = isl_local_space_from_space(isl_set_get_space(dom));
 213                 one = isl_aff_zero_on_domain(ls);
 214                 one = isl_aff_add_constant_si(one, 1);
 215                 bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
 216                 bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
 217
 218                 bounds[i] = bound;
 219                 if (!isl_pw_aff_is_cst(bound))
 220                         info->linearize = 1;
 221         }
 222
 223         collect_references(prog, info);
 224
 225         return 0;
 226 }
 227
 228 /* Remove independence from the order constraints "order" on array "array".
 229  * Since the pairs of iterations in the filter relation of an independence
 230  * are guaranteed to be completely independent by the user, there is
 231  * no need to ensure that live ranges are ordered along thong pairs.
 232  * We make an exception for local variables, though, as the independence
 233  * guarantee does not apply to those.
 234  *
 235  * The order constraints are used in two places.
 236  * Those on scalars are used in check_scalar_live_ranges to check if
 237  * we need to force the scalar to be private.  Any non-local scalar
 238  * should not be forced scalar if it only appears in independent loops.
 239  * Those on non-scalars are added to the coincidence constraints
 240  * in compute_schedule because we do not support any array expansion.
 241  * Accesses to non-local arrays should not prevent a loop from being
 242  * considered coincident so we should indeed remove those constraints
 243  * from the order constraints.
 244  */
 245 static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
 246         struct gpu_array_info *array, __isl_take isl_union_map *order)
 247 {
 248         int i;
 249
 250         for (i = 0; i < prog->scop->pet->n_independence; ++i) {
 251                 struct pet_independence *pi = prog->scop->pet->independences[i];
 252                 if (isl_union_set_contains(pi->local, array->space))
 253                         continue;
 254
 255                 order = isl_union_map_subtract(order,
 256                                                 isl_union_map_copy(pi->filter));
 257         }
 258
 259         return order;
 260 }
 261
 262 /* For each array in "prog", store the (untagged) order dependences
 263  * derived from the array in array->dep_order.
 264  * In particular, consider all references that access the given array
 265  * and take the order dependences that have one of these references
 266  * as source.  (Since an order dependence relates two references to
 267  * the same array, the target of these order dependences will also
 268  * be one of these references.)
 269  * Additionally, store the union of these array->dep_order relations
 270  * for all non-scalar arrays in prog->array_order.
 271  */
 272 void collect_order_dependences(struct gpu_prog *prog)
 273 {
 274         int i;
 275         isl_space *space;
 276         isl_union_map *accesses;
 277
 278         space = isl_union_map_get_space(prog->read);
 279         prog->array_order = isl_union_map_empty(space);
 280
 281         accesses = isl_union_map_copy(prog->scop->tagged_reads);
 282         accesses = isl_union_map_union(accesses,
 283                             isl_union_map_copy(prog->scop->tagged_may_writes));
 284         accesses = isl_union_map_universe(accesses);
 285         accesses = isl_union_map_apply_range(accesses,
 286                                             isl_union_map_copy(prog->to_outer));
 287
 288         for (i = 0; i < prog->n_array; ++i) {
 289                 struct gpu_array_info *array = &prog->array[i];
 290                 isl_set *set;
 291                 isl_union_set *uset;
 292                 isl_union_map *order;
 293
 294                 set = isl_set_universe(isl_space_copy(array->space));
 295                 uset = isl_union_set_from_set(set);
 296                 uset = isl_union_map_domain(
 297                     isl_union_map_intersect_range(isl_union_map_copy(accesses),
 298                                                     uset));
 299                 order = isl_union_map_copy(prog->scop->tagged_dep_order);
 300                 order = isl_union_map_intersect_domain(order, uset);
 301                 order = isl_union_map_zip(order);
 302                 order = isl_union_set_unwrap(isl_union_map_domain(order));
 303                 order = remove_independences(prog, array, order);
 304                 array->dep_order = order;
 305
 306                 if (gpu_array_is_scalar(array) && !array->has_compound_element)
 307                         continue;
 308
 309                 prog->array_order = isl_union_map_union(prog->array_order,
 310                                         isl_union_map_copy(array->dep_order));
 311         }
 312
 313         isl_union_map_free(accesses);
 314 }
 315
 316 /* Construct a gpu_array_info for each array referenced by prog->scop and
 317  * collect them in prog->array.
 318  *
 319  * The sizes are based on the extents and the set of possibly accessed
 320  * elements by "prog".
 321  * If there are any member accesses involved, then they are first mapped
 322  * to the outer arrays of structs.
 323  *
 324  * If we are allowing live range reordering, then also set
 325  * the dep_order field.  Otherwise leave it NULL.
 326  */
 327 static int collect_array_info(struct gpu_prog *prog)
 328 {
 329         int i;
 330         int r = 0;
 331         isl_union_set *arrays;
 332
 333         arrays = isl_union_map_range(isl_union_map_copy(prog->read));
 334         arrays = isl_union_set_union(arrays,
 335                     isl_union_map_range(isl_union_map_copy(prog->may_write)));
 336
 337         arrays = isl_union_set_apply(arrays,
 338                                         isl_union_map_copy(prog->to_outer));
 339
 340         arrays = isl_union_set_coalesce(arrays);
 341
 342         prog->n_array = prog->scop->pet->n_array;
 343         prog->array = isl_calloc_array(prog->ctx,
 344                                      struct gpu_array_info, prog->n_array);
 345         assert(prog->array);
 346         for (i = 0; i < prog->scop->pet->n_array; ++i)
 347                 if (extract_array_info(prog, &prog->array[i],
 348                                         prog->scop->pet->arrays[i], arrays) < 0)
 349                         r = -1;
 350
 351         isl_union_set_free(arrays);
 352
 353         if (prog->scop->options->live_range_reordering)
 354                 collect_order_dependences(prog);
 355
 356         return r;
 357 }
 358
 359 static void free_array_info(struct gpu_prog *prog)
 360 {
 361         int i, j;
 362
 363         for (i = 0; i < prog->n_array; ++i) {
 364                 int n_index = prog->array[i].n_index;
 365                 free(prog->array[i].type);
 366                 free(prog->array[i].name);
 367                 for (j = 0; j < n_index; ++j)
 368                         isl_pw_aff_free(prog->array[i].bound[j]);
 369                 isl_space_free(prog->array[i].space);
 370                 isl_set_free(prog->array[i].extent);
 371                 free(prog->array[i].bound);
 372                 free(prog->array[i].refs);
 373                 isl_union_map_free(prog->array[i].dep_order);
 374         }
 375         free(prog->array);
 376 }
 377
 378 /* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 379  * as an array or through a pointer reference, but as a single data element.
 380  * At the moment, scalars are represented as zero-dimensional arrays.
 381  * Note that the single data element may be an entire structure.
 382  */
 383 int gpu_array_is_scalar(struct gpu_array_info *array)
 384 {
 385         return array->n_index == 0;
 386 }
 387
 388 /* Is "array" a read-only scalar?
 389  */
 390 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
 391 {
 392         return array->read_only_scalar;
 393 }
 394
 395 /* Return the set of parameter values for which the array has a positive
 396  * size in all dimensions.
 397  * If the sizes are only valid for some parameter values, then those
 398  * constraints are also taken into account.
 399  */
 400 __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array)
 401 {
 402         int i;
 403         isl_space *space;
 404         isl_set *guard;
 405
 406         space = isl_space_params(isl_space_copy(array->space));
 407         guard = isl_set_universe(space);
 408
 409         for (i = 0; i < array->n_index; ++i) {
 410                 isl_pw_aff *bound;
 411                 isl_set *guard_i, *zero;
 412
 413                 bound = isl_pw_aff_copy(array->bound[i]);
 414                 guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound));
 415                 zero = isl_pw_aff_zero_set(bound);
 416                 guard_i = isl_set_subtract(guard_i, zero);
 417                 guard = isl_set_intersect(guard, guard_i);
 418         }
 419
 420         return guard;
 421 }
 422
 423 /* Internal data structure for extract_size_of_type.
 424  * "type" specifies the name of the space that we want to extract.
 425  * "res" is used to store the subset of that space.
 426  */
 427 struct ppcg_extract_size_data {
 428         const char *type;
 429         isl_set *res;
 430 };
 431
 432 /* This function is called for each set in a union_set.
 433  * If the name of the set matches data->type, we store the
 434  * set in data->res.
 435  */
 436 static int extract_size_of_type(__isl_take isl_set *size, void *user)
 437 {
 438         struct ppcg_extract_size_data *data = user;
 439         const char *name;
 440
 441         name = isl_set_get_tuple_name(size);
 442         if (name && !strcmp(name, data->type)) {
 443                 data->res = size;
 444                 return -1;
 445         }
 446
 447         isl_set_free(size);
 448         return 0;
 449 }
 450
 451 /* Given a union map { kernel[i] -> *[...] },
 452  * return the range in the space called "type" for the kernel with
 453  * sequence number "id".
 454  */
 455 static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
 456         const char *type, int id)
 457 {
 458         isl_space *space;
 459         isl_set *dom;
 460         isl_union_set *local_sizes;
 461         struct ppcg_extract_size_data data = { type, NULL };
 462
 463         if (!sizes)
 464                 return NULL;
 465
 466         space = isl_union_map_get_space(sizes);
 467         space = isl_space_set_from_params(space);
 468         space = isl_space_add_dims(space, isl_dim_set, 1);
 469         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 470         dom = isl_set_universe(space);
 471         dom = isl_set_fix_si(dom, isl_dim_set, 0, id);
 472
 473         local_sizes = isl_union_set_apply(isl_union_set_from_set(dom),
 474                                         isl_union_map_copy(sizes));
 475         isl_union_set_foreach_set(local_sizes, &extract_size_of_type, &data);
 476         isl_union_set_free(local_sizes);
 477         return data.res;
 478 }
 479
 480 /* Given a singleton set, extract the first (at most *len) elements
 481  * of the single integer tuple into *sizes and update *len if needed.
 482  */
 483 static void read_sizes_from_set(__isl_take isl_set *set, int *sizes, int *len)
 484 {
 485         int i;
 486         int dim;
 487
 488         if (!set)
 489                 return;
 490
 491         dim = isl_set_dim(set, isl_dim_set);
 492         if (dim < *len)
 493                 *len = dim;
 494
 495         for (i = 0; i < *len; ++i) {
 496                 isl_val *v;
 497
 498                 v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
 499                 assert(v);
 500
 501                 sizes[i] = isl_val_get_num_si(v);
 502                 isl_val_free(v);
 503         }
 504
 505         isl_set_free(set);
 506 }
 507
 508 /* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes,
 509  * if the option debug->dump_sizes is set.
 510  */
 511 static void set_used_sizes(struct gpu_gen *gen, const char *type, int id,
 512         int *sizes, int len)
 513 {
 514         int i;
 515         isl_space *space;
 516         isl_map *map;
 517
 518         if (!gen->options->debug->dump_sizes)
 519                 return;
 520
 521         space = isl_union_map_get_space(gen->used_sizes);
 522         space = isl_space_set_from_params(space);
 523         space = isl_space_add_dims(space, isl_dim_set, 1);
 524         space = isl_space_set_tuple_name(space, isl_dim_set, "kernel");
 525         space = isl_space_from_domain(space);
 526         space = isl_space_add_dims(space, isl_dim_out, len);
 527         space = isl_space_set_tuple_name(space, isl_dim_out, type);
 528
 529         map = isl_map_universe(space);
 530         map = isl_map_fix_si(map, isl_dim_in, 0, id);
 531         for (i = 0; i < len; ++i)
 532                 map = isl_map_fix_si(map, isl_dim_out, i, sizes[i]);
 533
 534         gen->used_sizes = isl_union_map_add_map(gen->used_sizes, map);
 535 }
 536
 537 /* Extract user specified "tile" sizes from the "sizes" command line option,
 538  * defaulting to option->tile_size in each dimension.
 539  * Add the effectively used sizes to gen->used_sizes.
 540  */
 541 static void read_tile_sizes(struct gpu_gen *gen)
 542 {
 543         int n;
 544         isl_set *size;
 545         struct ppcg_kernel *kernel = gen->kernel;
 546
 547         kernel->tile_size = isl_alloc_array(gen->ctx, int, kernel->tile_len);
 548         assert(kernel->tile_size);
 549         for (n = 0; n < kernel->tile_len; ++n)
 550                 kernel->tile_size[n] = kernel->options->tile_size;
 551
 552         size = extract_sizes(gen->sizes, "tile", kernel->id);
 553         read_sizes_from_set(size, kernel->tile_size, &kernel->tile_len);
 554         set_used_sizes(gen, "tile", kernel->id,
 555                         kernel->tile_size, kernel->tile_len);
 556
 557         if (kernel->n_parallel > kernel->tile_len)
 558                 kernel->n_parallel = kernel->tile_len;
 559 }
 560
 561 /* Extract user specified "block" sizes from the "sizes" command line option,
 562  * after filling in some potentially useful defaults.
 563  */
 564 static void read_block_sizes(struct ppcg_kernel *kernel,
 565         __isl_keep isl_union_map *sizes)
 566 {
 567         isl_set *size;
 568
 569         if (kernel->n_block > 3)
 570                 kernel->n_block = 3;
 571         switch (kernel->n_block) {
 572         case 1:
 573                 kernel->block_dim[0] = 512;
 574                 break;
 575         case 2:
 576                 kernel->block_dim[0] = 32;
 577                 kernel->block_dim[1] = 16;
 578                 break;
 579         default:
 580                 kernel->block_dim[0] = 32;
 581                 kernel->block_dim[1] = 4;
 582                 kernel->block_dim[2] = 4;
 583                 break;
 584         }
 585
 586         size = extract_sizes(sizes, "block", kernel->id);
 587         read_sizes_from_set(size, kernel->block_dim, &kernel->n_block);
 588 }
 589
 590 /* Extract user specified "grid" sizes from the "sizes" command line option,
 591  * after filling in some potentially useful defaults.
 592  */
 593 static void read_grid_sizes(struct ppcg_kernel *kernel,
 594         __isl_keep isl_union_map *sizes)
 595 {
 596         isl_set *size;
 597
 598         if (kernel->n_grid > 2)
 599                 kernel->n_grid = 2;
 600         switch (kernel->n_grid) {
 601         case 1:
 602                 kernel->grid_dim[0] = 32768;
 603                 break;
 604         default:
 605                 kernel->grid_dim[0] = 256;
 606                 kernel->grid_dim[1] = 256;
 607                 break;
 608         }
 609
 610         size = extract_sizes(sizes, "grid", kernel->id);
 611         read_sizes_from_set(size, kernel->grid_dim, &kernel->n_grid);
 612 }
 613
 614 /* Extract user specified sizes from the "sizes" command line option
 615  * after filling in some potentially useful defaults.
 616  * Add the effectively used sizes to gen->used_sizes.
 617  */
 618 static void read_sizes(struct gpu_gen *gen)
 619 {
 620         struct ppcg_kernel *kernel = gen->kernel;
 621
 622         read_tile_sizes(gen);
 623         read_block_sizes(kernel, gen->sizes);
 624         read_grid_sizes(kernel, gen->sizes);
 625         set_used_sizes(gen, "block", kernel->id,
 626                                             kernel->block_dim, kernel->n_block);
 627         set_used_sizes(gen, "grid", kernel->id,
 628                                             kernel->grid_dim, kernel->n_grid);
 629 }
 630
 631 static void *free_stmts(struct gpu_stmt *stmts, int n)
 632 {
 633         int i;
 634
 635         if (!stmts)
 636                 return NULL;
 637
 638         for (i = 0; i < n; ++i) {
 639                 struct gpu_stmt_access *access, *next;
 640
 641                 for (access = stmts[i].accesses; access; access = next) {
 642                         next = access->next;
 643                         isl_id_free(access->ref_id);
 644                         isl_map_free(access->access);
 645                         isl_map_free(access->tagged_access);
 646                         free(access);
 647                 }
 648
 649                 isl_id_free(stmts[i].id);
 650         }
 651         free(stmts);
 652
 653         return NULL;
 654 }
 655
 656 /* Construct a map from a domain of dimensionality "len"
 657  * to a domain of dimensionality "len" + "tile_len" that tiles
 658  * the "tile_len" coordinates starting at "first".
 659  * In particular, [s_i] -> [s_i / tile_size[i], s_i % tile_size[i]].
 660  * "dim" prescribes the parameters.
 661  */
 662 static __isl_give isl_map *tile(__isl_take isl_space *dim, int len,
 663         int first, int tile_len, int *tile_size)
 664 {
 665         int i;
 666         isl_basic_map *bmap;
 667         isl_constraint *c;
 668         isl_local_space *ls;
 669
 670         dim = isl_space_add_dims(dim, isl_dim_in, len);
 671         dim = isl_space_add_dims(dim, isl_dim_out, len + tile_len);
 672         bmap = isl_basic_map_universe(isl_space_copy(dim));
 673         ls = isl_local_space_from_space(dim);
 674
 675         for (i = 0; i < len - tile_len; ++i) {
 676                 int j = i < first ? i : i + tile_len;
 677                 int k = i < first ? i : i + 2 * tile_len;
 678
 679                 c = isl_equality_alloc(isl_local_space_copy(ls));
 680                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, j, -1);
 681                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 682                 bmap = isl_basic_map_add_constraint(bmap, c);
 683         }
 684
 685         for (i = 0; i < tile_len; ++i) {
 686                 c = isl_equality_alloc(isl_local_space_copy(ls));
 687                 c = isl_constraint_set_coefficient_si(c, isl_dim_in,
 688                                                 first + i, -1);
 689                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 690                                                 first + i, tile_size[i]);
 691                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 692                                                 first + i + tile_len, 1);
 693                 bmap = isl_basic_map_add_constraint(bmap, c);
 694
 695                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 696                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 697                                                    first + i + tile_len, 1);
 698                 bmap = isl_basic_map_add_constraint(bmap, c);
 699
 700                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 701                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 702                                                    first + i + tile_len, -1);
 703                 c = isl_constraint_set_constant_si(c, tile_size[i] - 1);
 704                 bmap = isl_basic_map_add_constraint(bmap, c);
 705         }
 706
 707         isl_local_space_free(ls);
 708
 709         return isl_map_from_basic_map(bmap);
 710 }
 711
 712 /* Construct a map from a domain of dimensionality "len"
 713  * to a domain of dimensionality "len" + "wrap_len" that "wraps"
 714  * the "wrap_len" coordinates starting at "first" according to "wrap_size".
 715  * In particular, [s_i] -> [s_i, s_i % wrap_size[i]].
 716  * To do so, we need extra variables corresponding to [s_i / wrap_size[i]],
 717  * that are projected out at the end.
 718  * "dim" prescribes the parameters.
 719  */
 720 static __isl_give isl_map *wrap(__isl_take isl_space *dim, int len,
 721         int first, int wrap_len, int *wrap_size)
 722 {
 723         int i;
 724         isl_basic_map *bmap;
 725         isl_constraint *c;
 726         isl_local_space *ls;
 727
 728         dim = isl_space_add_dims(dim, isl_dim_in, len);
 729         dim = isl_space_add_dims(dim, isl_dim_out, len + 2 * wrap_len);
 730         bmap = isl_basic_map_universe(isl_space_copy(dim));
 731         ls = isl_local_space_from_space(dim);
 732
 733         for (i = 0; i < len; ++i) {
 734                 int k = i < first + wrap_len ? i : i + 2 * wrap_len;
 735
 736                 c = isl_equality_alloc(isl_local_space_copy(ls));
 737                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, -1);
 738                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, k, 1);
 739                 bmap = isl_basic_map_add_constraint(bmap, c);
 740         }
 741
 742         for (i = 0; i < wrap_len; ++i) {
 743                 c = isl_equality_alloc(isl_local_space_copy(ls));
 744                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 745                                                     first + i, -1);
 746                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 747                                                     first + wrap_len + i, 1);
 748                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 749                                     first + 2 * wrap_len + i, wrap_size[i]);
 750                 bmap = isl_basic_map_add_constraint(bmap, c);
 751
 752                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 753                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 754                                                     first + wrap_len + i, 1);
 755                 bmap = isl_basic_map_add_constraint(bmap, c);
 756
 757                 c = isl_inequality_alloc(isl_local_space_copy(ls));
 758                 c = isl_constraint_set_coefficient_si(c, isl_dim_out,
 759                                                     first + wrap_len + i, -1);
 760                 c = isl_constraint_set_constant_si(c, wrap_size[i] - 1);
 761                 bmap = isl_basic_map_add_constraint(bmap, c);
 762         }
 763
 764         isl_local_space_free(ls);
 765
 766         bmap = isl_basic_map_project_out(bmap, isl_dim_out,
 767                                 first + 2 * wrap_len, wrap_len);
 768
 769         return isl_map_from_basic_map(bmap);
 770 }
 771
 772 /* Tile the B loops over the tile sizes and then tile/wrap
 773  * the T1 loops over the blocks.
 774  */
 775 static __isl_give isl_union_map *tile_schedule(struct gpu_gen *gen,
 776         __isl_take isl_union_map *sched)
 777 {
 778         struct ppcg_kernel *kernel = gen->kernel;
 779         isl_space *dim;
 780         isl_map *tiling, *block_tiling;
 781
 782         dim = isl_union_map_get_space(sched);
 783         tiling = tile(isl_space_copy(dim), gen->untiled_len,
 784                       gen->tile_first, kernel->tile_len, kernel->tile_size);
 785
 786         if (gen->options->wrap)
 787                 block_tiling = wrap(dim, gen->untiled_len + kernel->tile_len,
 788                             gen->tile_first, kernel->n_grid, kernel->grid_dim);
 789         else
 790                 block_tiling = tile(dim, gen->untiled_len + kernel->tile_len,
 791                             gen->tile_first, kernel->n_grid, kernel->grid_dim);
 792
 793         gen->tiled_len = gen->untiled_len + kernel->tile_len + kernel->n_grid;
 794
 795         tiling = isl_map_apply_range(tiling, block_tiling);
 796
 797         sched = isl_union_map_apply_range(sched,
 798                                              isl_union_map_from_map(tiling));
 799
 800         gen->shared_len = gen->tile_first + kernel->tile_len + kernel->n_grid;
 801
 802         return sched;
 803 }
 804
 805 /* Equate the "T1P" iterators in the tiled schedule "sched"
 806  * to the block dimensions.
 807  */
 808 static __isl_give isl_union_map *parametrize_tiled_schedule(
 809         struct gpu_gen *gen, __isl_take isl_union_map *sched)
 810 {
 811         struct ppcg_kernel *kernel = gen->kernel;
 812         isl_space *dim;
 813         isl_set *par;
 814
 815         dim = isl_union_map_get_space(sched);
 816         par = parametrization(dim, gen->tiled_len,
 817                 gen->tile_first + kernel->n_grid, kernel->block_ids);
 818         sched = isl_union_map_intersect_range(sched,
 819                                                 isl_union_set_from_set(par));
 820
 821         return sched;
 822 }
 823
 824 /* Tile/wrap the P1 loops over the threads.
 825  */
 826 static __isl_give isl_union_map *thread_tile_schedule(struct gpu_gen *gen,
 827         __isl_take isl_union_map *sched)
 828 {
 829         struct ppcg_kernel *kernel = gen->kernel;
 830         isl_space *dim;
 831         isl_map *tiling;
 832         isl_set *par;
 833
 834         dim = isl_union_map_get_space(sched);
 835
 836         if (gen->options->wrap)
 837                 tiling = wrap(isl_space_copy(dim), gen->tiled_len,
 838                         gen->shared_len, kernel->n_block, kernel->block_dim);
 839         else
 840                 tiling = tile(isl_space_copy(dim), gen->tiled_len,
 841                         gen->shared_len, kernel->n_block, kernel->block_dim);
 842         gen->thread_tiled_len = gen->tiled_len + kernel->n_block;
 843
 844         sched = isl_union_map_apply_range(sched,
 845                                              isl_union_map_from_map(tiling));
 846
 847         par = parametrization(dim, gen->thread_tiled_len,
 848                 gen->tile_first + kernel->tile_len +
 849                 kernel->n_grid + kernel->n_block, kernel->thread_ids);
 850         sched = isl_union_map_intersect_range(sched,
 851                                                 isl_union_set_from_set(par));
 852
 853         gen->shared_len = gen->tile_first + kernel->tile_len + kernel->n_grid;
 854
 855         return sched;
 856 }
 857
 858 /* If the user asked for it, scale the shared memory tile loops
 859  * (T1T and T2) of "sched" by kernel->tile_size[i].
 860  * If we are not performing "wrapping", then additionally scale the T1P
 861  * loops by kernel->grid_dim[i].
 862  */
 863 static __isl_give isl_union_map *scale_tile_loops(struct gpu_gen *gen,
 864         __isl_take isl_union_map *sched)
 865 {
 866         struct ppcg_kernel *kernel = gen->kernel;
 867         int i;
 868         isl_space *dim;
 869         isl_basic_map *scale;
 870         isl_constraint *c;
 871         isl_local_space *ls;
 872
 873         if (!gen->options->scale_tile_loops)
 874                 return sched;
 875
 876         dim = isl_union_map_get_space(sched);
 877         dim = isl_space_add_dims(dim, isl_dim_in, gen->tiled_len);
 878         dim = isl_space_add_dims(dim, isl_dim_out, gen->tiled_len);
 879         scale = isl_basic_map_universe(isl_space_copy(dim));
 880         ls = isl_local_space_from_space(dim);
 881
 882         for (i = 0; i < gen->tiled_len; ++i) {
 883                 int f = 1;
 884
 885                 if (i >= gen->tile_first &&
 886                     i < gen->tile_first + kernel->n_grid) {
 887                         f = kernel->tile_size[i - gen->tile_first];
 888                         if (!gen->options->wrap)
 889                                 f *= kernel->grid_dim[i - gen->tile_first];
 890                 } else if (i >= gen->tile_first + kernel->n_grid &&
 891                            i < gen->tile_first + kernel->n_grid +
 892                                 kernel->tile_len) {
 893                         f = kernel->tile_size[i -
 894                                             (gen->tile_first + kernel->n_grid)];
 895                 }
 896
 897                 c = isl_equality_alloc(isl_local_space_copy(ls));
 898                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
 899                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
 900                 scale = isl_basic_map_add_constraint(scale, c);
 901         }
 902
 903         isl_local_space_free(ls);
 904
 905         sched = isl_union_map_apply_range(sched,
 906                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
 907
 908         return sched;
 909 }
 910
 911 /* If we are not performing "wrapping" and if the user asked for it,
 912  * scale the thread tile loops (P1T) of "sched" by kernel->block_dim[i].
 913  */
 914 static __isl_give isl_union_map *scale_thread_tile_loops(struct gpu_gen *gen,
 915         __isl_take isl_union_map *sched)
 916 {
 917         int i;
 918         isl_space *dim;
 919         isl_basic_map *scale;
 920         isl_constraint *c;
 921         isl_local_space *ls;
 922
 923         if (gen->options->wrap)
 924                 return sched;
 925         if (!gen->options->scale_tile_loops)
 926                 return sched;
 927
 928         dim = isl_union_map_get_space(sched);
 929         dim = isl_space_add_dims(dim, isl_dim_in, gen->thread_tiled_len);
 930         dim = isl_space_add_dims(dim, isl_dim_out, gen->thread_tiled_len);
 931         scale = isl_basic_map_universe(isl_space_copy(dim));
 932         ls = isl_local_space_from_space(dim);
 933
 934         for (i = 0; i < gen->thread_tiled_len; ++i) {
 935                 int f = 1;
 936
 937                 if (i >= gen->shared_len &&
 938                     i < gen->shared_len + gen->kernel->n_block)
 939                         f = gen->kernel->block_dim[i - gen->shared_len];
 940
 941                 c = isl_equality_alloc(isl_local_space_copy(ls));
 942                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
 943                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
 944                 scale = isl_basic_map_add_constraint(scale, c);
 945         }
 946
 947         isl_local_space_free(ls);
 948
 949         sched = isl_union_map_apply_range(sched,
 950                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
 951
 952         return sched;
 953 }
 954
 955 /* If we are not performing "wrapping" and if the user asked for it,
 956  * scale the "n_tile" loops starting at "first" of "sched" by gen->block_dim[i].
 957  */
 958 static __isl_give isl_union_map *scale_access_tile_loops(struct gpu_gen *gen,
 959         __isl_take isl_union_map *sched, int len, int first, int n_tile)
 960 {
 961         int i;
 962         isl_space *dim;
 963         isl_basic_map *scale;
 964         isl_constraint *c;
 965         isl_local_space *ls;
 966
 967         if (gen->options->wrap)
 968                 return sched;
 969         if (!gen->options->scale_tile_loops)
 970                 return sched;
 971
 972         dim = isl_union_map_get_space(sched);
 973         dim = isl_space_add_dims(dim, isl_dim_in, len);
 974         dim = isl_space_add_dims(dim, isl_dim_out, len);
 975         scale = isl_basic_map_universe(isl_space_copy(dim));
 976         ls = isl_local_space_from_space(dim);
 977
 978         for (i = 0; i < len; ++i) {
 979                 int f = 1;
 980
 981                 if (i >= first && i < first + n_tile)
 982                         f = gen->kernel->block_dim[i - first];
 983
 984                 c = isl_equality_alloc(isl_local_space_copy(ls));
 985                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i, f);
 986                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, i, -1);
 987                 scale = isl_basic_map_add_constraint(scale, c);
 988         }
 989
 990         isl_local_space_free(ls);
 991
 992         sched = isl_union_map_apply_range(sched,
 993                 isl_union_map_from_map(isl_map_from_basic_map(scale)));
 994
 995         return sched;
 996 }
 997
 998 /* Add parameters p[i] with identifiers "ids" to "set",
 999  * with bounds to 0 <= p[i] < size[i].
1000  */
1001 __isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
1002         int *size, __isl_keep isl_id_list *ids)
1003 {
1004         int i, len;
1005         unsigned nparam;
1006
1007         len = isl_id_list_n_id(ids);
1008         nparam = isl_set_dim(set, isl_dim_param);
1009         set = isl_set_add_dims(set, isl_dim_param, len);
1010
1011         for (i = 0; i < len; ++i) {
1012                 isl_id *id;
1013
1014                 id = isl_id_list_get_id(ids, i);
1015                 set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
1016                 set = isl_set_lower_bound_si(set, isl_dim_param, nparam + i, 0);
1017                 set = isl_set_upper_bound_si(set, isl_dim_param,
1018                                             nparam + i, size[i] - 1);
1019         }
1020
1021         return set;
1022 }
1023
1024 /* Add "len" parameters p[i] with identifiers "ids" and intersect "set"
1025  * with
1026  *
1027  *      { : 0 <= p[i] < size[i] }
1028  *
1029  * or an overapproximation.
1030  */
1031 static __isl_give isl_set *add_bounded_parameters_dynamic(
1032         __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
1033         __isl_keep isl_id_list *ids)
1034 {
1035         int i, len;
1036         unsigned nparam;
1037         isl_space *space;
1038         isl_local_space *ls;
1039
1040         len = isl_multi_pw_aff_dim(size, isl_dim_out);
1041         nparam = isl_set_dim(set, isl_dim_param);
1042         set = isl_set_add_dims(set, isl_dim_param, len);
1043
1044         for (i = 0; i < len; ++i) {
1045                 isl_id *id;
1046
1047                 id = isl_id_list_get_id(ids, i);
1048                 set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
1049         }
1050
1051         space = isl_space_params(isl_set_get_space(set));
1052         ls = isl_local_space_from_space(space);
1053         for (i = 0; i < len; ++i) {
1054                 isl_pw_aff *param, *size_i, *zero;
1055                 isl_set *bound;
1056
1057                 param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
1058                                                 isl_dim_param, nparam + i);
1059
1060                 size_i = isl_multi_pw_aff_get_pw_aff(size, i);
1061                 bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
1062                 bound = isl_set_from_basic_set(isl_set_simple_hull(bound));
1063                 set = isl_set_intersect_params(set, bound);
1064
1065                 zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
1066                 bound = isl_pw_aff_ge_set(param, zero);
1067                 set = isl_set_intersect_params(set, bound);
1068         }
1069         isl_local_space_free(ls);
1070
1071         return set;
1072 }
1073
1074 /* Construct a map from an access to group->array to the corresponding
1075  * shared/private memory tile.
1076  * The map is of the form
1077  *
1078  *      { [D[i] -> A[a]] -> T[t] }
1079  *
1080  * where D represents the initial shared_len dimensions
1081  * of the computed schedule.
1082  */
1083 static __isl_give isl_map *shift_access(struct gpu_array_ref_group *group)
1084 {
1085         struct gpu_array_tile *tile;
1086         isl_multi_aff *tiling;
1087
1088         tile = group->private_tile;
1089         if (!tile)
1090                 tile = group->shared_tile;
1091
1092         tiling = isl_multi_aff_copy(tile->tiling);
1093
1094         return isl_map_from_multi_aff(tiling);
1095 }
1096
1097 /* Given a schedule that iterates over all elements in a piece of an array,
1098  * perform tiling/wrapping over the threads.
1099  *
1100  * In particular, we tile the final iterators so that the final thread
1101  * dimension runs over the final array dimension.
1102  * However, if those final iterators have only a single iteration,
1103  * we try to tile earlier iterators instead.
1104  */
1105 static __isl_give isl_map *tile_access_schedule(struct gpu_gen *gen,
1106         __isl_take isl_map *sched)
1107 {
1108         isl_space *dim;
1109         isl_union_map *usched;
1110         isl_map *tiling;
1111         isl_set *par;
1112         unsigned nvar = isl_map_dim(sched, isl_dim_out);
1113         int n_tile;
1114         int first;
1115
1116         n_tile = gen->kernel->n_block;
1117         if (n_tile > nvar) {
1118                 int i;
1119                 sched = isl_map_insert_dims(sched,
1120                                                 isl_dim_out, 0, n_tile - nvar);
1121                 for (i = 0; i < n_tile - nvar; ++i)
1122                         sched = isl_map_fix_si(sched, isl_dim_out, i, 0);
1123                 nvar = n_tile;
1124         }
1125
1126         first = nvar - n_tile;
1127
1128         for (; first > 0; first --)
1129                 if (!map_plain_is_fixed(sched, isl_dim_out, first + n_tile - 1))
1130                         break;
1131
1132         dim = isl_map_get_space(sched);
1133         dim = isl_space_params(dim);
1134         if (gen->options->wrap)
1135                 tiling = wrap(isl_space_copy(dim), nvar, first,
1136                                 n_tile, gen->kernel->block_dim);
1137         else
1138                 tiling = tile(isl_space_copy(dim), nvar, first,
1139                                 n_tile, gen->kernel->block_dim);
1140         sched = isl_map_apply_range(sched, tiling);
1141
1142         par = parametrization(dim, nvar + n_tile, first + n_tile,
1143                                 gen->kernel->thread_ids);
1144         sched = isl_map_intersect_range(sched, par);
1145
1146         usched = isl_union_map_from_map(sched);
1147         usched = scale_access_tile_loops(gen, usched, nvar + n_tile,
1148                                          first, n_tile);
1149         sched = isl_map_from_union_map(usched);
1150
1151         return sched;
1152 }
1153
1154 /* Return the union of all tagged access relations in the group.
1155  */
1156 static __isl_give isl_union_map *group_tagged_access_relation(
1157         struct gpu_array_ref_group *group)
1158 {
1159         int i;
1160         isl_union_map *access;
1161
1162         access = isl_union_map_empty(isl_map_get_space(group->access));
1163         for (i = 0; i < group->n_ref; ++i) {
1164                 isl_map *map_i;
1165
1166                 map_i = isl_map_copy(group->refs[i]->tagged_access);
1167                 access = isl_union_map_union(access,
1168                                             isl_union_map_from_map(map_i));
1169         }
1170
1171         return access;
1172 }
1173
1174 /* Return the extent of "array", recomputed from the bounds.
1175  * The recomputed extent may be simpler than the original extent.
1176  */
1177 static __isl_give isl_set *array_extent(struct gpu_array_info *array)
1178 {
1179         int i;
1180         isl_id *id;
1181         isl_space *space;
1182         isl_local_space *ls;
1183         isl_set *extent;
1184
1185         id = isl_set_get_tuple_id(array->extent);
1186         space = isl_set_get_space(array->extent);
1187         extent = isl_set_universe(isl_space_copy(space));
1188         ls = isl_local_space_from_space(space);
1189         for (i = 0; i < array->n_index; ++i) {
1190                 isl_pw_aff *bound;
1191                 isl_aff *aff;
1192                 isl_pw_aff *index;
1193                 isl_set *lt;
1194
1195                 extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);
1196
1197                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1198                                                 isl_dim_set, i);
1199                 index = isl_pw_aff_from_aff(aff);
1200                 bound = isl_pw_aff_copy(array->bound[i]);
1201                 bound = isl_pw_aff_from_range(bound);
1202                 bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
1203                 bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
1204                                                 isl_id_copy(id));
1205                 lt = isl_pw_aff_lt_set(index, bound);
1206                 extent = isl_set_intersect(extent, lt);
1207         }
1208         isl_local_space_free(ls);
1209         isl_id_free(id);
1210
1211         return extent;
1212 }
1213
1214 /* Return a map from the first shared_len dimensions of the computed
1215  * schedule to the array tile in
1216  * global memory that corresponds to the shared memory copy.
1217  *
1218  * In particular, return a map
1219  *
1220  *      { D[i] -> A[a] }
1221  *
1222  * with constraints
1223  *
1224  *      tile_offset(i) <= a <= tile_offset(i) + tile_size - 1           (1)
1225  *
1226  * and
1227  *
1228  *      0 <= a <= array_size - 1                                        (2)
1229  *
1230  * Note that if some stride has been detected (i.e., when
1231  * group->shared_tile->bound[i].shift is set), then a in (1) refers
1232  * to the shifted and scaled down version.
1233  *
1234  * Constraints (1) are obtained by mapping the size constraints on the
1235  * shared/private memory tile back to the access relation.
1236  * Constraints (2) are obtained from the (recomputed) extent.
1237  */
1238 static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
1239 {
1240         int i;
1241         int n_index = group->array->n_index;
1242         isl_map *tile;
1243         isl_space *space;
1244         isl_set *local;
1245         isl_set *extent;
1246
1247         space = isl_multi_aff_get_space(group->shared_tile->tiling);
1248         space = isl_space_range(space);
1249         local = isl_set_universe(space);
1250         for (i = 0; i < n_index; ++i) {
1251                 isl_val *bound;
1252
1253                 local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
1254                 bound = isl_val_copy(group->shared_tile->bound[i].size);
1255                 bound = isl_val_sub_ui(bound, 1);
1256                 local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
1257         }
1258         local = isl_set_preimage_multi_aff(local,
1259                                 isl_multi_aff_copy(group->shared_tile->tiling));
1260         tile = isl_set_unwrap(local);
1261         extent = array_extent(group->array);
1262         tile = isl_map_intersect_range(tile, extent);
1263
1264         return tile;
1265 }
1266
1267 /* Given a mapping "iterator_map" from the AST schedule to a domain,
1268  * return the corresponding mapping from the AST schedule to
1269  * to the first shared_len dimensions of the schedule computed by PPCG.
1270  */
1271 static __isl_give isl_pw_multi_aff *compute_sched_to_shared(struct gpu_gen *gen,
1272         __isl_take isl_pw_multi_aff *iterator_map)
1273 {
1274         isl_union_map *umap;
1275         isl_space *space;
1276         isl_map *map, *sched;;
1277
1278         space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
1279         space = isl_space_from_domain(space);
1280         space = isl_space_add_dims(space, isl_dim_out, gen->shared_len);
1281
1282         umap = isl_union_map_copy(gen->shared_sched);
1283         umap = isl_union_map_apply_range(umap,
1284                         isl_union_map_copy(gen->shared_proj));
1285         map = isl_union_map_extract_map(umap, space);
1286         isl_union_map_free(umap);
1287
1288         sched = isl_map_preimage_domain_pw_multi_aff(map, iterator_map);
1289         sched = isl_map_detect_equalities(sched);
1290
1291         return isl_pw_multi_aff_from_map(sched);
1292 }
1293
1294 /* Set unroll[j] if the input dimension j is involved in
1295  * the index expression represented by ma.
1296  */
1297 static int check_unroll(__isl_take isl_set *set, __isl_take isl_multi_aff *ma,
1298         void *user)
1299 {
1300         int i, j;
1301         int n_in = isl_multi_aff_dim(ma, isl_dim_in);
1302         int n_out = isl_multi_aff_dim(ma, isl_dim_out);
1303         int *unroll = user;
1304
1305         for (i = 0; i < n_out; ++i) {
1306                 isl_aff *aff;
1307
1308                 aff = isl_multi_aff_get_aff(ma, i);
1309                 for (j = 0; j < n_in; ++j)
1310                         if (isl_aff_involves_dims(aff, isl_dim_in, j, 1))
1311                                 unroll[j] = 1;
1312                 isl_aff_free(aff);
1313         }
1314
1315         isl_set_free(set);
1316         isl_multi_aff_free(ma);
1317         return 0;
1318 }
1319
1320 /* Given an array pos mapping input dimensions to the corresponding
1321  * output dimension, construct the corresponding map.
1322  */
1323 static __isl_give isl_map *permutation(__isl_take isl_space *dim,
1324         int *pos, int len)
1325 {
1326         int i;
1327         isl_constraint *c;
1328         isl_basic_map *bmap;
1329         isl_local_space *ls;
1330
1331         dim = isl_space_add_dims(dim, isl_dim_in, len);
1332         dim = isl_space_add_dims(dim, isl_dim_out, len);
1333         bmap = isl_basic_map_universe(isl_space_copy(dim));
1334         ls = isl_local_space_from_space(dim);
1335
1336         for (i = 0; i < len; ++i) {
1337                 c = isl_equality_alloc(isl_local_space_copy(ls));
1338                 c = isl_constraint_set_coefficient_si(c, isl_dim_in, i,
1339                                                       -1);
1340                 c = isl_constraint_set_coefficient_si(c, isl_dim_out, pos[i],
1341                                                       1);
1342                 bmap = isl_basic_map_add_constraint(bmap, c);
1343         }
1344         isl_local_space_free(ls);
1345
1346         return isl_map_from_basic_map(bmap);
1347 }
1348
1349 /* Remove the private tiles from all array reference groups,
1350  * except for the groups of arrays that are marked force_private.
1351  */
1352 static void remove_private_tiles(struct gpu_gen *gen)
1353 {
1354         int i, j;
1355
1356         for (i = 0; i < gen->kernel->n_array; ++i) {
1357                 struct gpu_local_array_info *local = &gen->kernel->array[i];
1358
1359                 if (local->force_private)
1360                         continue;
1361
1362                 for (j = 0; j < local->n_group; ++j) {
1363                         struct gpu_array_ref_group *group = local->groups[j];
1364
1365                         group->private_tile =
1366                                     gpu_array_tile_free(group->private_tile);
1367                 }
1368         }
1369 }
1370
1371 /* Find all loops involved in any of the index expressions for any of
1372  * the private accesses, move them innermost and then mark them as
1373  * requiring unrolling by setting gen->first_unroll.
1374  * The loops involved should all be parallel because of the checks
1375  * we performed in check_private_group_access.  Moving them innermost
1376  * is therefore a valid transformation.
1377  *
1378  * If any of the arrays are marked force_private, however, then
1379  * those loops may not be parallel with respect to the marked arrays.
1380  * If any of the loops would have to be moved innermost for the
1381  * (non forced) private accesses and if there are any force_private
1382  * arrays, then we revert the decision to map the selected arrays
1383  * to private memory.  An alternative solution would be to expand
1384  * the force_private arrays.
1385  *
1386  * Loops up to gen->shared_len are generated before the mapping to
1387  * threads is applied.  They should therefore be ignored.
1388  *
1389  * We compute the hidden equalities of the schedule first
1390  * since we will need them in our calls to isl_pw_multi_aff_from_map
1391  * and because we want to make sure that the same equalities
1392  * are also available to the code generator.
1393  */
1394 static __isl_give isl_union_map *interchange_for_unroll(struct gpu_gen *gen,
1395         __isl_take isl_union_map *sched)
1396 {
1397         struct ppcg_kernel *kernel = gen->kernel;
1398         int i, j;
1399         int unroll[gen->thread_tiled_len];
1400         int perm[gen->thread_tiled_len];
1401         isl_space *dim;
1402         isl_map *permute;
1403         int len = gen->shared_len + kernel->n_parallel + kernel->n_block;
1404
1405         gen->first_unroll = -1;
1406
1407         sched = isl_union_map_detect_equalities(sched);
1408         for (i = 0; i < gen->thread_tiled_len; ++i)
1409                 unroll[i] = 0;
1410         for (i = 0; i < kernel->n_array; ++i) {
1411                 struct gpu_local_array_info *array = &kernel->array[i];
1412
1413                 for (j = 0; j < array->n_group; ++j) {
1414                         isl_union_map *access;
1415                         isl_map *acc;
1416                         isl_pw_multi_aff *pma;
1417
1418                         if (!array->groups[j]->private_tile)
1419                                 continue;
1420
1421                         access = gpu_array_ref_group_access_relation(
1422                                                         array->groups[j], 1, 1);
1423                         access = isl_union_map_apply_domain(access,
1424                                                 isl_union_map_copy(sched));
1425
1426                         acc = isl_map_from_union_map(access);
1427                         pma = isl_pw_multi_aff_from_map(acc);
1428                         isl_pw_multi_aff_foreach_piece(pma,
1429                                                         &check_unroll, unroll);
1430
1431                         isl_pw_multi_aff_free(pma);
1432                 }
1433         }
1434
1435         for (i = gen->shared_len; i < len; ++i)
1436                 if (unroll[i])
1437                         break;
1438
1439         if (i >= len)
1440                 return sched;
1441
1442         for (i = len; i < gen->thread_tiled_len; ++i)
1443                 if (unroll[i])
1444                         return sched;
1445
1446         if (kernel->any_force_private) {
1447                 remove_private_tiles(gen);
1448                 return sched;
1449         }
1450
1451         j = 0;
1452         for (i = 0; i < gen->shared_len; ++i)
1453                 perm[i] = j++;
1454         for (i = gen->shared_len; i < gen->thread_tiled_len; ++i)
1455                 if (!unroll[i])
1456                         perm[i] = j++;
1457         gen->first_unroll = j - gen->shared_len;
1458         for (i = gen->shared_len; i < len; ++i)
1459                 if (unroll[i])
1460                         perm[i] = j++;
1461
1462         dim = isl_union_map_get_space(sched);
1463         permute = permutation(dim, perm, gen->thread_tiled_len);
1464         sched = isl_union_map_apply_range(sched,
1465                                           isl_union_map_from_map(permute));
1466
1467         return sched;
1468 }
1469
1470 /* Construct a map with input the shared tile loops and the loops that
1471  * will be wrapped around the threads that relates these later loops
1472  * to the thread indices and then projects them out.
1473  */
1474 static __isl_give isl_map *compute_privatization(struct gpu_gen *gen)
1475 {
1476         struct ppcg_kernel *kernel = gen->kernel;
1477         isl_map *priv;
1478         isl_map *tiling;
1479         isl_map *proj;
1480         isl_set *par;
1481         isl_space *dim;
1482
1483         dim = isl_union_map_get_space(gen->shared_sched);
1484
1485         if (gen->options->wrap)
1486                 tiling = wrap(isl_space_copy(dim),
1487                         gen->shared_len + kernel->n_block,
1488                         gen->shared_len, kernel->n_block, kernel->block_dim);
1489         else
1490                 tiling = tile(isl_space_copy(dim),
1491                         gen->shared_len + kernel->n_block,
1492                         gen->shared_len, kernel->n_block, kernel->block_dim);
1493
1494         priv = tiling;
1495
1496         par = parametrization(dim, gen->shared_len + 2 * kernel->n_block,
1497                 gen->tile_first + kernel->tile_len +
1498                 kernel->n_grid + kernel->n_block, kernel->thread_ids);
1499
1500         priv = isl_map_align_params(priv, isl_set_get_space(par));
1501         priv = isl_map_intersect_range(priv, par);
1502
1503         dim = isl_map_get_space(priv);
1504         dim = isl_space_drop_dims(dim, isl_dim_in, 0, isl_space_dim(dim, isl_dim_in));
1505         dim = isl_space_drop_dims(dim, isl_dim_out, 0, isl_space_dim(dim, isl_dim_out));
1506         proj = projection(dim, gen->shared_len + 2 * kernel->n_block,
1507                           gen->shared_len);
1508
1509         priv = isl_map_apply_range(priv, proj);
1510
1511         return priv;
1512 }
1513
1514 /* If max_shared_memory is not set to infinity (-1), then make
1515  * sure that the total amount of shared memory required by the
1516  * array reference groups mapped to shared memory is no larger
1517  * than this maximum.
1518  *
1519  * We apply a greedy approach and discard (keep in global memory)
1520  * those groups that would result in a total memory size that
1521  * is larger than the maximum.
1522  *
1523  * This function should be called after any function that may
1524  * affect the decision on whether to place a reference group
1525  * in private, shared or global memory.
1526  */
1527 static void check_shared_memory_bound(struct gpu_gen *gen)
1528 {
1529         int i, j;
1530         isl_val *left, *size;
1531
1532         if (gen->options->max_shared_memory < 0)
1533                 return;
1534
1535         left = isl_val_int_from_si(gen->ctx, gen->options->max_shared_memory);
1536
1537         for (i = 0; i < gen->kernel->n_array; ++i) {
1538                 struct gpu_local_array_info *local = &gen->kernel->array[i];
1539
1540                 for (j = 0; j < local->n_group; ++j) {
1541                         struct gpu_array_ref_group *group;
1542
1543                         group = local->groups[j];
1544                         if (group->private_tile)
1545                                 continue;
1546                         if (!group->shared_tile)
1547                                 continue;
1548
1549                         size = gpu_array_tile_size(group->shared_tile);
1550                         size = isl_val_mul_ui(size, local->array->size);
1551
1552                         if (isl_val_le(size, left)) {
1553                                 left = isl_val_sub(left, size);
1554                                 continue;
1555                         }
1556                         isl_val_free(size);
1557
1558                         group->shared_tile =
1559                                         gpu_array_tile_free(group->shared_tile);
1560                 }
1561         }
1562
1563         isl_val_free(left);
1564 }
1565
1566 /* Compute a tiling for all the array reference groups.
1567  */
1568 static void compute_group_tilings(struct gpu_gen *gen)
1569 {
1570         int i, j;
1571
1572         for (i = 0; i < gen->kernel->n_array; ++i) {
1573                 struct gpu_local_array_info *array = &gen->kernel->array[i];
1574
1575                 for (j = 0; j < array->n_group; ++j)
1576                         gpu_array_ref_group_compute_tiling(array->groups[j]);
1577         }
1578 }
1579
1580 /* Take tiled_sched, project it onto the shared tile loops and
1581  * the loops that will be wrapped over the threads and
1582  * store the result in gen->shared_sched.
1583  * Also compute a projection that projects out the loops that will be
1584  * wrapped over the threads and store this projection in gen->shared_proj.
1585  */
1586 static void compute_shared_sched(struct gpu_gen *gen)
1587 {
1588         isl_space *dim;
1589         isl_map *proj;
1590         isl_set *par;
1591         isl_union_map *sched;
1592
1593         sched = isl_union_map_copy(gen->tiled_sched);
1594
1595         dim = isl_union_map_get_space(sched);
1596         proj = projection(dim, gen->tiled_len,
1597                                 gen->shared_len + gen->kernel->n_block);
1598         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
1599
1600         dim = isl_union_map_get_space(sched);
1601         proj = projection(dim, gen->shared_len + gen->kernel->n_block,
1602                         gen->shared_len);
1603
1604         gen->shared_sched = sched;
1605         gen->shared_proj = isl_union_map_from_map(proj);
1606 }
1607
1608 /* Compute the size of a bounding box around the origin and "set",
1609  * where "set" is assumed to contain only non-negative elements.
1610  * In particular, compute the maximal value of "set" in each direction
1611  * and add one.
1612  */
1613 static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
1614         __isl_take isl_set *context)
1615 {
1616         int i, n;
1617         isl_multi_pw_aff *mpa;
1618
1619         context = isl_set_params(context);
1620         n = isl_set_dim(set, isl_dim_set);
1621         mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
1622         for (i = 0; i < n; ++i) {
1623                 isl_space *space;
1624                 isl_aff *one;
1625                 isl_pw_aff *bound;
1626
1627                 bound = isl_set_dim_max(isl_set_copy(set), i);
1628                 bound = isl_pw_aff_coalesce(bound);
1629                 bound = isl_pw_aff_gist(bound, isl_set_copy(context));
1630
1631                 space = isl_pw_aff_get_domain_space(bound);
1632                 one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1633                 one = isl_aff_add_constant_si(one, 1);
1634                 bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
1635                 mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
1636         }
1637         isl_set_free(set);
1638         isl_set_free(context);
1639
1640         return mpa;
1641 }
1642
1643 /* Compute the effective grid size as a list of the sizes in each dimension.
1644  *
1645  * The grid size specified by the user or set by default
1646  * in read_grid_sizes() and applied in tile_schedule(),
1647  * may be too large for the given code in the sense that
1648  * it may contain blocks that don't need to execute anything.
1649  * We therefore don't return this grid size, but instead the
1650  * smallest grid size that ensures that all blocks that actually
1651  * execute code are included in the grid.
1652  *
1653  * We first extract a description of the grid, i.e., the possible values
1654  * of the block ids, from gen->tiled_sched.
1655  * The block ids are parameters in gen->tiled_sched.
1656  * We simply need to change them into set dimensions.
1657  *
1658  * Then, for each block dimension, we compute the maximal value of the block id
1659  * and add one.
1660  */
1661 static __isl_give isl_multi_pw_aff *extract_grid_size(struct gpu_gen *gen,
1662         struct ppcg_kernel *kernel)
1663 {
1664         int i;
1665         isl_set *grid;
1666
1667         grid = isl_union_map_params(isl_union_map_copy(gen->tiled_sched));
1668         grid = isl_set_from_params(grid);
1669         grid = isl_set_add_dims(grid, isl_dim_set, kernel->n_grid);
1670         for (i = 0; i < kernel->n_grid; ++i) {
1671                 int pos;
1672                 isl_id *id;
1673
1674                 id = isl_id_list_get_id(kernel->block_ids, i);
1675                 pos = isl_set_find_dim_by_id(grid, isl_dim_param, id);
1676                 isl_id_free(id);
1677                 assert(pos >= 0);
1678                 grid = isl_set_equate(grid, isl_dim_param, pos, isl_dim_set, i);
1679                 grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
1680         }
1681
1682         return extract_size(grid, isl_set_copy(kernel->context));
1683 }
1684
1685 /* Compute the size of a fixed bounding box around the origin and "set",
1686  * where "set" is assumed to contain only non-negative elements,
1687  * and store the results in "size".
1688  * In particular, compute the maximal value of "set" in each direction
1689  * and add one.
1690  */
1691 static void extract_fixed_size(__isl_take isl_set *set, int *size)
1692 {
1693         int i, n;
1694         isl_local_space *ls;
1695         isl_aff *obj;
1696
1697         n = isl_set_dim(set, isl_dim_set);
1698         ls = isl_local_space_from_space(isl_set_get_space(set));
1699         obj = isl_aff_zero_on_domain(ls);
1700         for (i = 0; i < n; ++i) {
1701                 isl_val *max;
1702
1703                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 1);
1704                 max = isl_set_max_val(set, obj);
1705                 size[i] = isl_val_get_num_si(max) + 1;
1706                 isl_val_free(max);
1707                 obj = isl_aff_set_coefficient_si(obj, isl_dim_in, i, 0);
1708         }
1709         isl_aff_free(obj);
1710         isl_set_free(set);
1711 }
1712
1713 /* Compute the effective block size as a list of the sizes in each dimension
1714  * and store the sizes in kernel->block_dim.
1715  *
1716  * The block size specified by the user or set by default
1717  * in read_block_sizes() and applied in thread_tile_schedule(),
1718  * may be too large for the given code in the sense that
1719  * it may contain threads that don't need to execute anything.
1720  * We therefore update this block size in kernel->block_dim
1721  * to the smallest block size that ensures that all threads
1722  * that actually execute code are included in the block.
1723  *
1724  * The current implementation eliminates all parameters, ensuring
1725  * that the size is a fixed constant in each dimension.
1726  * In principle we could also compute parametric sizes.
1727  * We would have to make sure to project out all b%d and t%d parameters,
1728  * however.
1729  */
1730 static void extract_block_size(struct gpu_gen *gen, struct ppcg_kernel *kernel)
1731 {
1732         int i;
1733         int nparam;
1734         isl_set *block;
1735         isl_multi_pw_aff *mpa;
1736
1737         block = isl_union_map_params(isl_union_map_copy(gen->local_sched));
1738         block = isl_set_from_params(block);
1739         block = isl_set_add_dims(block, isl_dim_set, kernel->n_block);
1740         for (i = 0; i < kernel->n_block; ++i) {
1741                 int pos;
1742                 isl_id *id;
1743
1744                 id = isl_id_list_get_id(kernel->thread_ids, i);
1745                 pos = isl_set_find_dim_by_id(block, isl_dim_param, id);
1746                 isl_id_free(id);
1747                 assert(pos >= 0);
1748                 block = isl_set_equate(block, isl_dim_param, pos,
1749                                         isl_dim_set, i);
1750         }
1751         nparam = isl_set_dim(block, isl_dim_param);
1752         block = isl_set_project_out(block, isl_dim_param, 0, nparam);
1753
1754         extract_fixed_size(block, kernel->block_dim);
1755 }
1756
1757 struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel)
1758 {
1759         int i, j;
1760
1761         if (!kernel)
1762                 return NULL;
1763
1764         isl_id_list_free(kernel->block_ids);
1765         isl_id_list_free(kernel->thread_ids);
1766         isl_multi_pw_aff_free(kernel->grid_size);
1767         isl_set_free(kernel->context);
1768         isl_union_set_free(kernel->arrays);
1769         isl_space_free(kernel->space);
1770         isl_ast_node_free(kernel->tree);
1771
1772         for (i = 0; i < kernel->n_array; ++i) {
1773                 struct gpu_local_array_info *array = &kernel->array[i];
1774
1775                 for (j = 0; j < array->n_group; ++j)
1776                         gpu_array_ref_group_free(array->groups[j]);
1777                 free(array->groups);
1778
1779                 isl_pw_aff_list_free(array->bound);
1780         }
1781         free(kernel->array);
1782
1783         for (i = 0; i < kernel->n_var; ++i) {
1784                 free(kernel->var[i].name);
1785                 isl_vec_free(kernel->var[i].size);
1786         }
1787         free(kernel->var);
1788         free(kernel->tile_size);
1789
1790         free(kernel);
1791
1792         return NULL;
1793 }
1794
/* Wrapper around ppcg_kernel_free for use as a isl_id_set_free_user callback.
 * "user" is the struct ppcg_kernel to be freed; the return value
 * of ppcg_kernel_free is ignored.
 */
static void ppcg_kernel_free_wrap(void *user)
{
	ppcg_kernel_free(user);
}
1803
1804 static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
1805         struct ppcg_kernel_var *var)
1806 {
1807         int j;
1808         struct gpu_array_tile *tile;
1809         isl_printer *p;
1810         char *name;
1811
1812         var->array = group->array;
1813
1814         tile = group->private_tile;
1815         var->type = ppcg_access_private;
1816         if (!tile) {
1817                 tile = group->shared_tile;
1818                 var->type = ppcg_access_shared;
1819         }
1820
1821         p = isl_printer_to_str(ctx);
1822         p = gpu_array_ref_group_print_name(group, p);
1823         var->name = isl_printer_get_str(p);
1824         isl_printer_free(p);
1825
1826         var->size = isl_vec_alloc(ctx, group->array->n_index);
1827
1828         for (j = 0; j < group->array->n_index; ++j)
1829                 var->size = isl_vec_set_element_val(var->size, j,
1830                                             isl_val_copy(tile->bound[j].size));
1831 }
1832
1833 static void create_kernel_vars(struct gpu_gen *gen, struct ppcg_kernel *kernel)
1834 {
1835         int i, j, n;
1836
1837         n = 0;
1838         for (i = 0; i < kernel->n_array; ++i) {
1839                 struct gpu_local_array_info *array = &kernel->array[i];
1840
1841                 for (j = 0; j < array->n_group; ++j) {
1842                         struct gpu_array_ref_group *group = array->groups[j];
1843                         if (group->private_tile || group->shared_tile)
1844                                 ++n;
1845                 }
1846         }
1847
1848         kernel->n_var = n;
1849         kernel->var = isl_calloc_array(gen->ctx, struct ppcg_kernel_var, n);
1850         assert(kernel->var);
1851
1852         n = 0;
1853         for (i = 0; i < kernel->n_array; ++i) {
1854                 struct gpu_local_array_info *array = &kernel->array[i];
1855
1856                 for (j = 0; j < array->n_group; ++j) {
1857                         struct gpu_array_ref_group *group = array->groups[j];
1858                         if (!group->private_tile && !group->shared_tile)
1859                                 continue;
1860                         create_kernel_var(gen->ctx, group, &kernel->var[n]);
1861                         ++n;
1862                 }
1863         }
1864 }
1865
1866 /* Replace "pa" by the zero function defined over the universe domain
1867  * in the space of "pa".
1868  */
1869 static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa)
1870 {
1871         isl_space *space;
1872         isl_aff *zero;
1873
1874         space = isl_space_domain(isl_pw_aff_get_space(pa));
1875         isl_pw_aff_free(pa);
1876         zero = isl_aff_zero_on_domain(isl_local_space_from_space(space));
1877
1878         return isl_pw_aff_from_aff(zero);
1879 }
1880
1881 /* The sizes of the arrays on the host that have been computed by
1882  * extract_array_info may depend on the parameters.  Use the extra
1883  * constraints on the parameters that are valid at "host_domain"
1884  * to simplify these expressions and store the results in kernel->array.
1885  *
1886  * We only need these localized bounds for arrays that are accessed
1887  * by the current kernel.  If we have found at least one reference group
1888  * then the array is accessed by the kernel.  If the array has compound
1889  * elements then we skipped the construction of array reference groups.
1890  *
1891  * The resulting sizes may be functions that are nowhere defined
1892  * in case the access function cannot possibly access anything inside
1893  * the kernel for some reason.  If so, they are replaced by the zero
1894  * function.  Since the access function cannot actually access anything,
1895  * there is no harm in printing the array sizes as zero.
1896  */
1897 static void localize_bounds(struct gpu_gen *gen, struct ppcg_kernel *kernel,
1898         __isl_keep isl_set *host_domain)
1899 {
1900         int i, j;
1901         isl_set *context;
1902
1903         context = isl_set_copy(host_domain);
1904         context = isl_set_params(context);
1905
1906         for (i = 0; i < kernel->n_array; ++i) {
1907                 struct gpu_local_array_info *local = &kernel->array[i];
1908                 isl_pw_aff_list *bound;
1909                 int n_index;
1910
1911                 if (local->n_group == 0 && !local->array->has_compound_element)
1912                         continue;
1913
1914                 n_index = local->array->n_index;
1915                 bound = isl_pw_aff_list_alloc(gen->ctx, n_index);
1916
1917                 for (j = 0; j < n_index; ++j) {
1918                         isl_pw_aff *pwaff;
1919                         int empty;
1920
1921                         pwaff = isl_pw_aff_copy(local->array->bound[j]);
1922                         pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
1923                         empty = isl_pw_aff_is_empty(pwaff);
1924                         if (empty < 0)
1925                                 pwaff = isl_pw_aff_free(pwaff);
1926                         else if (empty)
1927                                 pwaff = set_universally_zero(pwaff);
1928                         bound = isl_pw_aff_list_add(bound, pwaff);
1929                 }
1930
1931                 local->n_index = n_index;
1932                 local->bound = bound;
1933         }
1934         isl_set_free(context);
1935 }
1936
1937 /* Create the array of gpu_local_array_info structures "array"
1938  * inside "kernel".  The number of elements in this array is
1939  * the same as the number of arrays in "prog".
1940  * Initialize the "array" field of each local array to point
1941  * to the corresponding array in "prog".
1942  */
1943 static struct ppcg_kernel *ppcg_kernel_create_local_arrays(
1944         struct ppcg_kernel *kernel, struct gpu_prog *prog)
1945 {
1946         int i;
1947         isl_ctx *ctx;
1948
1949         ctx = isl_set_get_ctx(prog->context);
1950         kernel->array = isl_calloc_array(ctx,
1951                             struct gpu_local_array_info, prog->n_array);
1952         if (!kernel->array)
1953                 return ppcg_kernel_free(kernel);
1954         kernel->n_array = prog->n_array;
1955
1956         for (i = 0; i < prog->n_array; ++i)
1957                 kernel->array[i].array = &prog->array[i];
1958
1959         return kernel;
1960 }
1961
1962 /* Find the element in gen->stmt that has the given "id".
1963  * Return NULL if no such gpu_stmt can be found.
1964  */
1965 static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
1966 {
1967         int i;
1968
1969         for (i = 0; i < prog->n_stmts; ++i) {
1970                 if (id == prog->stmts[i].id)
1971                         break;
1972         }
1973
1974         return i < prog->n_stmts ? &prog->stmts[i] : NULL;
1975 }
1976
1977 void ppcg_kernel_stmt_free(void *user)
1978 {
1979         int i;
1980         struct ppcg_kernel_stmt *stmt = user;
1981
1982         if (!stmt)
1983                 return;
1984
1985         switch (stmt->type) {
1986         case ppcg_kernel_copy:
1987                 isl_ast_expr_free(stmt->u.c.index);
1988                 isl_ast_expr_free(stmt->u.c.local_index);
1989                 break;
1990         case ppcg_kernel_domain:
1991                 isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
1992                 break;
1993         case ppcg_kernel_sync:
1994                 break;
1995         }
1996
1997         free(stmt);
1998 }
1999
2000 /* Set the options of "context" to
2001  *
2002  *      { space -> [x] : x >= first }
2003  */
2004 static __isl_give isl_ast_build *set_unroll(
2005         __isl_take isl_ast_build *build, __isl_take isl_space *space,
2006         int first)
2007 {
2008         isl_ctx *ctx;
2009         isl_map *unroll;
2010         isl_union_map *opt;
2011
2012         ctx = isl_ast_build_get_ctx(build);
2013
2014         space = isl_space_from_domain(space);
2015         space = isl_space_add_dims(space, isl_dim_out, 1);
2016         space = isl_space_set_tuple_name(space, isl_dim_out, "unroll");
2017         unroll = isl_map_universe(space);
2018         unroll = isl_map_lower_bound_si(unroll, isl_dim_out, 0, first);
2019         opt = isl_union_map_from_map(unroll);
2020
2021         build = isl_ast_build_set_options(build, opt);
2022
2023         return build;
2024 }
2025
2026 /* Extend the schedule "schedule" with the part of "extension"
2027  * starting at "first" up to "len".
2028  */
2029 static __isl_give isl_union_map *extend_schedule(
2030         __isl_take isl_union_map *schedule,
2031         __isl_take isl_union_map *extension, int first, int len)
2032 {
2033         isl_space *space;
2034         isl_map *proj;
2035         isl_union_map *umap;
2036         isl_set *set;
2037
2038         space = isl_union_map_get_space(schedule);
2039         space = isl_space_set_from_params(space);
2040         space = isl_space_add_dims(space, isl_dim_set, len);
2041         proj = isl_set_identity(isl_set_universe(space));
2042         proj = isl_map_project_out(proj, isl_dim_out, 0, first);
2043         extension = isl_union_map_apply_range(extension,
2044                                                 isl_union_map_from_map(proj));
2045
2046         schedule = isl_union_map_range_product(schedule, extension);
2047
2048         return schedule;
2049 }
2050
2051 /* Return the gpu_stmt_access in the list "accesses" that corresponds
2052  * to "ref_id".
2053  */
2054 static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
2055         __isl_keep isl_id *ref_id)
2056 {
2057         struct gpu_stmt_access *access;
2058
2059         for (access = accesses; access; access = access->next)
2060                 if (access->ref_id == ref_id)
2061                         return access;
2062
2063         return NULL;
2064 }
2065
2066 /* Return the index of the array called "name" in the list of arrays.
2067  */
2068 static int find_array_index(struct gpu_gen *gen, const char *name)
2069 {
2070         int i;
2071
2072         for (i = 0; i < gen->prog->n_array; ++i)
2073                 if (!strcmp(name, gen->prog->array[i].name))
2074                         return i;
2075
2076         return -1;
2077 }
2078
2079 /* Internal data structure for the index and AST expression transformation
2080  * callbacks for pet_stmt_build_ast_exprs.
2081  *
2082  * "accesses" is the list of gpu_stmt_access in the statement.
2083  * "iterator_map" expresses the statement iterators in terms of
2084  * the AST loop iterators.
2085  * "sched2shared" expresses the first shared_len dimensions of
2086  * the computed schedule in terms of the AST loop iterators.
2087  *
2088  * The following fields are set in transform_index and used in transform_expr.
2089  * "array" is the array that is being accessed.
2090  * "global" is set if the global array is accessed (rather than
2091  * shared/private memory).
2092  * "local_array" refers to information on the array specialized
2093  * to the current kernel.
2094  */
2095 struct ppcg_transform_data {
2096         struct gpu_gen *gen;
2097         struct gpu_stmt_access *accesses;
2098         isl_pw_multi_aff *iterator_map;
2099         isl_pw_multi_aff *sched2shared;
2100
2101         struct gpu_array_info *array;
2102         int global;
2103         struct gpu_local_array_info *local_array;
2104 };
2105
2106 /* Return the name of the outer array (of structs) accessed by "access".
2107  */
2108 static const char *get_outer_array_name(__isl_keep isl_map *access)
2109 {
2110         isl_space *space;
2111         const char *name;
2112
2113         space = isl_space_range(isl_map_get_space(access));
2114         while (space && isl_space_is_wrapping(space))
2115                 space = isl_space_domain(isl_space_unwrap(space));
2116         name = isl_space_get_tuple_name(space, isl_dim_set);
2117         isl_space_free(space);
2118
2119         return name;
2120 }
2121
2122 /* Return a pointer to the gpu_array_ref_group in "local"
2123  * that contains the reference "access".
2124  * Return NULL if no such group can be found.
2125  */
2126 static struct gpu_array_ref_group *find_ref_group(
2127         struct gpu_local_array_info *local, struct gpu_stmt_access *access)
2128 {
2129         int i, j;
2130
2131         for (i = 0; i < local->n_group; ++i) {
2132                 struct gpu_array_ref_group *group = local->groups[i];
2133
2134                 for (j = 0; j < group->n_ref; ++j)
2135                         if (group->refs[j] == access)
2136                                 return group;
2137         }
2138
2139         return NULL;
2140 }
2141
2142 /* Index transformation callback for pet_stmt_build_ast_exprs.
2143  *
2144  * "index" expresses the array indices in terms of statement iterators
2145  *
2146  * We first reformulate "index" in terms of the AST loop iterators.
2147  * Then we check if we are accessing the global array or
2148  * a shared/private copy.  In the former case, we simply return
2149  * the updated index.  If "index" is an affine expression rather
2150  * than an array access, then we also return the updated index here.
2151  *
2152  * If no reference groups have been computed for the array,
2153  * then we can only be accessing the global array.
2154  *
2155  * Otherwise, we apply the tiling to the index.
2156  * This tiling is of the form
2157  *
2158  *      [D -> A] -> T
2159  *
2160  * The index is of the form
2161  *
2162  *      L -> A
2163  *
2164  * We update the tiling to refer to the AST loop iterators
2165  *
2166  *      [L -> A] -> T
2167  *
2168  * and modify index to keep track of those iterators
2169  *
2170  *      L -> [L -> A]
2171  *
2172  * Combining these two yields a tiled index expression in terms
2173  * of the AST loop iterators
2174  *
2175  *      L -> T
2176  */
2177 static __isl_give isl_multi_pw_aff *transform_index(
2178         __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
2179         void *user)
2180 {
2181         struct ppcg_transform_data *data = user;
2182         struct gpu_stmt_access *access;
2183         struct gpu_array_ref_group *group;
2184         struct gpu_array_tile *tile;
2185         isl_pw_multi_aff *iterator_map;
2186         int i;
2187         const char *name;
2188         isl_space *space;
2189         isl_multi_pw_aff *tiling;
2190         isl_pw_multi_aff *pma;
2191         isl_multi_pw_aff *mpa;
2192
2193         data->array = NULL;
2194
2195         iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
2196         index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
2197
2198         access = find_access(data->accesses, ref_id);
2199         if (!access)
2200                 return index;
2201         if (!isl_map_has_tuple_name(access->access, isl_dim_out))
2202                 return index;
2203
2204         name = get_outer_array_name(access->access);
2205         i = find_array_index(data->gen, name);
2206         if (i < 0)
2207                 isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
2208                         "cannot find array",
2209                         return isl_multi_pw_aff_free(index));
2210         data->array = &data->gen->prog->array[i];
2211         data->local_array = &data->gen->kernel->array[i];
2212
2213         group = find_ref_group(data->local_array, access);
2214         if (!group) {
2215                 data->global = 1;
2216                 return index;
2217         }
2218
2219         tile = group->private_tile;
2220         if (!tile)
2221                 tile = group->shared_tile;
2222         data->global = !tile;
2223         if (!tile)
2224                 return index;
2225
2226         space = isl_space_range(isl_multi_pw_aff_get_space(index));
2227         space = isl_space_map_from_set(space);
2228         pma = isl_pw_multi_aff_identity(space);
2229         pma = isl_pw_multi_aff_product(
2230                         isl_pw_multi_aff_copy(data->sched2shared), pma);
2231         tiling = isl_multi_pw_aff_from_multi_aff(
2232                                     isl_multi_aff_copy(tile->tiling));
2233         tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
2234
2235         space = isl_space_domain(isl_multi_pw_aff_get_space(index));
2236         space = isl_space_map_from_set(space);
2237         mpa = isl_multi_pw_aff_identity(space);
2238         index = isl_multi_pw_aff_range_product(mpa, index);
2239         index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
2240
2241         return index;
2242 }
2243
2244 /* Dereference "expr" by adding an index [0].
2245  * The original "expr" is assumed not to have any indices.
2246  *
2247  * If "expr" is a member access, then the dereferencing needs
2248  * to be applied to the structure argument of this member access.
2249  */
2250 static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
2251 {
2252         isl_ctx *ctx;
2253         isl_ast_expr *arg0, *res;
2254         isl_ast_expr_list *list;
2255
2256         arg0 = isl_ast_expr_get_op_arg(expr, 0);
2257         if (!arg0)
2258                 return isl_ast_expr_free(expr);
2259         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
2260             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
2261                 isl_ast_expr *arg;
2262
2263                 arg = isl_ast_expr_get_op_arg(arg0, 0);
2264                 arg = dereference(arg);
2265                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
2266                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
2267
2268                 return expr;
2269         }
2270         isl_ast_expr_free(arg0);
2271
2272         ctx = isl_ast_expr_get_ctx(expr);
2273         res = isl_ast_expr_from_val(isl_val_zero(ctx));
2274         list = isl_ast_expr_list_from_ast_expr(res);
2275         res = isl_ast_expr_get_op_arg(expr, 0);
2276         res = isl_ast_expr_access(res, list);
2277         isl_ast_expr_free(expr);
2278
2279         return res;
2280 }
2281
2282 /* Linearize the index expression "expr" based on the array bounds
2283  * of "array".
2284  *
2285  * That is, transform expression
2286  *
2287  *      A[i_0][i_1]...[i_n]
2288  *
2289  * to
2290  *
2291  *      A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
2292  *
2293  * where b_0, b_1, ..., b_n are the bounds on the array.
2294  *
2295  * If the base of "expr" is a member access, then the linearization needs
2296  * to be applied to the structure argument of this member access.
2297  *
2298  * In the base case, if "expr" has no arguments (other than the name of
2299  * the array), then we are passing an entire array to a function.
2300  * In this case, there is nothing to linearize.
2301  * Note that at this point an expression with no arguments can
2302  * only be an entire array because the scalar case and
2303  * the case of single struct are handled by the caller.
2304  *
2305  * If the number of specified index expressions in "expr"
2306  * is smaller than the dimension of the accessed array,
2307  * then the missing i_j also do not appear in the linearized expression.
2308  * Furthermore, since such an expression does not refer to a single
2309  * element while the default linearized expression would refer to
2310  * a single element, we return the expression
2311  *
2312  *      A + (..((i_0 * b_1 + i_1) ... ) * b_n]
2313  *
2314  * instead.  Note that because of the special case handling above,
2315  * we can assume here that here that there is at least one index expression.
2316  */
2317 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
2318         struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
2319 {
2320         int i, n;
2321         isl_ctx *ctx;
2322         isl_set *context;
2323         isl_ast_expr *arg0;
2324         isl_ast_expr *res;
2325         isl_ast_expr_list *list;
2326         isl_ast_build *build;
2327
2328         arg0 = isl_ast_expr_get_op_arg(expr, 0);
2329         if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
2330             isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
2331                 isl_ast_expr *arg;
2332
2333                 arg = isl_ast_expr_get_op_arg(arg0, 0);
2334                 arg = gpu_local_array_info_linearize_index(array, arg);
2335                 arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
2336                 expr = isl_ast_expr_set_op_arg(expr, 0, arg0);
2337
2338                 return expr;
2339         }
2340         isl_ast_expr_free(arg0);
2341
2342         if (isl_ast_expr_get_op_n_arg(expr) == 1)
2343                 return expr;
2344
2345         ctx = isl_ast_expr_get_ctx(expr);
2346         context = isl_set_universe(isl_space_params_alloc(ctx, 0));
2347         build = isl_ast_build_from_context(context);
2348
2349         n = isl_ast_expr_get_op_n_arg(expr);
2350         res = isl_ast_expr_get_op_arg(expr, 1);
2351         for (i = 1; i < array->n_index; ++i) {
2352                 isl_pw_aff *bound_i;
2353                 isl_ast_expr *expr_i;
2354
2355                 bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i);
2356                 expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
2357                 res = isl_ast_expr_mul(res, expr_i);
2358
2359                 if (i + 1 >= n)
2360                         continue;
2361                 expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
2362                 res = isl_ast_expr_add(res, expr_i);
2363         }
2364
2365         isl_ast_build_free(build);
2366
2367         if (1 + array->n_index > n) {
2368                 res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res);
2369         } else {
2370                 list = isl_ast_expr_list_from_ast_expr(res);
2371                 res = isl_ast_expr_get_op_arg(expr, 0);
2372                 res = isl_ast_expr_access(res, list);
2373         }
2374
2375         isl_ast_expr_free(expr);
2376
2377         return res;
2378 }
2379
2380 /* AST expression transformation callback for pet_stmt_build_ast_exprs.
2381  *
2382  * If the AST expression refers to an array that is not accessed
2383  * at all, then this means the value of the expression is not used,
2384  * so we might as well print zero (NULL pointer) instead.
2385  *
2386  * If the AST expression refers to a global scalar that is not
2387  * a read-only scalar, then its address was passed to the kernel and
2388  * we need to dereference it.
2389  *
2390  * If the AST expression refers to an access to a global array,
2391  * then we linearize the access exploiting the bounds in data->local_array.
2392  */
2393 static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
2394         __isl_keep isl_id *id, void *user)
2395 {
2396         struct ppcg_transform_data *data = user;
2397
2398         if (!data->array)
2399                 return expr;
2400         if (!data->array->accessed) {
2401                 isl_ctx *ctx;
2402
2403                 ctx = isl_ast_expr_get_ctx(expr);
2404                 isl_ast_expr_free(expr);
2405                 return isl_ast_expr_from_val(isl_val_zero(ctx));
2406         }
2407         if (gpu_array_is_read_only_scalar(data->array))
2408                 return expr;
2409         if (!data->global)
2410                 return expr;
2411         if (data->array->n_index == 0)
2412                 return dereference(expr);
2413         if (!data->array->linearize)
2414                 return expr;
2415
2416         return gpu_local_array_info_linearize_index(data->local_array, expr);
2417 }
2418
2419 /* This function is called for each instance of a user statement
2420  * in the kernel.
2421  *
2422  * We attach a struct ppcg_kernel_stmt to the "node", containing
2423  * a computed AST expression for each access.
2424  * These AST expressions are computed from iterator_map,
2425  * which expresses the domain
2426  * elements in terms of the generated loops, and sched2shared,
2427  * which expresses the first shared_len dimensions of the schedule
2428  * computed by PPCG in terms of the generated loops.
2429  */
2430 static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
2431         __isl_keep isl_ast_build *build, void *user)
2432 {
2433         struct ppcg_transform_data data;
2434         struct gpu_gen *gen = (struct gpu_gen *) user;
2435         struct ppcg_kernel_stmt *stmt;
2436         isl_id *id;
2437         isl_pw_multi_aff *sched2shared;
2438         isl_map *map;
2439         isl_pw_multi_aff *iterator_map;
2440         isl_ast_expr *expr, *arg;
2441         isl_union_map *schedule;
2442
2443         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
2444         if (!stmt)
2445                 return isl_ast_node_free(node);
2446
2447         expr = isl_ast_node_user_get_expr(node);
2448         arg = isl_ast_expr_get_op_arg(expr, 0);
2449         id = isl_ast_expr_get_id(arg);
2450
2451         schedule = isl_ast_build_get_schedule(build);
2452         map = isl_map_reverse(isl_map_from_union_map(schedule));
2453         iterator_map = isl_pw_multi_aff_from_map(map);
2454         sched2shared = compute_sched_to_shared(gen,
2455                                         isl_pw_multi_aff_copy(iterator_map));
2456
2457         stmt->type = ppcg_kernel_domain;
2458         stmt->u.d.stmt = find_stmt(gen->prog, id);
2459         if (!stmt->u.d.stmt)
2460                 isl_die(gen->ctx, isl_error_internal,
2461                         "statement not found", goto error);
2462
2463         data.gen = gen;
2464         data.accesses = stmt->u.d.stmt->accesses;
2465         data.iterator_map = iterator_map;
2466         data.sched2shared = sched2shared;
2467         stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
2468                                             build, &transform_index, &data,
2469                                             &transform_expr, &data);
2470
2471         isl_id_free(id);
2472         isl_pw_multi_aff_free(iterator_map);
2473         isl_pw_multi_aff_free(sched2shared);
2474         isl_ast_expr_free(arg);
2475         isl_ast_expr_free(expr);
2476
2477         id = isl_id_alloc(gen->ctx, NULL, stmt);
2478         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
2479         return isl_ast_node_set_annotation(node, id);
2480 error:
2481         isl_id_free(id);
2482         isl_pw_multi_aff_free(iterator_map);
2483         ppcg_kernel_stmt_free(stmt);
2484         isl_pw_multi_aff_free(sched2shared);
2485         return isl_ast_node_free(node);
2486 }
2487
2488 /* This function is called when code has been generated for the shared
2489  * tile loops.  The "schedule" refers only to the original statements.
2490  *
2491  * We extend the schedule with that part of gen->local_sched that hasn't
2492  * been taken into account yet.  This introduces parameters referring
2493  * to thread ids in the schedule, so we add them (with the appropriate
2494  * bounds to the context as well).
2495  * Finally, we set the appropriate unrolling options
2496  * if gen->first_unroll is set.
2497  */
2498 static __isl_give isl_ast_node *create_domain_leaf(
2499         __isl_take isl_union_map *schedule, __isl_take isl_ast_build *build,
2500         void *user)
2501 {
2502         struct gpu_gen *gen = (struct gpu_gen *) user;
2503         isl_space *space;
2504         isl_union_map *sched;
2505         isl_ast_node *tree;
2506         isl_set *set;
2507         isl_id_list *iterators;
2508         int n;
2509
2510         schedule = extend_schedule(schedule,
2511                         isl_union_map_copy(gen->local_sched),
2512                         gen->shared_len, gen->thread_tiled_len);
2513
2514         space = isl_ast_build_get_schedule_space(build);
2515         set = isl_set_universe(space);
2516         set = add_bounded_parameters(set, gen->kernel->block_dim,
2517                                         gen->kernel->thread_ids);
2518         build = isl_ast_build_restrict(build, set);
2519
2520         n = gen->thread_tiled_len - gen->shared_len;
2521
2522         if (gen->first_unroll >= 0) {
2523                 space = isl_space_set_alloc(gen->ctx, 0, n);
2524                 build = set_unroll(build, space, gen->first_unroll);
2525         }
2526         iterators = ppcg_scop_generate_names(gen->prog->scop, n, "c");
2527         build = isl_ast_build_set_iterators(build, iterators);
2528         build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen);
2529         tree = isl_ast_build_node_from_schedule_map(build, schedule);
2530         isl_ast_build_free(build);
2531
2532         return tree;
2533 }
2534
2535 /* This function is called for each statement node in the AST of the code
2536  * for copying to or from shared/private memory.
2537  * Attach a pointer to a ppcg_kernel_stmt representing the copy
2538  * statement to the node.
2539  * The statement name is "read" or "write", depending on whether we are
2540  * reading from global memory or writing to global memory.
2541  * The name of the T space is {shared,private}_<array>.
2542  *
2543  * The schedule is of the form
2544  *
2545  *      type[A -> T] -> L
2546  *
2547  * where A refers to a piece of an array and T to the corresponding
2548  * shifted tile.  We split this schedule into mappings L -> A and L -> T
2549  * and store the corresponding expressions in stmt->index and stmt->local_index,
2550  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
2551  */
2552 static __isl_give isl_ast_node *attach_copy_stmt(__isl_take isl_ast_node *node,
2553         __isl_keep isl_ast_build *build, void *user)
2554 {
2555         struct gpu_gen *gen = (struct gpu_gen *) user;
2556         struct ppcg_kernel_stmt *stmt;
2557         isl_id *id;
2558         isl_ast_expr *expr;
2559         isl_space *space;
2560         isl_map *access, *local_access, *map;
2561         isl_pw_multi_aff *pma;
2562         const char *type;
2563         int array_index;
2564
2565         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
2566         if (!stmt)
2567                 return isl_ast_node_free(node);
2568
2569         access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
2570         type = isl_map_get_tuple_name(access, isl_dim_in);
2571         stmt->u.c.read = !strcmp(type, "read");
2572         access = isl_map_reverse(access);
2573         space = isl_space_unwrap(isl_space_range(isl_map_get_space(access)));
2574         local_access = isl_map_copy(access);
2575
2576         map = isl_map_domain_map(isl_map_universe(isl_space_copy(space)));
2577         id = isl_map_get_tuple_id(access, isl_dim_out);
2578         map = isl_map_set_tuple_id(map, isl_dim_in, id);
2579         access = isl_map_apply_range(access, map);
2580         pma = isl_pw_multi_aff_from_map(access);
2581         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
2582         stmt->u.c.index = expr;
2583
2584         map = isl_map_range_map(isl_map_universe(space));
2585         id = isl_map_get_tuple_id(local_access, isl_dim_out);
2586         map = isl_map_set_tuple_id(map, isl_dim_in, id);
2587         local_access = isl_map_apply_range(local_access, map);
2588         pma = isl_pw_multi_aff_from_map(local_access);
2589         expr = isl_ast_build_access_from_pw_multi_aff(build, pma);
2590         stmt->u.c.local_index = expr;
2591
2592         stmt->u.c.array = gen->copy_group->array;
2593         array_index = stmt->u.c.array - gen->prog->array;
2594         stmt->u.c.local_array = &gen->kernel->array[array_index];
2595         stmt->type = ppcg_kernel_copy;
2596
2597         id = isl_id_alloc(gen->ctx, NULL, stmt);
2598         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
2599         return isl_ast_node_set_annotation(node, id);
2600 }
2601
2602 /* Given a schedule of the form
2603  *
2604  *      [S -> A] -> L
2605  *
2606  * (with S the first shared_len dimensions of the computed schedule,
2607  * A the array and L the schedule correponding to the generated loops),
2608  * indicating where to copy the array elements that need to be copied,
2609  * construct code for performing the copying.
2610  *
2611  * "group" is the array reference group that is being copied
2612  * "type" is either "read" or "write"
2613  * private is set if copying needs to be performed to/from registers
2614  *
2615  * We first construct a mapping to a shifted tile of the array,
2616  *
2617  *      [S -> A] -> T(S,A)                                      (1)
2618  *
2619  * If private is set, then we also use this mapping as a schedule
2620  * (which is already thread-specific and will be completely unrolled).
2621  * Otherwise, we wrap/tile the range over the threads.
2622  * The result is
2623  *
2624  *      [S -> A] -> T'(S,A)
2625  *
2626  * Combined with the given schedule, we have
2627  *
2628  *      [S -> A] -> [L -> T'(S,A)]                              (2)
2629  *
2630  * From the shifted tile mapping, we construct a mapping
2631  *
2632  *      [S -> A] -> [A -> T(S,A)]
2633  *
2634  * and apply it to the schedule (2), obtaining
2635  *
2636  *      [A -> T(S(L),A)] -> [L -> T'(S(L),A)]
2637  *
2638  * Note that we can project out S because it is uniquely defined by L.
2639  */
2640 static __isl_give isl_ast_node *copy_access(struct gpu_gen *gen,
2641         __isl_take isl_map *sched,
2642         const char *type, struct gpu_array_ref_group *group,
2643         __isl_take isl_ast_build *build, int private)
2644 {
2645         isl_space *space;
2646         isl_ast_node *tree;
2647         isl_map *schedule, *shift, *map;
2648         isl_set *set;
2649         isl_id_list *iterators;
2650         int n;
2651
2652         shift = shift_access(group);
2653
2654         schedule = isl_map_copy(shift);
2655         schedule = isl_map_reset_tuple_id(schedule, isl_dim_out);
2656         if (!private)
2657                 schedule = tile_access_schedule(gen, schedule);
2658
2659         n = isl_map_dim(schedule, isl_dim_out);
2660         set = isl_set_universe(isl_ast_build_get_schedule_space(build));
2661         set = add_bounded_parameters(set, gen->kernel->block_dim,
2662                                         gen->kernel->thread_ids);
2663
2664         schedule = isl_map_range_product(sched, schedule);
2665
2666         space = isl_space_domain(isl_map_get_space(shift));
2667         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
2668         map = isl_map_range_product(map, shift);
2669
2670         schedule = isl_map_apply_domain(schedule, map);
2671
2672         schedule = isl_map_set_tuple_name(schedule, isl_dim_in, type);
2673
2674         build = isl_ast_build_restrict(build, set);
2675
2676         gen->copy_group = group;
2677
2678         if (private) {
2679                 space = isl_space_range(isl_map_get_space(schedule));
2680                 space = isl_space_range(isl_space_unwrap(space));
2681                 build = set_unroll(build, space, 0);
2682         }
2683         iterators = ppcg_scop_generate_names(gen->prog->scop, n, "c");
2684         build = isl_ast_build_set_iterators(build, iterators);
2685         build = isl_ast_build_set_at_each_domain(build, &attach_copy_stmt, gen);
2686         tree = isl_ast_build_node_from_schedule_map(build,
2687                                             isl_union_map_from_map(schedule));
2688         isl_ast_build_free(build);
2689
2690         return tree;
2691 }
2692
2693 /* Return code for reading into or writing from shared memory
2694  * the given array reference group.
2695  *
2696  * If we are performing a read from global memory to shared memory and
2697  * if the array involved is not a scalar, then we copy
2698  * the entire tile to shared memory.  This may result in some extra
2699  * elements getting copied, but it should lead to simpler code
2700  * (which means that fewer registers may be needed) and less divergence.
2701  *
2702  * Otherwise, we only copy the elements that will be read or have been written
2703  * in the kernel.
2704  *
2705  *
2706  * The input "sched" is of the form.
2707  *
2708  *      type[S -> A] -> L
2709  *
2710  * with S the first shared_len dimensions of the computed schedule,
2711  * A the array and L the schedule correponding to the generated loops.
2712  *
2713  * We first drop "type",
2714  *
2715  *      [S -> A] -> L
2716  *
2717  * If the above conditions are satisfied, we project out A,
2718  * resulting in
2719  *
2720  *      S -> L
2721  *
2722  * and then introduce the group tile [S -> T], resulting in
2723  *
2724  *      [S -> T] -> L
2725  */
2726 static __isl_give isl_ast_node *copy_group_shared_accesses(
2727         struct gpu_gen *gen, struct gpu_array_ref_group *group,
2728         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
2729 {
2730         const char *type;
2731         int read;
2732         isl_union_map *access;
2733
2734         type = isl_map_get_tuple_name(sched, isl_dim_in);
2735         read = !strcmp(type, "read");
2736
2737         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
2738
2739         if (read && !gpu_array_is_scalar(group->array)) {
2740                 isl_space *space;
2741                 isl_map *map;
2742
2743                 space = isl_space_domain(isl_map_get_space(sched));
2744                 space = isl_space_unwrap(space);
2745                 map = isl_map_domain_map(isl_map_universe(space));
2746                 sched = isl_map_apply_domain(sched, map);
2747
2748                 map = group_tile(group);
2749                 map = isl_map_reverse(isl_map_domain_map(map));
2750                 sched = isl_map_apply_domain(sched, map);
2751         }
2752
2753         return copy_access(gen, sched, type, group, build, 0);
2754 }
2755
2756 /* Return code for reading into or writing from private memory
2757  * the given array reference group.
2758  *
2759  * Let S be the first shared_len dimensions of the computed schedule,
2760  * D the iteration domains, A the array and L the schedule correponding
2761  * to the generated loops.
2762  * "sched" is of the form
2763  *
2764  *      type[S -> A] -> L
2765  *
2766  * where type is either "read" or "write".
2767  * We apply the privatization D -> S(t), with t the thread ids,
2768  * to the access relation D -> A to obtain the privatized access relation
2769  *
2770  *      S(t) -> A
2771  *
2772  * We drop the type from "sched" and intersect with the privatized access
2773  * relation to obtain
2774  *
2775  *      [S(t) -> A] -> L
2776  */
2777 static __isl_give isl_ast_node *copy_group_private_accesses(
2778         struct gpu_gen *gen, struct gpu_array_ref_group *group,
2779         __isl_take isl_map *sched, __isl_take isl_ast_build *build)
2780 {
2781         const char *type;
2782         int read;
2783         isl_union_map *priv;
2784         isl_union_map *access;
2785         isl_map *access_map;
2786
2787         type = isl_map_get_tuple_name(sched, isl_dim_in);
2788         read = !strcmp(type, "read");
2789
2790         priv = isl_union_map_from_map(isl_map_copy(gen->privatization));
2791         priv = isl_union_map_apply_range(isl_union_map_copy(gen->shared_sched),
2792                                         priv);
2793
2794         access = gpu_array_ref_group_access_relation(group, read, !read);
2795         access = isl_union_map_apply_domain(access, priv);
2796         access_map = isl_map_from_union_map(access);
2797
2798         sched = isl_map_reset_tuple_id(sched, isl_dim_in);
2799         sched = isl_map_intersect_domain(sched, isl_map_wrap(access_map));
2800
2801         return copy_access(gen, sched, type, group, build, 1);
2802 }
2803
2804 /* Return code for reading into or writing from shared or private memory.
2805  *
2806  * "schedule" is of the form
2807  *
2808  *      type[S -> A] -> L
2809  *
2810  * with S be the first shared_len dimensions of the computed schedule,
2811  * A the array and L the schedule correponding to the generated loops.
2812  * The array reference group is attached to "type".
2813  */
2814 static __isl_give isl_ast_node *create_access_leaf(
2815         struct gpu_gen *gen, __isl_take isl_map *schedule,
2816         __isl_take isl_ast_build *build)
2817 {
2818         struct gpu_array_ref_group *group;
2819         isl_id *id;
2820
2821         id = isl_map_get_tuple_id(schedule, isl_dim_in);
2822         group = isl_id_get_user(id);
2823         isl_id_free(id);
2824
2825         if (group->private_tile)
2826                 return copy_group_private_accesses(gen, group, schedule,
2827                                                         build);
2828         else
2829                 return copy_group_shared_accesses(gen, group, schedule,
2830                                                         build);
2831 }
2832
2833 /* Create a domain node representing a synchronization.
2834  */
2835 static __isl_give isl_ast_node *create_sync_leaf(
2836         struct gpu_gen *gen, __isl_take isl_map *schedule,
2837         __isl_take isl_ast_build *build)
2838 {
2839         struct ppcg_kernel_stmt *stmt;
2840         isl_id *id;
2841         isl_space *space;
2842         isl_ast_node *node;
2843         isl_ast_expr *expr;
2844
2845         isl_map_free(schedule);
2846
2847         stmt = isl_calloc_type(gen->ctx, struct ppcg_kernel_stmt);
2848         if (!stmt)
2849                 return NULL;
2850
2851         stmt->type = ppcg_kernel_sync;
2852
2853         space = isl_ast_build_get_schedule_space(build);
2854         space = isl_space_from_domain(space);
2855         space = isl_space_set_tuple_name(space, isl_dim_out, "sync");
2856         expr = isl_ast_build_call_from_pw_multi_aff(build,
2857                     isl_pw_multi_aff_from_multi_aff(isl_multi_aff_zero(space)));
2858         node = isl_ast_node_alloc_user(expr);
2859         isl_ast_build_free(build);
2860
2861         id = isl_id_alloc(gen->ctx, NULL, stmt);
2862         id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
2863         return isl_ast_node_set_annotation(node, id);
2864 }
2865
2866 /* This function is called during the code generation at the point
2867  * where the schedule domain element is completely determined by
2868  * the generated code.  The input schedule contains the original
2869  * statements as well as synchronization and copy "statements".
2870  * The latter are scheduled at different points than any of the original
2871  * statements, so they will only arrive here in isolation.
2872  *
2873  * If the current schedule only refers to a single statement,
2874  * we check if it is a copy or synchronization statement and
2875  * call the appropriate functions.
2876  * Otherwise, we assume we are dealing with the original statements
2877  * and we call create_domain_leaf.
2878  */
2879 static __isl_give isl_ast_node *create_kernel_leaf(
2880         __isl_take isl_ast_build *build, void *user)
2881 {
2882         struct gpu_gen *gen = (struct gpu_gen *) user;
2883         isl_map *map;
2884         isl_union_map *schedule;
2885         const char *name;
2886
2887         schedule = isl_ast_build_get_schedule(build);
2888
2889         if (isl_union_map_n_map(schedule) != 1)
2890                 return create_domain_leaf(schedule, build, user);
2891
2892         map = isl_map_from_union_map(schedule);
2893         name = isl_map_get_tuple_name(map, isl_dim_in);
2894         if (!strcmp(name, "read") || !strcmp(name, "write"))
2895                 return create_access_leaf(gen, map, build);
2896         if (!strcmp(name, "sync"))
2897                 return create_sync_leaf(gen, map, build);
2898
2899         return create_domain_leaf(isl_union_map_from_map(map), build, user);
2900 }
2901
2902 /* Mark all odd schedule dimensions as "atomic" (when the even dimensions
2903  * have value 0) and all even schedule dimensions as "unroll".
2904  *
2905  * That is, the options look as follows
2906  *
2907  *      { [0, b, 0, d, ..., 0] -> atomic[i] : exists a : i = 2 a + 1;
2908  *        [a, b, c, d, ..., z] -> unroll[i] : exists a : i = 2 a }
2909  *
2910  * The even positions are used to be able to schedule copying blocks
2911  * and synchronization before or after each level of the shared memory
2912  * tile loops and we want to make sure that code for these is generated
2913  * separately (within each level).
2914  */
2915 static __isl_give isl_ast_build *set_atomic_and_unroll(
2916         __isl_take isl_ast_build *build,
2917         __isl_take isl_space *space, int sched_len)
2918 {
2919         isl_ctx *ctx;
2920         isl_map *map;
2921         isl_constraint *c;
2922         isl_union_map *opt;
2923         isl_local_space *ls;
2924         int i, n;
2925
2926         ctx = isl_ast_build_get_ctx(build);
2927
2928         space = isl_space_params(space);
2929         space = isl_space_add_dims(space, isl_dim_set, sched_len);
2930         space = isl_space_from_domain(space);
2931         space = isl_space_add_dims(space, isl_dim_out, 2);
2932         map = isl_map_universe(isl_space_copy(space));
2933         for (i = 0; i < sched_len; i += 2)
2934                 map = isl_map_fix_si(map, isl_dim_in, i, 0);
2935         ls = isl_local_space_from_space(isl_map_get_space(map));
2936         c = isl_equality_alloc(ls);
2937         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
2938         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
2939         c = isl_constraint_set_constant_si(c, 1);
2940         map = isl_map_add_constraint(map, c);
2941         map = isl_map_project_out(map, isl_dim_out, 1, 1);
2942         map = isl_map_set_tuple_name(map, isl_dim_out, "atomic");
2943         opt = isl_union_map_from_map(map);
2944
2945         map = isl_map_universe(space);
2946         ls = isl_local_space_from_space(isl_map_get_space(map));
2947         c = isl_equality_alloc(ls);
2948         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 0, 1);
2949         c = isl_constraint_set_coefficient_si(c, isl_dim_out, 1, 2);
2950         map = isl_map_add_constraint(map, c);
2951         map = isl_map_project_out(map, isl_dim_out, 1, 1);
2952         map = isl_map_set_tuple_name(map, isl_dim_out, "unroll");
2953         opt = isl_union_map_add_map(opt, map);
2954
2955         build = isl_ast_build_set_options(build, opt);
2956
2957         return build;
2958 }
2959
2960 /* Return a map that maps a space of dimension gen->shared_len
2961  * to its last dimensions starting at gen->tile_first.
2962  * The range is of dimension
2963  *
2964  *      2 * (gen->shared_len - gen->tile_first) + 1
2965  *
2966  * The input dimensions are mapped to the odd dimensions in the output,
2967  * while the even dimensions (except 2*pos) are fixed to 0.
2968  * Output dimension 2*pos (if pos >= 0) is fixed to "val".
2969  * If pos >= 0, then only the pos first dimensions starting at gen->tile_first
2970  * are mapped to the output.  The remaining input dimensions are projected
2971  * out and the corresponding output dimensions are fixed to 0.
2972  */
2973 static __isl_give isl_map *insert_even(struct gpu_gen *gen,
2974         __isl_take isl_space *space, int pos, int val)
2975 {
2976         int i, n;
2977         isl_map *proj;
2978
2979         space = isl_space_set_from_params(space);
2980         space = isl_space_add_dims(space, isl_dim_set, gen->shared_len);
2981         space = isl_space_map_from_set(space);
2982         proj = isl_map_identity(space);
2983         proj = isl_map_project_out(proj, isl_dim_out, 0, gen->tile_first);
2984         n = gen->shared_len - gen->tile_first;
2985         for (i = 0; i <= n; ++i) {
2986                 proj = isl_map_insert_dims(proj, isl_dim_out, 2 * i, 1);
2987                 if (i == pos)
2988                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, val);
2989                 else
2990                         proj = isl_map_fix_si(proj, isl_dim_out, 2 * i, 0);
2991         }
2992
2993         if (pos < 0)
2994                 return proj;
2995
2996         proj = isl_map_eliminate(proj, isl_dim_in, gen->tile_first + pos,
2997                                 gen->shared_len - (gen->tile_first + pos));
2998         for (i = pos; i < n; ++i)
2999                 proj = isl_map_fix_si(proj, isl_dim_out, 2 * i + 1, 0);
3000
3001         return proj;
3002 }
3003
3004 /* Given the AST context schedule "schedule" and the mapping from
3005  * domains to the shared tile loops "shared_sched", add a schedule
3006  * for a synchronization operation at position "val" of loop level "pos".
3007  *
3008  * schedule is of the form
3009  *
3010  *      D -> L
3011  *
3012  * (with D the iteration domains and L the already generated loops),
3013  * while shared_sched is of the form
3014  *
3015  *      D -> S
3016  *
3017  * We combine them into
3018  *
3019  *      L -> S
3020  *
3021  * apply a mapping
3022  *
3023  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
3024  *
3025  * and use the result as a schedule for "sync".
3026  */
3027 static __isl_give isl_union_map *add_sync_schedule(struct gpu_gen *gen,
3028         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
3029         __isl_keep isl_union_map *shared_sched, int pos, int val)
3030 {
3031         isl_space *space;
3032         isl_map *proj, *map;
3033
3034         shared_sched = isl_union_map_copy(shared_sched);
3035         schedule = isl_union_map_copy(schedule);
3036
3037         space = isl_union_map_get_space(shared_sched);
3038         schedule = isl_union_map_apply_domain(shared_sched, schedule);
3039         map = isl_map_from_union_map(schedule);
3040
3041         proj = insert_even(gen, space, pos, val);
3042         map = isl_map_apply_range(map, proj);
3043         map = isl_map_from_range(isl_map_wrap(map));
3044         map = isl_map_set_tuple_name(map, isl_dim_in, "sync");
3045
3046         res = isl_union_map_add_map(res, map);
3047
3048         return res;
3049 }
3050
3051 /* Given a set of wrapped references "ref", return the corresponding
3052  * access relations based on the tagged access relations "tagged".
3053  *
3054  * The elements of "ref" are of the form
3055  *
3056  *      [D -> R]
3057  *
3058  * with D an iteration domains and R a reference.
3059  * The elements of "tagged" are of the form
3060  *
3061  *      [D -> R] -> A
3062  *
3063  * with A an array.
3064  *
3065  * Extend "tagged" to include the iteration domain in the range, i.e.,
3066  *
3067  *      [D -> R] -> [D -> A]
3068  *
3069  * apply the result to "ref" and then unwrap the resulting set
3070  * to obtain relations of the form
3071  *
3072  *      D -> A
3073  */
3074 static __isl_give isl_union_map *wrapped_reference_to_access(
3075         __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
3076 {
3077         isl_union_map *tag2access;
3078
3079         tag2access = isl_union_map_copy(tagged);
3080         tag2access = isl_union_map_universe(tag2access);
3081         tag2access = isl_union_set_unwrap(isl_union_map_domain(tag2access));
3082         tag2access = isl_union_map_domain_map(tag2access);
3083         tag2access = isl_union_map_range_product(tag2access, tagged);
3084
3085         ref = isl_union_set_coalesce(ref);
3086         ref = isl_union_set_apply(ref, tag2access);
3087
3088         return isl_union_set_unwrap(ref);
3089 }
3090
3091 /* Given an access relation "access" from "group", remove those reads
 * (if "read" is 1) or writes (if "read" is 0) that are only needed to
3093  * communicate data within the same iteration of the last_shared dimension
3094  * of the group.
3095  *
3096  * If the access is a read then it is either an element of
3097  *
3098  *      live_in union (range flow)
3099  *
3100  * where live_in and flow may be overapproximations, or
3101  * it reads an uninitialized value (that is not live-in because
3102  * there is an intermediate kill) or it reads a value that was
3103  * written within the same (compound) statement instance.
3104  * If the access is a write then it is either an element of
3105  *
3106  *      live_out union (domain flow)
3107  *
3108  * or it writes a value that is never read (and is not live-out
3109  * because of an intermediate kill) or only
3110  * within the same (compound) statement instance.
3111  * In both cases, the access relation is also a subset of
3112  * the group access relation.
3113  *
3114  * The cases where an uninitialized value is read or a value is written
3115  * that is never read or where the dataflow occurs within a statement
3116  * instance are also considered local and may also be removed.
3117  *
3118  * Essentially, we compute the intersection of "access" with either
3119  *
3120  *      live_in union (range non-local-flow)
3121  *
3122  * or
3123  *
3124  *      live_out union (domain non-local-flow)
3125  *
3126  * We first construct a relation "local"
3127  *
3128  *      [[D -> R] -> [D' -> R']]
3129  *
3130  * of pairs of domain iterations accessing the reference group
3131  * and references in the group that are scheduled to the same iteration
3132  * of the last_shared dimension.
3133  *
3134  * If this relation does not intersect the dataflow dependences,
3135  * then there is nothing we can possibly remove, unless the dataflow
3136  * dependences themselves only relate a subset of the accesses.
3137  * In particular, the accesses may not be involved in any dataflow
3138  * dependences, either because they are uninitialized reads/dead writes
3139  * or because the dataflow occurs inside a statement instance.
3140  *
3141  * Since the computation below may break up the access relation
3142  * into smaller pieces, we only perform the intersection with
3143  * the non-local dependent accesses if the local pairs
3144  * intersect the dataflow dependences.  Otherwise, we intersect
3145  * with the universe of the non-local dependent accesses.
3146  * This should at least remove accesses from statements that
3147  * do not participate in any dependences.
3148  *
3149  * In particular, we remove the "local" dataflow dependences from
3150  * the set of all dataflow dependences.
3151  * Note that if the potential dataflow dependences are an overapproximation
3152  * of the actual dataflow dependences, then the result remains an
3153  * overapproximation of the non-local dataflow dependences.
3154  * Copying to/from global memory is only needed for the references
3155  * in the domain/range of the result or for accesses that are live out/in
3156  * for the entire scop.
3157  *
3158  * We therefore map the domain/range of the "external" relation
3159  * to the corresponding access relation and take the union with
3160  * the live out/in relation.
3161  */
3162 static __isl_give isl_union_map *remove_local_accesses(struct gpu_gen *gen,
3163         struct gpu_array_ref_group *group, __isl_take isl_union_map *access,
3164         int read)
3165 {
3166         int empty;
3167         isl_union_pw_multi_aff *tagger;
3168         isl_union_set *domain;
3169         isl_space *space;
3170         isl_union_map *sched, *local, *tagged, *external;
3171         isl_union_set *tag_set;
3172         isl_map *proj;
3173
3174         if (isl_union_map_is_empty(access))
3175                 return access;
3176
3177         tagged = group_tagged_access_relation(group);
3178
3179         sched = isl_union_map_copy(gen->sched);
3180
3181         space = isl_union_map_get_space(sched);
3182         proj = projection(space, gen->untiled_len, group->last_shared + 1);
3183         sched = isl_union_map_apply_range(sched, isl_union_map_from_map(proj));
3184
3185         tagger = isl_union_pw_multi_aff_copy(gen->prog->scop->tagger);
3186         domain = isl_union_map_domain(isl_union_map_copy(tagged));
3187         tagger = isl_union_pw_multi_aff_intersect_domain(tagger, domain);
3188         sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger);
3189
3190         local = isl_union_map_apply_range(sched,
3191                             isl_union_map_reverse(isl_union_map_copy(sched)));
3192         local = isl_union_map_intersect(local,
3193                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow));
3194
3195         empty = isl_union_map_is_empty(local);
3196
3197         external = isl_union_map_copy(gen->prog->scop->tagged_dep_flow);
3198         external = isl_union_map_intersect_params(external,
3199                                 isl_set_copy(gen->prog->scop->context));
3200         external = isl_union_map_subtract(external, local);
3201
3202         if (read) {
3203                 tag_set = isl_union_map_range(external);
3204                 external = wrapped_reference_to_access(tag_set, tagged);
3205                 external = isl_union_map_union(external,
3206                                 isl_union_map_copy(gen->prog->scop->live_in));
3207         } else {
3208                 tag_set = isl_union_map_domain(external);
3209                 external = wrapped_reference_to_access(tag_set, tagged);
3210                 external = isl_union_map_union(external,
3211                                 isl_union_map_copy(gen->prog->scop->live_out));
3212         }
3213
3214         if (empty < 0)
3215                 external = isl_union_map_free(external);
3216         else if (empty)
3217                 external = isl_union_map_universe(external);
3218
3219         access = isl_union_map_intersect(access, external);
3220
3221         return access;
3222 }
3223
3224 /* Given the AST context schedule "schedule" and the mapping from
3225  * domains to the shared tile loops "shared_sched", add a schedule
3226  * for copying an array reference group to/from shared/private memory.
3227  * "read" is set if data should be copied from global memory
3228  * to shared/private memory.
3229  * "k" represents the current group
3230  * "s" is the total number of groups
3231  *
3232  * We schedule an operation before or after the innermost loop
3233  * of "shared_sched" that affects the tile of the array reference group.
3234  *
3235  * schedule is of the form
3236  *
3237  *      D -> L
3238  *
3239  * (with D the iteration domains and L the already generated loops),
3240  * while shared_sched is of the form
3241  *
3242  *      D -> S
3243  *
3244  * We first compute the access relation for the reference group
3245  *
3246  *      D -> A
3247  *
3248  * and remove from this access relation those reads or writes
3249  * that only needed to communicate data within the same iteration
3250  * of the last_shared dimension of the group.
3251  * We then combine what is left with shared_sched into
3252  *
3253  *      D -> [S -> A]
3254  *
3255  * If this results in an empty relation, no copying needs to be performed
3256  * at this point.
3257  * Otherwise, we invert the relation and combine it with "schedule" into
3258  *
3259  *      [S -> A] -> L
3260  *
3261  * The actual additional piece of the schedule is obtained from combining
3262  *
3263  *      [S -> A] -> S
3264  *
3265  * with a mapping
3266  *
3267  *      [s_0,...] -> [0,s_{tile_first},0,..., val, 0, 0, ... 0]
3268  *
3269  * The position of "val" corresponds to the innermost loop that affects
3270  * the tile and the value indicates where the copying is scheduled
3271  * with respect to the actual kernel code (at value 0).
 * Reads are scheduled before the code, writes to global memory from
3273  * private memory are scheduled at values 1 to s, writes to global
3274  * memory from shared memory are scheduled at values s + 2 to 2 * s + 1.
3275  *
3276  * If we are scheduling a read from global memory to shared memory,
3277  * we insert a synchronization before the kernel code (at the innermost
3278  * level).
3279  * If we are scheduling a write to global memory, then we add
 * a synchronization after all writes (at value 2 * s + 2).
3281  * However, there is no need for a synchronization after the outermost loop.
3282  * A write to global memory from private memory at the innermost level
3283  * does not require a synchronization, because it is covered by
3284  * the synchronization after the kernel inserted by body_schedule.
3285  */
3286 static __isl_give isl_union_map *add_group_schedule(struct gpu_gen *gen,
3287         __isl_take isl_union_map *res, __isl_keep isl_union_map *schedule,
3288         __isl_keep isl_union_map *shared_sched,
3289         struct gpu_array_ref_group *group, int read, int k, int s)
3290 {
3291         int n;
3292         int pos, val;
3293         isl_space *space;
3294         isl_union_map *access;
3295         isl_map *map, *proj, *access_map;
3296         isl_id *id;
3297
3298         access = gpu_array_ref_group_access_relation(group, read, !read);
3299         access = remove_local_accesses(gen, group, access, read);
3300         access = isl_union_map_range_product(isl_union_map_copy(shared_sched),
3301                                                 access);
3302
3303         if (isl_union_map_is_empty(access)) {
3304                 isl_union_map_free(access);
3305                 return res;
3306         }
3307
3308         access = isl_union_map_reverse(access);
3309         access = isl_union_map_apply_range(access,
3310                                             isl_union_map_copy(schedule));
3311         access_map = isl_map_from_union_map(access);
3312
3313         space = isl_space_copy(group->array->space);
3314         space = isl_space_from_range(space);
3315         space = isl_space_add_dims(space, isl_dim_in, gen->shared_len);
3316         map = isl_map_domain_map(isl_map_universe(space));
3317
3318         space = isl_union_map_get_space(schedule);
3319         pos = group->last_shared + 1 - gen->tile_first;
3320         assert(pos >= 0);
3321         if (read)
3322                 val = -2 - k;
3323         else if (group->private_tile)
3324                 val = 1 + k;
3325         else
3326                 val = 1 + s + 1 + k;
3327         proj = insert_even(gen, space, pos, val);
3328         map = isl_map_apply_range(map, proj);
3329
3330         access_map = isl_map_range_product(access_map, map);
3331
3332         id = isl_id_alloc(gen->ctx, read ? "read" : "write", group);
3333         access_map = isl_map_set_tuple_id(access_map, isl_dim_in, id);
3334
3335         res = isl_union_map_add_map(res, access_map);
3336
3337         n = gen->shared_len - gen->tile_first;
3338         if (read) {
3339                 if (!group->private_tile)
3340                         res = add_sync_schedule(gen, res, schedule,
3341                                                 shared_sched, n, -1);
3342         } else {
3343                 if (pos == 0)
3344                         return res;
3345                 if (pos == n && group->private_tile)
3346                         return res;
3347                 res = add_sync_schedule(gen, res, schedule, shared_sched,
3348                                         pos, 2 * s + 2);
3349         }
3350
3351         return res;
3352 }
3353
3354 /* Return a schedule for the shared tile loops based on the current
3355  * AST context schedule.
3356  *
3357  * We create a "shared_sched" that maps the domains to the first
3358  * shared_len dimensions of the computed schedule, project out the
3359  * first tile_first dimensions (as these are already covered by
3360  * the host code) and insert "statement-level" dimensions at even
3361  * positions so that we can schedule copy blocks and synchronization
3362  * before/after each level.
3363  *
3364  * In particular, copy blocks are inserted inside the innermost
 * level that affects the tile.  For the copying to global memory,
3366  * those from private memory are scheduled before those from shared
3367  * memory such that synchronization can be inserted between the two
3368  * at the innermost level.
3369  * Synchronization is inserted at the innermost level before the
3370  * actual kernel code if there is any copying from global memory
3371  * to shared memory.  It is inserted unconditionally at the innermost
3372  * level after the actual kernel code and the copying to global memory
3373  * from private memory (if any).  Finally, it is inserted after
3374  * any copying to global memory, except at the outermost level
3375  * and at the innermost level if there is no copying from shared
3376  * memory.  The copying from private memory is covered by the unconditional
3377  * synchronization at the innermost level.
3378  */
3379 static __isl_give isl_union_map *body_schedule(struct gpu_gen *gen,
3380         __isl_take isl_union_map *schedule)
3381 {
3382         isl_space *space;
3383         isl_union_map *res;
3384         isl_union_map *shared_sched;
3385         isl_union_map *sched;
3386         isl_map *proj, *map;
3387         int i, j, k, s;
3388
3389         shared_sched = isl_union_map_copy(gen->tiled_sched);
3390         proj = projection(isl_union_map_get_space(shared_sched),
3391                                 gen->tiled_len, gen->shared_len);
3392         shared_sched = isl_union_map_apply_range(shared_sched,
3393                                 isl_union_map_from_map(proj));
3394         space = isl_union_map_get_space(shared_sched);
3395         proj = insert_even(gen, space, -1, 0);
3396         sched = isl_union_map_apply_range(isl_union_map_copy(shared_sched),
3397                                 isl_union_map_from_map(proj));
3398
3399         res = isl_union_map_range_product(isl_union_map_copy(schedule), sched);
3400
3401         s = 0;
3402         for (i = 0; i < gen->kernel->n_array; ++i)
3403                 s += gen->kernel->array[i].n_group;
3404
3405         k = 0;
3406         for (i = 0; i < gen->kernel->n_array; ++i) {
3407                 struct gpu_local_array_info *array = &gen->kernel->array[i];
3408
3409                 for (j = 0; j < array->n_group; ++j) {
3410                         struct gpu_array_ref_group *group;
3411
3412                         group = array->groups[j];
3413                         if (!group->private_tile && !group->shared_tile)
3414                                 continue;
3415                         res = add_group_schedule(gen, res, schedule,
3416                                                 shared_sched, group, 0, k, s);
3417                         res = add_group_schedule(gen, res, schedule,
3418                                                 shared_sched, group, 1, k, s);
3419                         ++k;
3420                 }
3421         }
3422
3423         res = add_sync_schedule(gen, res, schedule, shared_sched,
3424                             gen->shared_len - gen->tile_first, 1 + s);
3425
3426         isl_union_map_free(shared_sched);
3427         isl_union_map_free(schedule);
3428
3429         return res;
3430 }
3431
3432 /* Generate code for "kernel" in the given "context".
3433  *
3434  * We first generate code for the shared tile loops (T1T, T1P and T2)
3435  * in a context that includes the block ids.
3436  * Within each iteration of these loops an additional code generation
3437  * is performed (within create_kernel_leaf) for the rest of the schedule
3438  * in a context that includes the thread ids.
3439  */
3440 static __isl_give isl_ast_node *generate_kernel(struct gpu_gen *gen,
3441         __isl_keep isl_ast_build *build, __isl_keep isl_set *host_domain,
3442         __isl_keep isl_multi_pw_aff *grid_size)
3443 {
3444         isl_space *space;
3445         isl_set *set;
3446         isl_id_list *iterators;
3447         isl_union_map *schedule;
3448         isl_ast_node *tree;
3449         int sched_len;
3450
3451         schedule = isl_ast_build_get_schedule(build);
3452
3453         build = isl_ast_build_copy(build);
3454         build = isl_ast_build_restrict(build, isl_set_copy(host_domain));
3455         space = isl_ast_build_get_schedule_space(build);
3456         set = isl_set_universe(isl_space_copy(space));
3457         set = add_bounded_parameters_dynamic(set, grid_size,
3458                                                 gen->kernel->block_ids);
3459         build = isl_ast_build_restrict(build, set);
3460
3461         schedule = body_schedule(gen, schedule);
3462
3463         sched_len = 2 * (gen->shared_len - gen->tile_first) + 1;
3464
3465         build = set_atomic_and_unroll(build, space, sched_len);
3466         iterators = ppcg_scop_generate_names(gen->prog->scop, sched_len, "g");
3467         build = isl_ast_build_set_iterators(build, iterators);
3468         build = isl_ast_build_set_create_leaf(build, &create_kernel_leaf, gen);
3469         tree = isl_ast_build_node_from_schedule_map(build, schedule);
3470         isl_ast_build_free(build);
3471
3472         return tree;
3473 }
3474
3475 /* Attach "id" to the given node.
3476  */
3477 static __isl_give isl_ast_node *attach_id(__isl_take isl_ast_node *node,
3478         __isl_keep isl_ast_build *build, void *user)
3479 {
3480         isl_id *id = user;
3481
3482         node = isl_ast_node_set_annotation(node, id);
3483
3484         return node;
3485 }
3486
3487 /* Construct an AST node for performing a kernel launch and attach
3488  * the information about the kernel to that node.
3489  * "kernel_id" has name "kernel" and contains a pointer
3490  * to the ppcg_kernel structure.
3491  *
3492  * The kernel AST has been constructed in the context of the range
3493  * of "schedule".  In particular, the grid size has been computed
3494  * in the context.  We therefore still need to make sure that these
3495  * constraints are expressed in the code.  We do this by creating a schedule
3496  *
3497  *      kernel[] -> [S -> []]
3498  *
3499  * where S is the schedule domain, i.e., the range of "schedule".
3500  * The AST generation will then create a single call surrounded by
3501  * all the condition in "S" that have not been expressed yet.
3502  *
3503  * The kernel information is attached to this node in attach_id.
3504  */
3505 static __isl_give isl_ast_node *construct_launch(
3506         __isl_take isl_ast_build *build, __isl_take isl_union_map *schedule,
3507         __isl_take isl_id *kernel_id)
3508 {
3509         isl_ctx *ctx;
3510         isl_union_set *domain;
3511         isl_set *set;
3512         isl_map *map;
3513         isl_ast_node *node;
3514
3515         ctx = isl_ast_build_get_ctx(build);
3516
3517         domain = isl_union_map_range(schedule);
3518         set = isl_set_from_union_set(domain);
3519         map = isl_map_from_domain(set);
3520         map = isl_map_from_range(isl_map_wrap(map));
3521         map = isl_map_set_tuple_name(map, isl_dim_in, "kernel");
3522         schedule = isl_union_map_from_map(map);
3523
3524         build = isl_ast_build_set_at_each_domain(build, &attach_id, kernel_id);
3525         node = isl_ast_build_node_from_schedule_map(build, schedule);
3526         isl_ast_build_free(build);
3527
3528         return node;
3529 }
3530
3531 /* This function is called for each leaf in the AST of the host code.
3532  * We first specialize the schedule to the site of the leaf, compute
3533  * the size of shared memory and then construct the body of the host code
3534  * and the associated kernel.
3535  *
3536  * The necessary information for printing the kernel launch is
3537  * stored in the struct ppcg_kernel that was created in create_kernel and
3538  * attached to an outer mark node in the schedule tree.
3539  * Note that this assumes that a kernel is only launched once.
3540  * The kernel pointer itself is stored in gen->kernel by before_mark,
3541  * while the isl_id containing this pointer is stored in gen->kernel_mark.
3542  * The latter is attached to the leaf AST node created to represent the launch.
3543  */
3544 static __isl_give isl_ast_node *create_host_leaf(
3545         __isl_take isl_ast_build *build, void *user)
3546 {
3547         struct gpu_gen *gen = (struct gpu_gen *) user;
3548         isl_id *id;
3549         isl_ast_node *node;
3550         struct ppcg_kernel *kernel;
3551         isl_set *host_domain;
3552         isl_union_map *schedule;
3553         isl_union_map *local_sched;
3554         isl_union_set *domain;
3555         int i;
3556
3557         schedule = isl_ast_build_get_schedule(build);
3558
3559         kernel = gen->kernel;
3560         if (!kernel)
3561                 goto error;
3562
3563         read_sizes(gen);
3564
3565         domain = isl_union_map_domain(isl_union_map_copy(schedule));
3566
3567         local_sched = isl_union_map_copy(gen->sched);
3568         local_sched = isl_union_map_intersect_domain(local_sched, domain);
3569
3570         kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop,
3571                                                 kernel->n_grid, "b");
3572         kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop,
3573                                                 kernel->n_block, "t");
3574
3575         gen->tiled_sched = tile_schedule(gen, local_sched);
3576         gen->tiled_sched = parametrize_tiled_schedule(gen, gen->tiled_sched);
3577         gen->tiled_sched = scale_tile_loops(gen, gen->tiled_sched);
3578
3579         gen->local_sched = isl_union_map_copy(gen->tiled_sched);
3580         gen->local_sched = thread_tile_schedule(gen, gen->local_sched);
3581         gen->local_sched = scale_thread_tile_loops(gen, gen->local_sched);
3582
3583         kernel->grid_size = extract_grid_size(gen, kernel);
3584         extract_block_size(gen, kernel);
3585         kernel->space = isl_ast_build_get_schedule_space(build);
3586
3587         compute_shared_sched(gen);
3588         gen->privatization = compute_privatization(gen);
3589         if (gpu_group_references(gen) < 0)
3590                 schedule = isl_union_map_free(schedule);
3591         host_domain = isl_set_from_union_set(isl_union_map_range(
3592                                                 isl_union_map_copy(schedule)));
3593         localize_bounds(gen, kernel, host_domain);
3594
3595         gen->local_sched = interchange_for_unroll(gen, gen->local_sched);
3596         check_shared_memory_bound(gen);
3597         compute_group_tilings(gen);
3598
3599         kernel->tree = generate_kernel(gen, build, host_domain,
3600                                         kernel->grid_size);
3601         create_kernel_vars(gen, kernel);
3602
3603         isl_map_free(gen->privatization);
3604         isl_union_map_free(gen->local_sched);
3605         isl_union_map_free(gen->tiled_sched);
3606         isl_union_map_free(gen->shared_sched);
3607         isl_union_map_free(gen->shared_proj);
3608         isl_set_free(host_domain);
3609
3610         node = construct_launch(build, schedule, isl_id_copy(gen->kernel_mark));
3611
3612         return node;
3613 error:
3614         isl_union_map_free(schedule);
3615         return NULL;
3616 }
3617
3618 /* This function is called before the AST generator starts traversing
3619  * the schedule subtree of a node with mark "mark".
3620  *
3621  * If the mark is called "kernel", store the mark itself in gen->kernel_mark
3622  * and the kernel pointer in gen->kernel for use in create_host_leaf.
3623  */
3624 static int before_mark(__isl_keep isl_id *mark,
3625         __isl_keep isl_ast_build *build, void *user)
3626 {
3627         struct gpu_gen *gen = user;
3628
3629         if (!mark)
3630                 return -1;
3631         if (!strcmp(isl_id_get_name(mark), "kernel")) {
3632                 gen->kernel_mark = isl_id_copy(mark);
3633                 gen->kernel = isl_id_get_user(mark);
3634         }
3635         return 0;
3636 }
3637
3638 /* This function is called after the AST generator has finished traversing
3639  * the schedule subtree of a mark node.  "node" points to the corresponding
3640  * mark AST node.
3641  *
3642  * If the mark is called "kernel", then clear kernel and gen->kernel_mark.
3643  */
3644 static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node,
3645         __isl_keep isl_ast_build *build, void *user)
3646 {
3647         struct gpu_gen *gen = user;
3648         isl_id *id;
3649
3650         id = isl_ast_node_mark_get_id(node);
3651         if (!id)
3652                 return isl_ast_node_free(node);
3653         if (!strcmp(isl_id_get_name(id), "kernel") && gen->kernel) {
3654                 gen->kernel_mark = isl_id_free(gen->kernel_mark);
3655                 gen->kernel = NULL;
3656         }
3657
3658         isl_id_free(id);
3659         return node;
3660 }
3661
3662 /* Use isl to generate host code from gen->host_schedule, which corresponds to
3663  * the outer gen->tile_first loops of the global schedule in gen->sched.
3664  * Within each iteration of this partial schedule, i.e., for each kernel
3665  * launch, create_host_leaf takes care of generating the kernel code.
3666  * The ppcg_kernel objects are stored in mark nodes in the schedule
3667  * tree and are extracted in before_mark.
3668  */
3669 static __isl_give isl_ast_node *generate_host_code(struct gpu_gen *gen)
3670 {
3671         isl_ast_build *build;
3672         isl_ast_node *tree;
3673         isl_schedule *schedule;
3674         isl_id_list *iterators;
3675
3676         isl_options_set_ast_build_group_coscheduled(gen->ctx, 1);
3677         build = isl_ast_build_from_context(isl_set_copy(gen->prog->context));
3678         iterators = ppcg_scop_generate_names(gen->prog->scop,
3679                                                 gen->tile_first, "h");
3680         build = isl_ast_build_set_iterators(build, iterators);
3681         build = isl_ast_build_set_create_leaf(build, &create_host_leaf, gen);
3682         build = isl_ast_build_set_before_each_mark(build, &before_mark, gen);
3683         build = isl_ast_build_set_after_each_mark(build, &after_mark, gen);
3684         schedule = isl_schedule_copy(gen->host_schedule);
3685         tree = isl_ast_build_node_from_schedule(build, schedule);
3686         isl_ast_build_free(build);
3687
3688         return tree;
3689 }
3690
3691 __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
3692 {
3693         if (!str)
3694                 return NULL;
3695         return isl_union_map_read_from_str(ctx, str);
3696 }
3697
/* Information about the outermost tilable bands in the forest of bands.
 *
 * gen points to the struct gpu_gen this information is collected for
 * (NOTE(review): how it is used is not visible in this part of the file).
 *
 * prefix is the (padded) schedule leading up to the outermost tilable bands.
 *
 * tile_first is the number of schedule dimensions in prefix.
 *
 * suffix is the schedule of the outermost tilable bands and their descendants.
 */
struct band_info {
        struct gpu_gen *gen;
        int tile_first;
        isl_union_map *prefix;
        isl_union_map *suffix;
};
3712
3713 /* Extract the set of parameter values and outer schedule dimensions
3714  * for which any statement instance
3715  * in the kernel inserted at "node" needs to be executed.
3716  * Intersect the set of parameter values derived from the host schedule
3717  * relation with the context of "prog".
3718  */
3719 static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node,
3720         struct gpu_prog *prog)
3721 {
3722         isl_union_map *schedule;
3723         isl_union_set *schedule_domain;
3724         isl_set *context;
3725         int empty;
3726
3727         schedule = isl_schedule_node_get_prefix_schedule_relation(node);
3728         schedule_domain = isl_union_map_range(schedule);
3729         empty = isl_union_set_is_empty(schedule_domain);
3730         if (empty < 0) {
3731                 isl_union_set_free(schedule_domain);
3732                 return NULL;
3733         }
3734         if (empty) {
3735                 int depth;
3736                 isl_space *space;
3737
3738                 space = isl_union_set_get_space(schedule_domain);
3739                 isl_union_set_free(schedule_domain);
3740                 space = isl_space_set_from_params(space);
3741                 depth = isl_schedule_node_get_schedule_depth(node);
3742                 space = isl_space_add_dims(space, isl_dim_set, depth);
3743                 context = isl_set_empty(space);
3744         } else {
3745                 context = isl_set_from_union_set(schedule_domain);
3746         }
3747         context = isl_set_intersect_params(context,
3748                                             isl_set_copy(prog->context));
3749
3750         return context;
3751 }
3752
3753 /* Return the set of outer array elements accessed by
3754  * by the statement instance in "domain" in "prog".
3755  */
3756 static __isl_give isl_union_set *accessed_by_domain(
3757         __isl_take isl_union_set *domain, struct gpu_prog *prog)
3758 {
3759         isl_union_map *access;
3760         isl_union_set *arrays;
3761
3762         access = isl_union_map_union(isl_union_map_copy(prog->read),
3763                                      isl_union_map_copy(prog->may_write));
3764         access = isl_union_map_intersect_domain(access, domain);
3765         arrays = isl_union_map_range(access);
3766         arrays = isl_union_set_apply(arrays,
3767                                 isl_union_map_copy(prog->to_outer));
3768
3769         return arrays;
3770 }
3771
3772 /* Return the number of outer band members of the band node "node"
3773  * that are marked coincident.
3774  */
3775 static int n_outer_coincidence(__isl_keep isl_schedule_node *node)
3776 {
3777         int i, n;
3778
3779         n = isl_schedule_node_band_n_member(node);
3780
3781         for (i = 0; i < n; ++i)
3782                 if (!isl_schedule_node_band_member_get_coincident(node, i))
3783                         break;
3784
3785         return i;
3786 }
3787
3788 /* Mark all dimensions in the current band node atomic.
3789  */
3790 static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node)
3791 {
3792         int i, n;
3793
3794         n = isl_schedule_node_band_n_member(node);
3795         for (i = 0; i < n; ++i)
3796                 node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
3797                                                         isl_ast_loop_atomic);
3798
3799         return node;
3800 }
3801
3802 /* Mark "node" atomic, if it is a band node.
3803  * Do the same for all ancestors.
3804  * Return a pointer to "node" (in the updated schedule tree).
3805  */
3806 static __isl_give isl_schedule_node *atomic_ancestors(
3807         __isl_take isl_schedule_node *node)
3808 {
3809         int pos;
3810
3811         if (!node)
3812                 return NULL;
3813         if (!isl_schedule_node_has_parent(node))
3814                 return node;
3815
3816         pos = isl_schedule_node_get_child_position(node);
3817         node = isl_schedule_node_parent(node);
3818         if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
3819                 node = atomic(node);
3820         node = atomic_ancestors(node);
3821         node = isl_schedule_node_child(node, pos);
3822
3823         return node;
3824 }
3825
3826 /* Group the domain elements into a single space, named kernelX,
3827  * with X the kernel sequence number "kernel_id".
3828  */
3829 static __isl_give isl_schedule_node *group_statements(
3830         __isl_take isl_schedule_node *node, int kernel_id)
3831 {
3832         char buffer[20];
3833         isl_id *id;
3834
3835         if (!node)
3836                 return NULL;
3837
3838         snprintf(buffer, sizeof(buffer), "kernel%d", kernel_id);
3839         id = isl_id_alloc(isl_schedule_node_get_ctx(node), buffer, NULL);
3840         return isl_schedule_node_group(node, id);
3841 }
3842
3843 /* Create a ppcg_kernel representing the domain instances that reach "node"
3844  * and replace the subtree at "node" by a mark node pointing
3845  * to the ppcg_kernel.
3846  * Mark all outer band nodes as atomic to ensure each kernel is only
3847  * scheduled once.
3848  * If the domain elements that reach "node" live in more than one space,
3849  * then group the domain elements into a single space, named kernelX,
3850  * with X the kernel sequence number.
3851  *
3852  * We keep a copy of the isl_id that points to the kernel to ensure
3853  * that the kernel does not get destroyed if the schedule node
3854  * is freed due to some error condition.
3855  */
3856 static __isl_give isl_schedule_node *create_kernel(struct gpu_gen *gen,
3857         __isl_take isl_schedule_node *node)
3858 {
3859         struct ppcg_kernel *kernel;
3860         isl_id *id;
3861         isl_union_set *domain;
3862         int single_statement;
3863
3864         kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
3865         kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog);
3866         if (!kernel)
3867                 return isl_schedule_node_free(node);
3868
3869         domain = isl_schedule_node_get_domain(node);
3870         single_statement = isl_union_set_n_set(domain) == 1;
3871
3872         kernel->ctx = gen->ctx;
3873         kernel->options = gen->options;
3874         kernel->context = extract_context(node, gen->prog);
3875         kernel->arrays = accessed_by_domain(domain, gen->prog);
3876         kernel->tile_len = isl_schedule_node_band_n_member(node);
3877         kernel->n_parallel = n_outer_coincidence(node);
3878         kernel->n_grid = kernel->n_parallel;
3879         kernel->n_block = kernel->n_parallel;
3880         kernel->id = gen->kernel_id++;
3881
3882         node = atomic_ancestors(node);
3883
3884         id = isl_id_alloc(gen->ctx, "kernel", kernel);
3885         id = isl_id_set_free_user(id, &ppcg_kernel_free_wrap);
3886         node = isl_schedule_node_insert_mark(node, isl_id_copy(id));
3887
3888         if (!single_statement)
3889                 node = group_statements(node, kernel->id);
3890
3891         node = isl_schedule_node_child(node, 0);
3892         node = isl_schedule_node_cut(node);
3893         node = isl_schedule_node_parent(node);
3894
3895         if (!single_statement)
3896                 node = isl_schedule_node_parent(node);
3897
3898         isl_id_free(id);
3899         return node;
3900 }
3901
3902 /* Insert a zero-dimensional permutable band at "node".
3903  */
3904 static __isl_give isl_schedule_node *insert_empty_permutable_band(
3905         __isl_take isl_schedule_node *node)
3906 {
3907         isl_space *space;
3908         isl_schedule *schedule;
3909         isl_union_set *domain;
3910         isl_multi_union_pw_aff *mupa;
3911
3912         schedule = isl_schedule_node_get_schedule(node);
3913         domain = isl_schedule_get_domain(schedule);
3914         space = isl_union_set_get_space(domain);
3915         isl_union_set_free(domain);
3916         isl_schedule_free(schedule);
3917
3918         space = isl_space_set_from_params(space);
3919         mupa = isl_multi_union_pw_aff_zero(space);
3920         node = isl_schedule_node_insert_partial_schedule(node, mupa);
3921         node = isl_schedule_node_band_set_permutable(node, 1);
3922
3923         return node;
3924 }
3925
3926 /* Mark "node" as outer permutable.
3927  *
3928  * If "node" originally points to a leaf, then insert a zero-dimensional
3929  * permutable band such that we can assume that "node" always
3930  * points to a band node.
3931  *
3932  * Create a kernel representing the domain instances that reach "node" and
3933  * replace the band node with a mark node pointing to the kernel.
3934  */
3935 static __isl_give isl_schedule_node *mark_outer_permutable(
3936         struct gpu_gen *gen, __isl_take isl_schedule_node *node)
3937 {
3938         if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
3939                 node = insert_empty_permutable_band(node);
3940
3941         node = create_kernel(gen, node);
3942
3943         return node;
3944 }
3945
3946 static __isl_give isl_schedule_node *select_outer_band(struct gpu_gen *gen,
3947         __isl_take isl_schedule_node *node, int pos, struct band_info *info);
3948
3949 /* Check if this band node is tilable and has any parallel loops.  If so,
3950  * take it as the outermost tilable band.  If not, continue looking for the
3951  * outermost tilable band in the children of the current band.
3952  * Return a pointer to the same node in a tree where all outermost tilable
3953  * bands in the current subtree have been replaced by mark nodes
3954  * containing a pointer to a ppcg_kernel object.
3955  */
3956 static __isl_give isl_schedule_node *band_select_outer_band(struct gpu_gen *gen,
3957         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
3958 {
3959         int n = isl_schedule_node_band_n_member(node);
3960         int n_parallel;
3961
3962         n_parallel = n_outer_coincidence(node);
3963
3964         if (!isl_schedule_node_band_get_permutable(node) || n_parallel == 0) {
3965                 node = isl_schedule_node_child(node, 0);
3966                 node = select_outer_band(gen, node, pos + n, info);
3967                 return isl_schedule_node_parent(node);
3968         }
3969
3970         gen->any_parallelism = 1;
3971         info->gen = gen;
3972         info->tile_first = pos;
3973         info->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
3974         info->suffix = isl_schedule_node_get_subtree_schedule_union_map(node);
3975
3976         node = mark_outer_permutable(gen, node);
3977
3978         return node;
3979 }
3980
3981 /* Extend "umap" with coordinates with fixed value "val"
3982  * to a total length of "dst_len", assuming the original dimension is "src_len".
3983  */
3984 static __isl_give isl_union_map *extend_range(
3985         __isl_take isl_union_map *umap, int src_len, int dst_len, int val)
3986 {
3987         isl_space *dim;
3988         isl_map *map;
3989         int i;
3990
3991         dim = isl_union_map_get_space(umap);
3992         map = isl_map_reverse(projection(dim, dst_len, src_len));
3993         for (i = src_len; i < dst_len; ++i)
3994                 map = isl_map_fix_si(map, isl_dim_out, i, val);
3995
3996         umap = isl_union_map_apply_range(umap, isl_union_map_from_map(map));
3997
3998         return umap;
3999 }
4000
4001 /* Select the outermost bands in the elements of the sequence or set
4002  * node "node", align their prefix schedules and combine the resulting
4003  * prefix and suffix schedules into a single pair of prefix and
4004  * suffix schedules for the entire list.
4005  * Return a pointer to the same node in a tree where all outermost tilable
4006  * bands in the current subtree have been replaced by mark nodes
4007  * containing a pointer to a ppcg_kernel object.
4008  */
4009 static __isl_give isl_schedule_node *list_select_outer_band(
4010         struct gpu_gen *gen, __isl_take isl_schedule_node *node, int pos,
4011         struct band_info *list_info)
4012 {
4013         int i;
4014         int n = isl_schedule_node_n_children(node);
4015         isl_ctx *ctx = isl_schedule_node_get_ctx(node);
4016         struct band_info *info;
4017         int max_tile_first;
4018         isl_union_map *prefix;
4019         isl_union_map *suffix;
4020
4021         assert(n >= 1);
4022         info = isl_calloc_array(ctx, struct band_info, n);
4023         assert(info);
4024
4025         max_tile_first = 0;
4026         for (i = 0; i < n; ++i) {
4027                 node = isl_schedule_node_child(node, i);
4028                 node = select_outer_band(gen, node, pos, &info[i]);
4029                 if (info[i].tile_first > max_tile_first)
4030                         max_tile_first = info[i].tile_first;
4031                 node = isl_schedule_node_parent(node);
4032         }
4033
4034         for (i = 0; i < n; ++i) {
4035                 if (info[i].tile_first == max_tile_first)
4036                         continue;
4037                 info[i].prefix = extend_range(info[i].prefix,
4038                                         info[i].tile_first, max_tile_first, 0);
4039                 info[i].tile_first = max_tile_first;
4040         }
4041
4042         prefix = info[0].prefix;
4043         suffix = info[0].suffix;
4044
4045         for (i = 1; i < n; ++i) {
4046                 prefix = isl_union_map_union(prefix, info[i].prefix);
4047                 suffix = isl_union_map_union(suffix, info[i].suffix);
4048         }
4049
4050         list_info->tile_first = info[0].tile_first;
4051         list_info->prefix = prefix;
4052         list_info->suffix = suffix;
4053
4054         free(info);
4055         return node;
4056 }
4057
4058 /* If we reach a leaf node, then we have not found any outer tilable
4059  * band with parallel loops, so consider the leaf node as the outermost
4060  * tilable band.
4061  * Return a pointer to a mark node containing a pointer
4062  * to a ppcg_kernel object inserted at the original leaf node.
4063  */
4064 static __isl_give isl_schedule_node *leaf_select_outer_band(struct gpu_gen *gen,
4065         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
4066 {
4067         info->gen = gen;
4068         info->tile_first = pos;
4069         info->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
4070         info->suffix = isl_schedule_node_get_subtree_schedule_union_map(node);
4071
4072         node = mark_outer_permutable(gen, node);
4073
4074         return node;
4075 }
4076
4077 /* Select the outermost tilable band in the subtree that "node" points to and
4078  * return a pointer to the same node in a tree where all outermost tilable
4079  * bands in the current subtree have been replaced by mark nodes
4080  * containing a pointer to a ppcg_kernel object.
4081  */
4082 static __isl_give isl_schedule_node *select_outer_band(struct gpu_gen *gen,
4083         __isl_take isl_schedule_node *node, int pos, struct band_info *info)
4084 {
4085         enum isl_schedule_node_type type;
4086
4087         type = isl_schedule_node_get_type(node);
4088         switch (type) {
4089         case isl_schedule_node_domain:
4090         case isl_schedule_node_filter:
4091                 node = isl_schedule_node_child(node, 0);
4092                 node = select_outer_band(gen, node, pos, info);
4093                 return isl_schedule_node_parent(node);
4094         case isl_schedule_node_leaf:
4095                 return leaf_select_outer_band(gen, node, pos, info);
4096         case isl_schedule_node_band:
4097                 return band_select_outer_band(gen, node, pos, info);
4098         case isl_schedule_node_set:
4099         case isl_schedule_node_sequence:
4100                 return list_select_outer_band(gen, node, pos, info);
4101         default:
4102                 isl_die(isl_schedule_node_get_ctx(node),
4103                         isl_error_unsupported, "unhandled schedule node type",
4104                         node = node);
4105         case isl_schedule_node_error:
4106                 info->prefix = NULL;
4107                 info->suffix = NULL;
4108                 break;
4109         }
4110
4111         return isl_schedule_node_free(node);
4112 }
4113
4114 /* Select the outermost tilable band that (by construction)
4115  * has at least one parallel loop.
4116  * The starting position of the aligned band is stored in the pair
4117  * gen->tile_first.
4118  * The sizes and number of parallel loops may be different in different
4119  * parts of the band forest and are therefore stored in the gpu_stmts.
4120  *
4121  * Return the complete schedule, with the tilable bands aligned
4122  * at gen->tile_first and padded with zero, if needed.
4123  * Store a schedule tree corresponding to the outer gen->tile_first
4124  * dimensions, with mark nodes containing pointers to ppcg_kernel objects,
4125  * in gen->host_schedule.
4126  */
4127 static __isl_give isl_union_map *select_outer_tilable_band(struct gpu_gen *gen,
4128         __isl_keep isl_schedule *schedule)
4129 {
4130         isl_schedule_node *node;
4131         struct band_info info;
4132
4133         node = isl_schedule_get_root(schedule);
4134         node = select_outer_band(gen, node, 0, &info);
4135         gen->host_schedule = isl_schedule_node_get_schedule(node);
4136         isl_schedule_node_free(node);
4137
4138         gen->tile_first = info.tile_first;
4139         info.suffix = align_range(info.suffix);
4140
4141         return isl_union_map_flat_range_product(info.prefix, info.suffix);
4142 }
4143
4144 /* Set gen->untiled_len to the number of scheduling dimensions
4145  * for the schedule of the first domain.
4146  * We assume here that this number is the same for all domains.
4147  */
4148 static int set_untiled_len(__isl_take isl_map *map, void *user)
4149 {
4150         unsigned *untiled_len = user;
4151
4152         *untiled_len = isl_map_dim(map, isl_dim_out);
4153
4154         isl_map_free(map);
4155         return -1;
4156 }
4157
4158 /* Compute an appropriate schedule based on the accesses in
4159  * gen->read and gen->write.
4160  *
4161  * We use the dependences in gen->prog->scop to compute
4162  * a schedule that has a parallel loop in each tilable band.
4163  * Finally, we select the outermost tilable band.
4164  *
4165  * If live range reordering is allowed, then we need to make sure
4166  * that live ranges on arrays are not run in parallel since doing
4167  * so would require array expansion.  We therefore add the array
4168  * order dependences to the coincidence dependences.  Non-zero array
4169  * order dependences will then prevent a schedule dimension from being
4170  * considered parallel.
4171  * Live ranges derived from scalars are allowed to be run in parallel
4172  * since we force the scalars to be mapped to private memory in
4173  * check_scalar_live_ranges.
4174  * If live range reordering is allowed, then the false dependences
4175  * are not added to the validity constraints as that would prevent
4176  * reordering.  Instead, the external false dependences that enforce that reads
4177  * from potentially live-in data precede any later write and
4178  * that writes of potentially live-out data follow any other earlier write
4179  * are added to the validity and the coincidence constraints.
4180  * The false dependences are still added to the proximity constraints
4181  * for consistency with the case where live range reordering is not allowed.
4182  * The coincidence constraints then consist of flow dependences,
4183  * external false dependences and array order dependences.
4184  * The independences can be filtered out from the first two sets.
4185  * They have already been filtered out from the array order dependences
4186  * on a per array basis in collect_order_dependences.
4187  * There is no need for a per array handling of the other two sets
4188  * as there should be no flow or external false dependence on local
4189  * variables that can be filtered out.
4190  */
4191 static void compute_schedule(struct gpu_gen *gen)
4192 {
4193         isl_union_set *domain;
4194         isl_union_map *dep_raw, *dep;
4195         isl_union_map *validity, *proximity, *coincidence;
4196         isl_union_map *sched;
4197         isl_schedule_constraints *sc;
4198         isl_schedule *schedule;
4199
4200         domain = isl_union_set_copy(gen->prog->scop->domain);
4201         sc = isl_schedule_constraints_on_domain(isl_union_set_copy(domain));
4202         sc = isl_schedule_constraints_set_context(sc,
4203                                 isl_set_copy(gen->prog->scop->context));
4204         if (gen->options->live_range_reordering) {
4205                 sc = isl_schedule_constraints_set_conditional_validity(sc,
4206                         isl_union_map_copy(gen->prog->scop->tagged_dep_flow),
4207                         isl_union_map_copy(gen->prog->scop->tagged_dep_order));
4208                 proximity = isl_union_map_copy(gen->prog->scop->dep_flow);
4209                 validity = isl_union_map_copy(proximity);
4210                 validity = isl_union_map_union(validity,
4211                             isl_union_map_copy(gen->prog->scop->dep_forced));
4212                 proximity = isl_union_map_union(proximity,
4213                             isl_union_map_copy(gen->prog->scop->dep_false));
4214                 coincidence = isl_union_map_copy(validity);
4215                 coincidence = isl_union_map_subtract(coincidence,
4216                         isl_union_map_copy(gen->prog->scop->independence));
4217                 coincidence = isl_union_map_union(coincidence,
4218                                 isl_union_map_copy(gen->prog->array_order));
4219         } else {
4220                 dep_raw = isl_union_map_copy(gen->prog->scop->dep_flow);
4221                 dep = isl_union_map_copy(gen->prog->scop->dep_false);
4222                 dep = isl_union_map_union(dep, dep_raw);
4223                 dep = isl_union_map_coalesce(dep);
4224                 proximity = isl_union_map_copy(dep);
4225                 coincidence = isl_union_map_copy(dep);
4226                 validity = dep;
4227         }
4228         sc = isl_schedule_constraints_set_validity(sc, validity);
4229         sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
4230         sc = isl_schedule_constraints_set_proximity(sc, proximity);
4231
4232         if (gen->options->debug->dump_schedule_constraints)
4233                 isl_schedule_constraints_dump(sc);
4234         schedule = isl_schedule_constraints_compute_schedule(sc);
4235         if (gen->options->debug->dump_schedule)
4236                 isl_schedule_dump(schedule);
4237
4238         sched = select_outer_tilable_band(gen, schedule);
4239
4240         isl_union_map_foreach_map(sched, &set_untiled_len, &gen->untiled_len);
4241         sched = isl_union_map_intersect_domain(sched, domain);
4242         gen->sched = sched;
4243
4244         isl_schedule_free(schedule);
4245 }
4246
4247 /* Compute the sets of outer array elements that need to be copied in and out.
4248  *
4249  * In particular, for each array that is possibly written anywhere in
4250  * gen->prog and that is visible outside the corresponding scop,
4251  * we copy out its entire extent.
4252  *
4253  * Any array elements that is read without first being written needs
4254  * to be copied in. Furthermore, if there are any array elements that
4255  * are copied out, but that may not be written inside gen->prog, then
4256  * they also need to be copied in to ensure that the value after execution
4257  * is the same as the value before execution, at least for those array
4258  * elements that may have their values preserved by the scop.
4259  * In case the array elements are structures, we need to take into
4260  * account that all members of the structures need to be written
4261  * by gen->prog before we can avoid copying the data structure in.
4262  *
4263  * While computing the set of array elements that are copied out but
4264  * not necessarily written, we intersect both sets with the context.
4265  * This helps in those cases where the arrays are declared with a fixed size,
4266  * while the accesses are parametric and the context assigns a fixed value
4267  * to the parameters.
4268  *
4269  * If an element from a local array is read without first being written,
4270  * then there is no point in copying it in since it cannot have been
4271  * written prior to the scop.  Warn about the uninitialized read instead.
4272  */
4273 static void compute_copy_in_and_out(struct gpu_gen *gen)
4274 {
4275         int i;
4276         isl_union_set *local;
4277         isl_union_set *may_write, *must_write;
4278         isl_union_set *copy_in, *copy_out;
4279         isl_union_set *not_written;
4280         isl_union_map *uninitialized;
4281         isl_union_map *local_uninitialized;
4282
4283         must_write = isl_union_map_range(
4284                                 isl_union_map_copy(gen->prog->must_write));
4285         must_write = isl_union_set_intersect_params(must_write,
4286                                             isl_set_copy(gen->prog->context));
4287         may_write = isl_union_map_range(
4288                                 isl_union_map_copy(gen->prog->may_write));
4289         may_write = isl_union_set_intersect_params(may_write,
4290                                             isl_set_copy(gen->prog->context));
4291         may_write = isl_union_set_universe(may_write);
4292         may_write = isl_union_set_apply(may_write,
4293                                     isl_union_map_copy(gen->prog->to_outer));
4294         copy_out = isl_union_set_empty(isl_union_set_get_space(may_write));
4295         local = isl_union_set_copy(copy_out);
4296
4297         for (i = 0; i < gen->prog->n_array; ++i) {
4298                 isl_space *space;
4299                 isl_set *write_i;
4300                 int empty;
4301
4302                 space = isl_space_copy(gen->prog->array[i].space);
4303
4304                 if (gen->prog->array[i].local) {
4305                         isl_set *set;
4306
4307                         set = isl_set_universe(space);
4308                         local = isl_union_set_add_set(local, set);
4309                         continue;
4310                 }
4311
4312                 write_i = isl_union_set_extract_set(may_write, space);
4313                 empty = isl_set_plain_is_empty(write_i);
4314                 isl_set_free(write_i);
4315                 if (empty)
4316                         continue;
4317
4318                 write_i = isl_set_copy(gen->prog->array[i].extent);
4319                 copy_out = isl_union_set_add_set(copy_out, write_i);
4320         }
4321         isl_union_set_free(may_write);
4322
4323         copy_out = isl_union_set_intersect_params(copy_out,
4324                                             isl_set_copy(gen->prog->context));
4325
4326         gen->prog->copy_out = isl_union_set_copy(copy_out);
4327
4328         copy_out = isl_union_set_apply(copy_out,
4329                                     isl_union_map_copy(gen->prog->to_inner));
4330         copy_out = isl_union_set_intersect(copy_out,
4331                                     isl_union_set_copy(gen->prog->may_persist));
4332         not_written = isl_union_set_subtract(copy_out, must_write);
4333
4334         uninitialized = isl_union_map_copy(gen->prog->scop->live_in);
4335         local_uninitialized = isl_union_map_copy(uninitialized);
4336
4337         local = isl_union_set_apply(local,
4338                                     isl_union_map_copy(gen->prog->to_inner));
4339         local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
4340                                                             local);
4341         if (!isl_union_map_is_empty(local_uninitialized)) {
4342                 fprintf(stderr,
4343                         "possibly uninitialized reads (not copied in):\n");
4344                 isl_union_map_dump(local_uninitialized);
4345         }
4346         uninitialized = isl_union_map_subtract(uninitialized,
4347                                                 local_uninitialized);
4348         copy_in = isl_union_map_range(uninitialized);
4349         copy_in = isl_union_set_union(copy_in, not_written);
4350         copy_in = isl_union_set_apply(copy_in,
4351                                     isl_union_map_copy(gen->prog->to_outer));
4352
4353         gen->prog->copy_in = copy_in;
4354 }
4355
/* Internal data structure for extract_access.
 * "next_access" points to the end of a linked list that is extended
 * by extract_access.
 * "single_expression" is set if the access expressions belong to
 * an expression statement (i.e., a statement without internal control).
 * "any_to_outer" maps all intermediate arrays to their outer arrays.
 */
struct ppcg_extract_access_data {
        /* Location where the next extracted access is to be stored. */
        struct gpu_stmt_access **next_access;
        /* Non-zero if the statement body is a single expression. */
        int single_expression;
        /* Applied to the range of each tagged access relation. */
        isl_union_map *any_to_outer;
};
4368
4369 /* Given a tagged access relation to a single array "tagged", extract it
4370  * as a map, taking into account that the input may be empty.
4371  * If the access relation is empty, then it does not contain
4372  * any space information, so we try to recover it from the index
4373  * expression.
4374  * The space of the index expression is of the form I -> A,
4375  * with I the statement instances and A the array, or [I -> F] -> A,
4376  * with F the filters corresponding to arguments.
4377  * We first drop F, if present, obtaining I -> A.
4378  * Then we construct I -> R, with R the reference tag,
4379  * combine the two into I -> [R -> A] and uncurry to obtain
4380  * the final result [I -> R] -> A.
4381  * Note that the index expression may have a lower dimension
4382  * than that of the array, but this dimension is not used
4383  * if the access relation is empty.
4384  */
4385 static __isl_give isl_map *extract_single_tagged_access(
4386         __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr)
4387 {
4388         int empty;
4389         isl_id *id;
4390         isl_space *space, *space2;
4391         isl_multi_pw_aff *index;
4392
4393         empty = isl_union_map_is_empty(tagged);
4394         if (empty < 0)
4395                 goto error;
4396         if (!empty)
4397                 return isl_map_from_union_map(tagged);
4398         isl_union_map_free(tagged);
4399
4400         index = pet_expr_access_get_index(expr);
4401         space = isl_multi_pw_aff_get_space(index);
4402         isl_multi_pw_aff_free(index);
4403         if (isl_space_domain_is_wrapping(space))
4404                 space = isl_space_domain_factor_domain(space);
4405         space2 = isl_space_copy(space);
4406         space2 = isl_space_from_domain(isl_space_domain(space));
4407         id = pet_expr_access_get_ref_id(expr);
4408         space2 = isl_space_set_tuple_id(space2, isl_dim_out, id);
4409         space = isl_space_range_product(space2, space);
4410         space = isl_space_uncurry(space);
4411
4412         return isl_map_empty(space);
4413 error:
4414         isl_union_map_free(tagged);
4415         return NULL;
4416 }
4417
4418 /* Extract a gpu_stmt_access from "expr", append it to the list
4419  * that ends in *data->next_access and update the end of the list.
4420  * If the access expression performs a write, then it is considered
4421  * exact only if it appears in a single expression statement and
4422  * if its may access relation is equal to its must access relation.
4423  *
4424  * The combined set of may accesses may be union if member accesses
4425  * are involved, but the entire set is derived from a single reference and
4426  * therefore from a single index expression.  These accesses therefore
4427  * all map to the same outer array.
4428  */
4429 static int extract_access(__isl_keep pet_expr *expr, void *user)
4430 {
4431         struct ppcg_extract_access_data *data = user;
4432         isl_union_map *tagged;
4433         struct gpu_stmt_access *access;
4434         isl_ctx *ctx = pet_expr_get_ctx(expr);
4435         isl_multi_pw_aff *index;
4436
4437         access = isl_alloc_type(ctx, struct gpu_stmt_access);
4438         assert(access);
4439         access->next = NULL;
4440         access->read = pet_expr_access_is_read(expr);
4441         access->write = pet_expr_access_is_write(expr);
4442         tagged = pet_expr_access_get_tagged_may_read(expr);
4443         tagged = isl_union_map_union(tagged,
4444                                 pet_expr_access_get_tagged_may_write(expr));
4445         tagged = isl_union_map_apply_range(tagged,
4446                                         isl_union_map_copy(data->any_to_outer));
4447         if (!access->write) {
4448                 access->exact_write = 1;
4449         } else if (!data->single_expression) {
4450                 access->exact_write = 0;
4451         } else {
4452                 isl_union_map *must, *may;
4453                 may = isl_union_map_copy(tagged);
4454                 may = isl_union_map_domain_factor_domain(may);
4455                 must = pet_expr_access_get_must_write(expr);
4456                 access->exact_write = isl_union_map_is_equal(must, may);
4457                 isl_union_map_free(must);
4458                 isl_union_map_free(may);
4459         }
4460         index = pet_expr_access_get_index(expr);
4461         access->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
4462         isl_multi_pw_aff_free(index);
4463         access->ref_id = pet_expr_access_get_ref_id(expr);
4464         access->tagged_access = extract_single_tagged_access(tagged, expr);
4465         access->access = isl_map_copy(access->tagged_access);
4466         access->access = isl_map_domain_factor_domain(access->access);
4467
4468         *data->next_access = access;
4469         data->next_access = &(*data->next_access)->next;
4470
4471         if (!access->access)
4472                 return -1;
4473
4474         return 0;
4475 }
4476
4477 /* Construct a linked list of gpu_stmt_access objects,
4478  * one for each access expression in the statement body.
4479  * "any_to_outer" maps all intermediate arrays to their outer arrays.
4480  */
4481 static int pet_stmt_extract_accesses(struct gpu_stmt *stmt,
4482         __isl_keep isl_union_map *any_to_outer)
4483 {
4484         struct ppcg_extract_access_data data;
4485
4486         stmt->accesses = NULL;
4487         data.next_access = &stmt->accesses;
4488         data.single_expression =
4489                 pet_tree_get_type(stmt->stmt->body) == pet_tree_expr;
4490         data.any_to_outer = any_to_outer;
4491         return pet_tree_foreach_access_expr(stmt->stmt->body,
4492                                                 &extract_access, &data);
4493 }
4494
4495 /* Return an array of gpu_stmt representing the statements in "scop".
4496  */
4497 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
4498         __isl_keep isl_set *context, __isl_keep isl_union_map *any_to_outer)
4499 {
4500         int i;
4501         struct gpu_stmt *stmts;
4502
4503         stmts = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt);
4504         if (!stmts)
4505                 return NULL;
4506
4507         for (i = 0; i < scop->pet->n_stmt; ++i) {
4508                 struct gpu_stmt *s = &stmts[i];
4509
4510                 s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain);
4511                 s->stmt = scop->pet->stmts[i];
4512                 if (pet_stmt_extract_accesses(s, any_to_outer) < 0)
4513                         return free_stmts(stmts, i + 1);
4514         }
4515
4516         return stmts;
4517 }
4518
4519 /* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
4520  */
4521 static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
4522 {
4523         struct gpu_gen *gen = user;
4524
4525         return gen->print(p, gen->prog, gen->tree, &gen->types,
4526                             gen->print_user);
4527 }
4528
4529 /* Generate CUDA code for "scop" and print it to "p".
4530  * After generating an AST for the transformed scop as explained below,
4531  * we call "gen->print" to print the AST in the desired output format
4532  * to "p".
4533  *
4534  * If it turns out that it does not make sense to generate GPU code,
4535  * then we generate CPU code instead.
4536  *
4537  * The GPU code is generated in a context where at least one
4538  * statement instance is executed.  The corresponding guard (if any) is printed
4539  * around the entire generated GPU code, except for the declaration
4540  * of the arrays that are visible outside of the scop and that therefore
4541  * cannot be declared inside the body of any possible guard.
4542  *
4543  * We first compute a schedule that respects the dependences
4544  * of the original program and select the outermost band
4545  * of tilable dimensions that has at least one parallel loop.
4546  * We then have three blocks of dimensions
4547  *
4548  *      H               B                       G
4549  *
4550  * The tilable band "B" is first tiled according to "tile" sizes, resulting
4551  * in
4552  *
4553  *      H       T               P               G
4554  *
4555  * For each iteration of the T loop and for each array, we compute
4556  * the array elements accessed by that iteration, construct a rectangular
4557  * box around it and shift it to the origin.  The result is used
4558  * as shared memory for the array.
4559  *
4560  * We then split off at most 2 parallel loops from the T loops and
4561  * at most 3 parallel loops from the P loops
4562  *
4563  *      H       T1      T2      P1      P2      G
4564  *
4565  * The T1/P1 loops are then tiled or "wrapped" over the blocks/threads,
4566  * according to "grid"/"block" sizes.
4567  *
4568  *      H       T1T T1P T2      P1T P1P P2      G
4569  *
4570  * Finally, the T1P and P1P iterators are equated to the block and
4571  * thread dimensions respectively and so are effectively removed.
4572  * The H loops are run on the host.  The T1T, T2, P1T, P2 and G loops
4573  * are run on the GPU.
4574  *
4575  * Code is generated in three stages.  We first generate code for the
4576  * host (the H loops), with iterators h%d.  Then, for each leaf node
4577  * of the resulting AST, we generate code for the shared loops (up to
4578  * and including T2), with iterators g%d and after equating the H loops
4579  * to h%d parameters and the T1P loops to the block dimensions.
4580  * Finally, we generate code for the remaining loops in a similar fashion.
4581  */
4582 static __isl_give isl_printer *generate(__isl_take isl_printer *p,
4583         struct gpu_gen *gen, struct ppcg_scop *scop,
4584         struct ppcg_options *options)
4585 {
4586         struct gpu_prog *prog;
4587         isl_ctx *ctx;
4588         isl_set *context, *guard;
4589
4590         if (!scop)
4591                 return isl_printer_free(p);
4592
4593         ctx = isl_printer_get_ctx(p);
4594         prog = gpu_prog_alloc(ctx, scop);
4595         if (!prog)
4596                 return isl_printer_free(p);
4597
4598         context = isl_set_copy(prog->context);
4599         guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
4600         prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
4601
4602         gen->prog = prog;
4603         gen->any_parallelism = 0;
4604         compute_schedule(gen);
4605
4606         if (!gen->any_parallelism) {
4607                 isl_set_free(context);
4608                 isl_set_free(guard);
4609                 p = print_cpu(p, scop, options);
4610         } else {
4611                 compute_copy_in_and_out(gen);
4612                 gen->tree = generate_host_code(gen);
4613                 p = ppcg_print_exposed_declarations(p, prog->scop);
4614                 p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
4615                 isl_ast_node_free(gen->tree);
4616         }
4617
4618         isl_union_map_free(gen->sched);
4619         isl_schedule_free(gen->host_schedule);
4620
4621         gpu_prog_free(prog);
4622
4623         return p;
4624 }
4625
4626 /* Wrapper around generate for use as a ppcg_transform callback.
4627  */
4628 static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
4629         struct ppcg_scop *scop, void *user)
4630 {
4631         struct gpu_gen *gen = user;
4632
4633         return generate(p, gen, scop, gen->options);
4634 }
4635
4636 /* Transform the code in the file called "input" by replacing
4637  * all scops by corresponding GPU code and write the results to "out".
4638  */
4639 int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
4640         struct ppcg_options *options,
4641         __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
4642                 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
4643                 struct gpu_types *types, void *user), void *user)
4644 {
4645         struct gpu_gen gen;
4646         int r;
4647         int i;
4648
4649         gen.ctx = ctx;
4650         gen.sizes = extract_sizes_from_str(ctx, options->sizes);
4651         gen.options = options;
4652         gen.kernel_id = 0;
4653         gen.print = print;
4654         gen.print_user = user;
4655         gen.types.n = 0;
4656         gen.types.name = NULL;
4657
4658         if (options->debug->dump_sizes) {
4659                 isl_space *space = isl_space_params_alloc(ctx, 0);
4660                 gen.used_sizes = isl_union_map_empty(space);
4661         }
4662
4663         r = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);
4664
4665         if (options->debug->dump_sizes) {
4666                 isl_union_map_dump(gen.used_sizes);
4667                 isl_union_map_free(gen.used_sizes);
4668         }
4669
4670         isl_union_map_free(gen.sizes);
4671         for (i = 0; i < gen.types.n; ++i)
4672                 free(gen.types.name[i]);
4673         free(gen.types.name);
4674
4675         return r;
4676 }
4677
4678 /* Compute the set of inner array elements that may have their values
4679  * preserved by "prog".  In particular, collect the array elements of
4680  * arrays that are not local to "prog" and remove those elements that
4681  * are definitely killed or definitely written by "prog".
4682  */
4683 static __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog)
4684 {
4685         int i;
4686         isl_union_set *may_persist, *killed;
4687         isl_union_map *must_kill;
4688
4689         may_persist = isl_union_set_empty(isl_set_get_space(prog->context));
4690         for (i = 0; i < prog->n_array; ++i) {
4691                 isl_set *extent;
4692
4693                 if (prog->array[i].local)
4694                         continue;
4695
4696                 extent = isl_set_copy(prog->array[i].extent);
4697                 may_persist = isl_union_set_add_set(may_persist, extent);
4698         }
4699
4700         may_persist = isl_union_set_intersect_params(may_persist,
4701                                                 isl_set_copy(prog->context));
4702         may_persist = isl_union_set_apply(may_persist,
4703                                         isl_union_map_copy(prog->to_inner));
4704         must_kill = isl_union_map_copy(prog->tagged_must_kill);
4705         killed = isl_union_map_range(must_kill);
4706         must_kill = isl_union_map_copy(prog->must_write);
4707         killed = isl_union_set_union(killed, isl_union_map_range(must_kill));
4708
4709         may_persist = isl_union_set_subtract(may_persist, killed);
4710         return may_persist;
4711 }
4712
4713 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
4714 {
4715         struct gpu_prog *prog;
4716         isl_space *space;
4717         isl_map *id;
4718
4719         if (!scop)
4720                 return NULL;
4721
4722         prog = isl_calloc_type(ctx, struct gpu_prog);
4723         assert(prog);
4724
4725         prog->ctx = ctx;
4726         prog->scop = scop;
4727         prog->context = isl_set_copy(scop->context);
4728         prog->n_stmts = scop->pet->n_stmt;
4729         prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet);
4730         prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer);
4731         space = isl_union_map_get_space(prog->any_to_outer);
4732         space = isl_space_set_from_params(space);
4733         space = isl_space_add_dims(space, isl_dim_set, 1);
4734         space = isl_space_map_from_set(space);
4735         id = isl_map_identity(space);
4736         prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id);
4737         prog->stmts = extract_stmts(ctx, scop,
4738                                         prog->context, prog->any_to_outer);
4739         prog->read = isl_union_map_copy(scop->reads);
4740         prog->may_write = isl_union_map_copy(scop->may_writes);
4741         prog->must_write = isl_union_map_copy(scop->must_writes);
4742         prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills);
4743         prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet);
4744         prog->to_outer = isl_union_map_copy(prog->to_inner);
4745         prog->to_outer = isl_union_map_reverse(prog->to_outer);
4746
4747         if (!prog->stmts)
4748                 return gpu_prog_free(prog);
4749
4750         if (collect_array_info(prog) < 0)
4751                 return gpu_prog_free(prog);
4752         prog->may_persist = compute_may_persist(prog);
4753
4754         return prog;
4755 }
4756
4757 void *gpu_prog_free(struct gpu_prog *prog)
4758 {
4759         if (!prog)
4760                 return NULL;
4761         free_array_info(prog);
4762         free_stmts(prog->stmts, prog->n_stmts);
4763         isl_union_map_free(prog->any_to_outer);
4764         isl_union_map_free(prog->to_outer);
4765         isl_union_map_free(prog->to_inner);
4766         isl_union_set_free(prog->copy_in);
4767         isl_union_set_free(prog->copy_out);
4768         isl_union_map_free(prog->read);
4769         isl_union_map_free(prog->may_write);
4770         isl_union_map_free(prog->must_write);
4771         isl_union_map_free(prog->tagged_must_kill);
4772         isl_union_map_free(prog->array_order);
4773         isl_union_set_free(prog->may_persist);
4774         isl_set_free(prog->context);
4775         free(prog);
4776         return NULL;
4777 }