//===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass generates an entirely new schedule tree from the data dependences
// and iteration domains. The new schedule tree is computed in two steps:
//
// 1) The isl scheduling optimizer is run.
//
// The isl scheduling optimizer creates a new schedule tree that maximizes
// parallelism and tileability and minimizes data-dependence distances. The
// algorithm used is a modified version of the ``Pluto'' algorithm:
//
//   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
//   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
//   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
//   Design and Implementation, PLDI '08, pages 101-113. ACM, 2008.
//
// 2) A set of post-scheduling transformations is applied on the schedule tree.
//
// These optimizations include:
//
//  - Tiling of the innermost tilable bands
//  - Prevectorization - The choice of a possible outer loop that is strip-mined
//                       to the innermost level to enable inner-loop
//                       vectorization.
//  - Some optimizations for spatial locality are also planned.
//
// For a detailed description of the schedule tree itself please see section 6
// of:
//
//   Polyhedral AST generation is more than scanning polyhedra
//   Tobias Grosser, Sven Verdoolaege, Albert Cohen
//   ACM Transactions on Programming Languages and Systems (TOPLAS)
//   http://www.grosser.es/#pub-polyhedral-AST-generation
//
// This publication also contains a detailed discussion of the different options
// for polyhedral loop unrolling, full/partial tile separation and other uses
// of the schedule tree.
//
//===----------------------------------------------------------------------===//
49 #include "polly/ScheduleOptimizer.h"
50 #include "polly/CodeGen/CodeGeneration.h"
51 #include "polly/DependenceInfo.h"
52 #include "polly/LinkAllPasses.h"
53 #include "polly/Options.h"
54 #include "polly/ScopInfo.h"
55 #include "polly/Support/GICHelper.h"
56 #include "polly/Support/ISLOStream.h"
57 #include "llvm/Analysis/TargetTransformInfo.h"
58 #include "llvm/Support/Debug.h"
61 #include "isl/constraint.h"
63 #include "isl/options.h"
64 #include "isl/printer.h"
65 #include "isl/schedule.h"
66 #include "isl/schedule_node.h"
67 #include "isl/space.h"
68 #include "isl/union_map.h"
69 #include "isl/union_set.h"
72 using namespace polly
;
74 #define DEBUG_TYPE "polly-opt-isl"
76 static cl::opt
<std::string
>
77 OptimizeDeps("polly-opt-optimize-only",
78 cl::desc("Only a certain kind of dependences (all/raw)"),
79 cl::Hidden
, cl::init("all"), cl::ZeroOrMore
,
80 cl::cat(PollyCategory
));
82 static cl::opt
<std::string
>
83 SimplifyDeps("polly-opt-simplify-deps",
84 cl::desc("Dependences should be simplified (yes/no)"),
85 cl::Hidden
, cl::init("yes"), cl::ZeroOrMore
,
86 cl::cat(PollyCategory
));
88 static cl::opt
<int> MaxConstantTerm(
89 "polly-opt-max-constant-term",
90 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden
,
91 cl::init(20), cl::ZeroOrMore
, cl::cat(PollyCategory
));
93 static cl::opt
<int> MaxCoefficient(
94 "polly-opt-max-coefficient",
95 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden
,
96 cl::init(20), cl::ZeroOrMore
, cl::cat(PollyCategory
));
98 static cl::opt
<std::string
> FusionStrategy(
99 "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
100 cl::Hidden
, cl::init("min"), cl::ZeroOrMore
, cl::cat(PollyCategory
));
102 static cl::opt
<std::string
>
103 MaximizeBandDepth("polly-opt-maximize-bands",
104 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden
,
105 cl::init("yes"), cl::ZeroOrMore
, cl::cat(PollyCategory
));
107 static cl::opt
<std::string
> OuterCoincidence(
108 "polly-opt-outer-coincidence",
109 cl::desc("Try to construct schedules where the outer member of each band "
110 "satisfies the coincidence constraints (yes/no)"),
111 cl::Hidden
, cl::init("no"), cl::ZeroOrMore
, cl::cat(PollyCategory
));
113 static cl::opt
<int> PrevectorWidth(
114 "polly-prevect-width",
116 "The number of loop iterations to strip-mine for pre-vectorization"),
117 cl::Hidden
, cl::init(4), cl::ZeroOrMore
, cl::cat(PollyCategory
));
119 static cl::opt
<bool> FirstLevelTiling("polly-tiling",
120 cl::desc("Enable loop tiling"),
121 cl::init(true), cl::ZeroOrMore
,
122 cl::cat(PollyCategory
));
124 static cl::opt
<int> LatencyVectorFma(
125 "polly-target-latency-vector-fma",
126 cl::desc("The minimal number of cycles between issuing two "
127 "dependent consecutive vector fused multiply-add "
129 cl::Hidden
, cl::init(8), cl::ZeroOrMore
, cl::cat(PollyCategory
));
131 static cl::opt
<int> ThroughputVectorFma(
132 "polly-target-throughput-vector-fma",
133 cl::desc("A throughput of the processor floating-point arithmetic units "
134 "expressed in the number of vector fused multiply-add "
135 "instructions per clock cycle."),
136 cl::Hidden
, cl::init(1), cl::ZeroOrMore
, cl::cat(PollyCategory
));
// This option, along with --polly-target-2nd-cache-level-associativity,
// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size
// represent the parameters of the target cache, which do not have typical
// values that can be used by default. However, to apply the pattern matching
// optimizations, we use the values of the parameters of Intel Core i7-3820
// SandyBridge in case the parameters are not specified. Such an approach helps
// also to attain the high-performance on IBM POWER System S822 and IBM Power
// 730 Express server.
146 static cl::opt
<int> FirstCacheLevelAssociativity(
147 "polly-target-1st-cache-level-associativity",
148 cl::desc("The associativity of the first cache level."), cl::Hidden
,
149 cl::init(8), cl::ZeroOrMore
, cl::cat(PollyCategory
));
151 static cl::opt
<int> SecondCacheLevelAssociativity(
152 "polly-target-2nd-cache-level-associativity",
153 cl::desc("The associativity of the second cache level."), cl::Hidden
,
154 cl::init(8), cl::ZeroOrMore
, cl::cat(PollyCategory
));
156 static cl::opt
<int> FirstCacheLevelSize(
157 "polly-target-1st-cache-level-size",
158 cl::desc("The size of the first cache level specified in bytes."),
159 cl::Hidden
, cl::init(32768), cl::ZeroOrMore
, cl::cat(PollyCategory
));
161 static cl::opt
<int> SecondCacheLevelSize(
162 "polly-target-2nd-cache-level-size",
163 cl::desc("The size of the second level specified in bytes."), cl::Hidden
,
164 cl::init(262144), cl::ZeroOrMore
, cl::cat(PollyCategory
));
166 static cl::opt
<int> VectorRegisterBitwidth(
167 "polly-target-vector-register-bitwidth",
168 cl::desc("The size in bits of a vector register (if not set, this "
169 "information is taken from LLVM's target information."),
170 cl::Hidden
, cl::init(-1), cl::ZeroOrMore
, cl::cat(PollyCategory
));
172 static cl::opt
<int> FirstLevelDefaultTileSize(
173 "polly-default-tile-size",
174 cl::desc("The default tile size (if not enough were provided by"
175 " --polly-tile-sizes)"),
176 cl::Hidden
, cl::init(32), cl::ZeroOrMore
, cl::cat(PollyCategory
));
179 FirstLevelTileSizes("polly-tile-sizes",
180 cl::desc("A tile size for each loop dimension, filled "
181 "with --polly-default-tile-size"),
182 cl::Hidden
, cl::ZeroOrMore
, cl::CommaSeparated
,
183 cl::cat(PollyCategory
));
186 SecondLevelTiling("polly-2nd-level-tiling",
187 cl::desc("Enable a 2nd level loop of loop tiling"),
188 cl::init(false), cl::ZeroOrMore
, cl::cat(PollyCategory
));
190 static cl::opt
<int> SecondLevelDefaultTileSize(
191 "polly-2nd-level-default-tile-size",
192 cl::desc("The default 2nd-level tile size (if not enough were provided by"
193 " --polly-2nd-level-tile-sizes)"),
194 cl::Hidden
, cl::init(16), cl::ZeroOrMore
, cl::cat(PollyCategory
));
197 SecondLevelTileSizes("polly-2nd-level-tile-sizes",
198 cl::desc("A tile size for each loop dimension, filled "
199 "with --polly-default-tile-size"),
200 cl::Hidden
, cl::ZeroOrMore
, cl::CommaSeparated
,
201 cl::cat(PollyCategory
));
203 static cl::opt
<bool> RegisterTiling("polly-register-tiling",
204 cl::desc("Enable register tiling"),
205 cl::init(false), cl::ZeroOrMore
,
206 cl::cat(PollyCategory
));
208 static cl::opt
<int> RegisterDefaultTileSize(
209 "polly-register-tiling-default-tile-size",
210 cl::desc("The default register tile size (if not enough were provided by"
211 " --polly-register-tile-sizes)"),
212 cl::Hidden
, cl::init(2), cl::ZeroOrMore
, cl::cat(PollyCategory
));
214 static cl::opt
<int> PollyPatternMatchingNcQuotient(
215 "polly-pattern-matching-nc-quotient",
216 cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
217 "macro-kernel, by Nr, the parameter of the micro-kernel"),
218 cl::Hidden
, cl::init(256), cl::ZeroOrMore
, cl::cat(PollyCategory
));
221 RegisterTileSizes("polly-register-tile-sizes",
222 cl::desc("A tile size for each loop dimension, filled "
223 "with --polly-register-tile-size"),
224 cl::Hidden
, cl::ZeroOrMore
, cl::CommaSeparated
,
225 cl::cat(PollyCategory
));
228 PMBasedOpts("polly-pattern-matching-based-opts",
229 cl::desc("Perform optimizations based on pattern matching"),
230 cl::init(true), cl::ZeroOrMore
, cl::cat(PollyCategory
));
232 static cl::opt
<bool> OptimizedScops(
233 "polly-optimized-scops",
234 cl::desc("Polly - Dump polyhedral description of Scops optimized with "
235 "the isl scheduling optimizer and the set of post-scheduling "
236 "transformations is applied on the schedule tree"),
237 cl::init(false), cl::ZeroOrMore
, cl::cat(PollyCategory
));
239 /// Create an isl::union_set, which describes the isolate option based on
242 /// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
243 /// belong to the current band node.
244 /// @param OutDimsNum A number of dimensions that should belong to
245 /// the current band node.
246 static isl::union_set
getIsolateOptions(isl::set IsolateDomain
,
247 unsigned OutDimsNum
) {
248 unsigned Dims
= IsolateDomain
.dim(isl::dim::set
);
249 assert(OutDimsNum
<= Dims
&&
250 "The isl::set IsolateDomain is used to describe the range of schedule "
251 "dimensions values, which should be isolated. Consequently, the "
252 "number of its dimensions should be greater than or equal to the "
253 "number of the schedule dimensions.");
254 isl::map IsolateRelation
= isl::map::from_domain(IsolateDomain
);
255 IsolateRelation
= IsolateRelation
.move_dims(isl::dim::out
, 0, isl::dim::in
,
256 Dims
- OutDimsNum
, OutDimsNum
);
257 isl::set IsolateOption
= IsolateRelation
.wrap();
258 isl::id Id
= isl::id::alloc(IsolateOption
.get_ctx(), "isolate", nullptr);
259 IsolateOption
= IsolateOption
.set_tuple_id(Id
);
260 return isl::union_set(IsolateOption
);
263 /// Create an isl::union_set, which describes the atomic option for the
264 /// dimension of the current node.
266 /// It may help to reduce the size of generated code.
268 /// @param Ctx An isl::ctx, which is used to create the isl::union_set.
269 static isl::union_set
getAtomicOptions(isl::ctx Ctx
) {
270 isl::space
Space(Ctx
, 0, 1);
271 isl::set AtomicOption
= isl::set::universe(Space
);
272 isl::id Id
= isl::id::alloc(Ctx
, "atomic", nullptr);
273 AtomicOption
= AtomicOption
.set_tuple_id(Id
);
274 return isl::union_set(AtomicOption
);
277 /// Create an isl::union_set, which describes the option of the form
278 /// [isolate[] -> unroll[x]].
280 /// @param Ctx An isl::ctx, which is used to create the isl::union_set.
281 static isl::union_set
getUnrollIsolatedSetOptions(isl::ctx Ctx
) {
282 isl::space Space
= isl::space(Ctx
, 0, 0, 1);
283 isl::map UnrollIsolatedSetOption
= isl::map::universe(Space
);
284 isl::id DimInId
= isl::id::alloc(Ctx
, "isolate", nullptr);
285 isl::id DimOutId
= isl::id::alloc(Ctx
, "unroll", nullptr);
286 UnrollIsolatedSetOption
=
287 UnrollIsolatedSetOption
.set_tuple_id(isl::dim::in
, DimInId
);
288 UnrollIsolatedSetOption
=
289 UnrollIsolatedSetOption
.set_tuple_id(isl::dim::out
, DimOutId
);
290 return UnrollIsolatedSetOption
.wrap();
293 /// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
295 /// @param Set A set, which should be modified.
296 /// @param VectorWidth A parameter, which determines the constraint.
297 static isl::set
addExtentConstraints(isl::set Set
, int VectorWidth
) {
298 unsigned Dims
= Set
.dim(isl::dim::set
);
299 isl::space Space
= Set
.get_space();
300 isl::local_space LocalSpace
= isl::local_space(Space
);
301 isl::constraint ExtConstr
= isl::constraint::alloc_inequality(LocalSpace
);
302 ExtConstr
= ExtConstr
.set_constant_si(0);
303 ExtConstr
= ExtConstr
.set_coefficient_si(isl::dim::set
, Dims
- 1, 1);
304 Set
= Set
.add_constraint(ExtConstr
);
305 ExtConstr
= isl::constraint::alloc_inequality(LocalSpace
);
306 ExtConstr
= ExtConstr
.set_constant_si(VectorWidth
- 1);
307 ExtConstr
= ExtConstr
.set_coefficient_si(isl::dim::set
, Dims
- 1, -1);
308 return Set
.add_constraint(ExtConstr
);
311 /// Build the desired set of partial tile prefixes.
313 /// We build a set of partial tile prefixes, which are prefixes of the vector
314 /// loop that have exactly VectorWidth iterations.
316 /// 1. Get all prefixes of the vector loop.
317 /// 2. Extend it to a set, which has exactly VectorWidth iterations for
318 /// any prefix from the set that was built on the previous step.
319 /// 3. Subtract loop domain from it, project out the vector loop dimension and
320 /// get a set of prefixes, which don't have exactly VectorWidth iterations.
321 /// 4. Subtract it from all prefixes of the vector loop and get the desired
324 /// @param ScheduleRange A range of a map, which describes a prefix schedule
326 static isl::set
getPartialTilePrefixes(isl::set ScheduleRange
,
328 unsigned Dims
= ScheduleRange
.dim(isl::dim::set
);
329 isl::set LoopPrefixes
= ScheduleRange
.project_out(isl::dim::set
, Dims
- 1, 1);
330 isl::set ExtentPrefixes
= LoopPrefixes
.add_dims(isl::dim::set
, 1);
331 ExtentPrefixes
= addExtentConstraints(ExtentPrefixes
, VectorWidth
);
332 isl::set BadPrefixes
= ExtentPrefixes
.subtract(ScheduleRange
);
333 BadPrefixes
= BadPrefixes
.project_out(isl::dim::set
, Dims
- 1, 1);
334 return LoopPrefixes
.subtract(BadPrefixes
);
338 ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node
,
340 assert(isl_schedule_node_get_type(Node
.get()) == isl_schedule_node_band
);
341 Node
= Node
.child(0).child(0);
342 isl::union_map SchedRelUMap
= Node
.get_prefix_schedule_relation();
343 isl::map ScheduleRelation
= isl::map::from_union_map(SchedRelUMap
);
344 isl::set ScheduleRange
= ScheduleRelation
.range();
345 isl::set IsolateDomain
= getPartialTilePrefixes(ScheduleRange
, VectorWidth
);
346 isl::union_set AtomicOption
= getAtomicOptions(IsolateDomain
.get_ctx());
347 isl::union_set IsolateOption
= getIsolateOptions(IsolateDomain
, 1);
348 Node
= Node
.parent().parent();
349 isl::union_set Options
= IsolateOption
.unite(AtomicOption
);
350 Node
= Node
.band_set_ast_build_options(Options
);
354 __isl_give isl_schedule_node
*
355 ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node
*Node
,
356 unsigned DimToVectorize
,
358 assert(isl_schedule_node_get_type(Node
) == isl_schedule_node_band
);
360 auto Space
= isl_schedule_node_band_get_space(Node
);
361 auto ScheduleDimensions
= isl_space_dim(Space
, isl_dim_set
);
362 isl_space_free(Space
);
363 assert(DimToVectorize
< ScheduleDimensions
);
365 if (DimToVectorize
> 0) {
366 Node
= isl_schedule_node_band_split(Node
, DimToVectorize
);
367 Node
= isl_schedule_node_child(Node
, 0);
369 if (DimToVectorize
< ScheduleDimensions
- 1)
370 Node
= isl_schedule_node_band_split(Node
, 1);
371 Space
= isl_schedule_node_band_get_space(Node
);
372 auto Sizes
= isl_multi_val_zero(Space
);
373 auto Ctx
= isl_schedule_node_get_ctx(Node
);
375 isl_multi_val_set_val(Sizes
, 0, isl_val_int_from_si(Ctx
, VectorWidth
));
376 Node
= isl_schedule_node_band_tile(Node
, Sizes
);
377 Node
= isolateFullPartialTiles(give(Node
), VectorWidth
).release();
378 Node
= isl_schedule_node_child(Node
, 0);
379 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
380 // we will have troubles to match it in the backend.
381 Node
= isl_schedule_node_band_set_ast_build_options(
382 Node
, isl_union_set_read_from_str(Ctx
, "{ unroll[x]: 1 = 0 }"));
383 Node
= isl_schedule_node_band_sink(Node
);
384 Node
= isl_schedule_node_child(Node
, 0);
385 if (isl_schedule_node_get_type(Node
) == isl_schedule_node_leaf
)
386 Node
= isl_schedule_node_parent(Node
);
387 isl_id
*LoopMarker
= isl_id_alloc(Ctx
, "SIMD", nullptr);
388 Node
= isl_schedule_node_insert_mark(Node
, LoopMarker
);
392 __isl_give isl_schedule_node
*
393 ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node
*Node
,
394 const char *Identifier
, ArrayRef
<int> TileSizes
,
395 int DefaultTileSize
) {
396 auto Ctx
= isl_schedule_node_get_ctx(Node
);
397 auto Space
= isl_schedule_node_band_get_space(Node
);
398 auto Dims
= isl_space_dim(Space
, isl_dim_set
);
399 auto Sizes
= isl_multi_val_zero(Space
);
400 std::string
IdentifierString(Identifier
);
401 for (unsigned i
= 0; i
< Dims
; i
++) {
402 auto tileSize
= i
< TileSizes
.size() ? TileSizes
[i
] : DefaultTileSize
;
403 Sizes
= isl_multi_val_set_val(Sizes
, i
, isl_val_int_from_si(Ctx
, tileSize
));
405 auto TileLoopMarkerStr
= IdentifierString
+ " - Tiles";
406 isl_id
*TileLoopMarker
=
407 isl_id_alloc(Ctx
, TileLoopMarkerStr
.c_str(), nullptr);
408 Node
= isl_schedule_node_insert_mark(Node
, TileLoopMarker
);
409 Node
= isl_schedule_node_child(Node
, 0);
410 Node
= isl_schedule_node_band_tile(Node
, Sizes
);
411 Node
= isl_schedule_node_child(Node
, 0);
412 auto PointLoopMarkerStr
= IdentifierString
+ " - Points";
413 isl_id
*PointLoopMarker
=
414 isl_id_alloc(Ctx
, PointLoopMarkerStr
.c_str(), nullptr);
415 Node
= isl_schedule_node_insert_mark(Node
, PointLoopMarker
);
416 Node
= isl_schedule_node_child(Node
, 0);
420 __isl_give isl_schedule_node
*
421 ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node
*Node
,
422 llvm::ArrayRef
<int> TileSizes
,
423 int DefaultTileSize
) {
424 auto *Ctx
= isl_schedule_node_get_ctx(Node
);
425 Node
= tileNode(Node
, "Register tiling", TileSizes
, DefaultTileSize
);
426 Node
= isl_schedule_node_band_set_ast_build_options(
427 Node
, isl_union_set_read_from_str(Ctx
, "{unroll[x]}"));
432 bool isSimpleInnermostBand(const isl::schedule_node
&Node
) {
433 assert(isl_schedule_node_get_type(Node
.keep()) == isl_schedule_node_band
);
434 assert(isl_schedule_node_n_children(Node
.keep()) == 1);
436 auto ChildType
= isl_schedule_node_get_type(Node
.child(0).keep());
438 if (ChildType
== isl_schedule_node_leaf
)
441 if (ChildType
!= isl_schedule_node_sequence
)
444 auto Sequence
= Node
.child(0);
446 for (int c
= 0, nc
= isl_schedule_node_n_children(Sequence
.keep()); c
< nc
;
448 auto Child
= Sequence
.child(c
);
449 if (isl_schedule_node_get_type(Child
.keep()) != isl_schedule_node_filter
)
451 if (isl_schedule_node_get_type(Child
.child(0).keep()) !=
452 isl_schedule_node_leaf
)
459 bool ScheduleTreeOptimizer::isTileableBandNode(
460 __isl_keep isl_schedule_node
*Node
) {
461 if (isl_schedule_node_get_type(Node
) != isl_schedule_node_band
)
464 if (isl_schedule_node_n_children(Node
) != 1)
467 if (!isl_schedule_node_band_get_permutable(Node
))
470 auto Space
= isl_schedule_node_band_get_space(Node
);
471 auto Dims
= isl_space_dim(Space
, isl_dim_set
);
472 isl_space_free(Space
);
477 auto ManagedNode
= isl::manage(isl_schedule_node_copy(Node
));
478 return isSimpleInnermostBand(ManagedNode
);
481 __isl_give isl_schedule_node
*
482 ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node
*Node
,
484 if (FirstLevelTiling
)
485 Node
= tileNode(Node
, "1st level tiling", FirstLevelTileSizes
,
486 FirstLevelDefaultTileSize
);
488 if (SecondLevelTiling
)
489 Node
= tileNode(Node
, "2nd level tiling", SecondLevelTileSizes
,
490 SecondLevelDefaultTileSize
);
494 applyRegisterTiling(Node
, RegisterTileSizes
, RegisterDefaultTileSize
);
496 if (PollyVectorizerChoice
== VECTORIZER_NONE
)
499 auto Space
= isl_schedule_node_band_get_space(Node
);
500 auto Dims
= isl_space_dim(Space
, isl_dim_set
);
501 isl_space_free(Space
);
503 for (int i
= Dims
- 1; i
>= 0; i
--)
504 if (isl_schedule_node_band_member_get_coincident(Node
, i
)) {
505 Node
= prevectSchedBand(Node
, i
, PrevectorWidth
);
512 /// Get the position of a dimension with a non-zero coefficient.
514 /// Check that isl constraint @p Constraint has only one non-zero
515 /// coefficient for dimensions that have type @p DimType. If this is true,
516 /// return the position of the dimension corresponding to the non-zero
517 /// coefficient and negative value, otherwise.
519 /// @param Constraint The isl constraint to be checked.
520 /// @param DimType The type of the dimensions.
521 /// @return The position of the dimension in case the isl
522 /// constraint satisfies the requirements, a negative
523 /// value, otherwise.
524 static int getMatMulConstraintDim(__isl_keep isl_constraint
*Constraint
,
525 enum isl_dim_type DimType
) {
527 auto *LocalSpace
= isl_constraint_get_local_space(Constraint
);
528 int LocalSpaceDimNum
= isl_local_space_dim(LocalSpace
, DimType
);
529 for (int i
= 0; i
< LocalSpaceDimNum
; i
++) {
530 auto *Val
= isl_constraint_get_coefficient_val(Constraint
, DimType
, i
);
531 if (isl_val_is_zero(Val
)) {
535 if (DimPos
>= 0 || (DimType
== isl_dim_out
&& !isl_val_is_one(Val
)) ||
536 (DimType
== isl_dim_in
&& !isl_val_is_negone(Val
))) {
538 isl_local_space_free(LocalSpace
);
544 isl_local_space_free(LocalSpace
);
548 /// Check the form of the isl constraint.
550 /// Check that the @p DimInPos input dimension of the isl constraint
551 /// @p Constraint has a coefficient that is equal to negative one, the @p
552 /// DimOutPos has a coefficient that is equal to one and others
553 /// have coefficients equal to zero.
555 /// @param Constraint The isl constraint to be checked.
556 /// @param DimInPos The input dimension of the isl constraint.
557 /// @param DimOutPos The output dimension of the isl constraint.
558 /// @return isl_stat_ok in case the isl constraint satisfies
559 /// the requirements, isl_stat_error otherwise.
560 static isl_stat
isMatMulOperandConstraint(__isl_keep isl_constraint
*Constraint
,
561 int &DimInPos
, int &DimOutPos
) {
562 auto *Val
= isl_constraint_get_constant_val(Constraint
);
563 if (!isl_constraint_is_equality(Constraint
) || !isl_val_is_zero(Val
)) {
565 return isl_stat_error
;
568 DimInPos
= getMatMulConstraintDim(Constraint
, isl_dim_in
);
570 return isl_stat_error
;
571 DimOutPos
= getMatMulConstraintDim(Constraint
, isl_dim_out
);
573 return isl_stat_error
;
577 /// Check that the access relation corresponds to a non-constant operand
578 /// of the matrix multiplication.
580 /// Access relations that correspond to non-constant operands of the matrix
581 /// multiplication depend only on two input dimensions and have two output
582 /// dimensions. The function checks that the isl basic map @p bmap satisfies
583 /// the requirements. The two input dimensions can be specified via @p user
586 /// @param bmap The isl basic map to be checked.
587 /// @param user The input dimensions of @p bmap.
588 /// @return isl_stat_ok in case isl basic map satisfies the requirements,
589 /// isl_stat_error otherwise.
590 static isl_stat
isMatMulOperandBasicMap(__isl_take isl_basic_map
*bmap
,
592 auto *Constraints
= isl_basic_map_get_constraint_list(bmap
);
593 isl_basic_map_free(bmap
);
594 if (isl_constraint_list_n_constraint(Constraints
) != 2) {
595 isl_constraint_list_free(Constraints
);
596 return isl_stat_error
;
598 int InPosPair
[] = {-1, -1};
599 auto DimInPos
= user
? static_cast<int *>(user
) : InPosPair
;
600 for (int i
= 0; i
< 2; i
++) {
601 auto *Constraint
= isl_constraint_list_get_constraint(Constraints
, i
);
603 if (isMatMulOperandConstraint(Constraint
, InPos
, OutPos
) ==
605 OutPos
> 1 || (DimInPos
[OutPos
] >= 0 && DimInPos
[OutPos
] != InPos
)) {
606 isl_constraint_free(Constraint
);
607 isl_constraint_list_free(Constraints
);
608 return isl_stat_error
;
610 DimInPos
[OutPos
] = InPos
;
611 isl_constraint_free(Constraint
);
613 isl_constraint_list_free(Constraints
);
617 /// Permute the two dimensions of the isl map.
619 /// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
620 /// have type @p DimType.
622 /// @param Map The isl map to be modified.
623 /// @param DimType The type of the dimensions.
624 /// @param DstPos The first dimension.
625 /// @param SrcPos The second dimension.
626 /// @return The modified map.
627 __isl_give isl_map
*permuteDimensions(__isl_take isl_map
*Map
,
628 enum isl_dim_type DimType
,
629 unsigned DstPos
, unsigned SrcPos
) {
630 assert(DstPos
< isl_map_dim(Map
, DimType
) &&
631 SrcPos
< isl_map_dim(Map
, DimType
));
632 if (DstPos
== SrcPos
)
634 isl_id
*DimId
= nullptr;
635 if (isl_map_has_tuple_id(Map
, DimType
))
636 DimId
= isl_map_get_tuple_id(Map
, DimType
);
637 auto FreeDim
= DimType
== isl_dim_in
? isl_dim_out
: isl_dim_in
;
638 isl_id
*FreeDimId
= nullptr;
639 if (isl_map_has_tuple_id(Map
, FreeDim
))
640 FreeDimId
= isl_map_get_tuple_id(Map
, FreeDim
);
641 auto MaxDim
= std::max(DstPos
, SrcPos
);
642 auto MinDim
= std::min(DstPos
, SrcPos
);
643 Map
= isl_map_move_dims(Map
, FreeDim
, 0, DimType
, MaxDim
, 1);
644 Map
= isl_map_move_dims(Map
, FreeDim
, 0, DimType
, MinDim
, 1);
645 Map
= isl_map_move_dims(Map
, DimType
, MinDim
, FreeDim
, 1, 1);
646 Map
= isl_map_move_dims(Map
, DimType
, MaxDim
, FreeDim
, 0, 1);
648 Map
= isl_map_set_tuple_id(Map
, DimType
, DimId
);
650 Map
= isl_map_set_tuple_id(Map
, FreeDim
, FreeDimId
);
654 /// Check the form of the access relation.
656 /// Check that the access relation @p AccMap has the form M[i][j], where i
657 /// is a @p FirstPos and j is a @p SecondPos.
659 /// @param AccMap The access relation to be checked.
660 /// @param FirstPos The index of the input dimension that is mapped to
661 /// the first output dimension.
662 /// @param SecondPos The index of the input dimension that is mapped to the
663 /// second output dimension.
664 /// @return True in case @p AccMap has the expected form and false,
666 static bool isMatMulOperandAcc(__isl_keep isl_map
*AccMap
, int &FirstPos
,
668 int DimInPos
[] = {FirstPos
, SecondPos
};
669 if (isl_map_foreach_basic_map(AccMap
, isMatMulOperandBasicMap
,
670 static_cast<void *>(DimInPos
)) != isl_stat_ok
||
671 DimInPos
[0] < 0 || DimInPos
[1] < 0)
673 FirstPos
= DimInPos
[0];
674 SecondPos
= DimInPos
[1];
678 /// Does the memory access represent a non-scalar operand of the matrix
681 /// Check that the memory access @p MemAccess is the read access to a non-scalar
682 /// operand of the matrix multiplication or its result.
684 /// @param MemAccess The memory access to be checked.
685 /// @param MMI Parameters of the matrix multiplication operands.
686 /// @return True in case the memory access represents the read access
687 /// to a non-scalar operand of the matrix multiplication and
688 /// false, otherwise.
689 static bool isMatMulNonScalarReadAccess(MemoryAccess
*MemAccess
,
691 if (!MemAccess
->isLatestArrayKind() || !MemAccess
->isRead())
693 isl_map
*AccMap
= MemAccess
->getLatestAccessRelation();
694 if (isMatMulOperandAcc(AccMap
, MMI
.i
, MMI
.j
) && !MMI
.ReadFromC
&&
695 isl_map_n_basic_map(AccMap
) == 1) {
696 MMI
.ReadFromC
= MemAccess
;
697 isl_map_free(AccMap
);
700 if (isMatMulOperandAcc(AccMap
, MMI
.i
, MMI
.k
) && !MMI
.A
&&
701 isl_map_n_basic_map(AccMap
) == 1) {
703 isl_map_free(AccMap
);
706 if (isMatMulOperandAcc(AccMap
, MMI
.k
, MMI
.j
) && !MMI
.B
&&
707 isl_map_n_basic_map(AccMap
) == 1) {
709 isl_map_free(AccMap
);
712 isl_map_free(AccMap
);
716 /// Check accesses to operands of the matrix multiplication.
718 /// Check that accesses of the SCoP statement, which corresponds to
719 /// the partial schedule @p PartialSchedule, are scalar in terms of loops
720 /// containing the matrix multiplication, in case they do not represent
721 /// accesses to the non-scalar operands of the matrix multiplication or
724 /// @param PartialSchedule The partial schedule of the SCoP statement.
725 /// @param MMI Parameters of the matrix multiplication operands.
726 /// @return True in case the corresponding SCoP statement
727 /// represents matrix multiplication and false,
729 static bool containsOnlyMatrMultAcc(__isl_keep isl_map
*PartialSchedule
,
731 auto *InputDimId
= isl_map_get_tuple_id(PartialSchedule
, isl_dim_in
);
732 auto *Stmt
= static_cast<ScopStmt
*>(isl_id_get_user(InputDimId
));
733 isl_id_free(InputDimId
);
734 unsigned OutDimNum
= isl_map_dim(PartialSchedule
, isl_dim_out
);
735 assert(OutDimNum
> 2 && "In case of the matrix multiplication the loop nest "
736 "and, consequently, the corresponding scheduling "
737 "functions have at least three dimensions.");
738 auto *MapI
= permuteDimensions(isl_map_copy(PartialSchedule
), isl_dim_out
,
739 MMI
.i
, OutDimNum
- 1);
740 auto *MapJ
= permuteDimensions(isl_map_copy(PartialSchedule
), isl_dim_out
,
741 MMI
.j
, OutDimNum
- 1);
742 auto *MapK
= permuteDimensions(isl_map_copy(PartialSchedule
), isl_dim_out
,
743 MMI
.k
, OutDimNum
- 1);
744 for (auto *MemA
= Stmt
->begin(); MemA
!= Stmt
->end() - 1; MemA
++) {
745 auto *MemAccessPtr
= *MemA
;
746 if (MemAccessPtr
->isLatestArrayKind() && MemAccessPtr
!= MMI
.WriteToC
&&
747 !isMatMulNonScalarReadAccess(MemAccessPtr
, MMI
) &&
748 !(MemAccessPtr
->isStrideZero(isl_map_copy(MapI
)) &&
749 MemAccessPtr
->isStrideZero(isl_map_copy(MapJ
)) &&
750 MemAccessPtr
->isStrideZero(isl_map_copy(MapK
)))) {
763 /// Check for dependencies corresponding to the matrix multiplication.
765 /// Check that there is only true dependence of the form
766 /// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
767 /// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
768 /// to the dependency produced by the matrix multiplication.
770 /// @param Schedule The schedule of the SCoP statement.
771 /// @param D The SCoP dependencies.
772 /// @param Pos The parameter to describe an acceptable true dependence.
773 /// In case it has a negative value, try to determine its
774 /// acceptable value.
775 /// @return True in case dependencies correspond to the matrix multiplication
776 /// and false, otherwise.
777 static bool containsOnlyMatMulDep(__isl_keep isl_map
*Schedule
,
778 const Dependences
*D
, int &Pos
) {
779 auto *Dep
= D
->getDependences(Dependences::TYPE_RAW
);
780 auto *Red
= D
->getDependences(Dependences::TYPE_RED
);
782 Dep
= isl_union_map_union(Dep
, Red
);
783 auto *DomainSpace
= isl_space_domain(isl_map_get_space(Schedule
));
784 auto *Space
= isl_space_map_from_domain_and_range(isl_space_copy(DomainSpace
),
786 auto *Deltas
= isl_map_deltas(isl_union_map_extract_map(Dep
, Space
));
787 isl_union_map_free(Dep
);
788 int DeltasDimNum
= isl_set_dim(Deltas
, isl_dim_set
);
789 for (int i
= 0; i
< DeltasDimNum
; i
++) {
790 auto *Val
= isl_set_plain_get_val_if_fixed(Deltas
, isl_dim_set
, i
);
791 Pos
= Pos
< 0 && isl_val_is_one(Val
) ? i
: Pos
;
792 if (isl_val_is_nan(Val
) ||
793 !(isl_val_is_zero(Val
) || (i
== Pos
&& isl_val_is_one(Val
)))) {
795 isl_set_free(Deltas
);
800 isl_set_free(Deltas
);
801 if (DeltasDimNum
== 0 || Pos
< 0)
806 /// Check if the SCoP statement could probably be optimized with analytical
809 /// containsMatrMult tries to determine whether the following conditions
811 /// 1. The last memory access modeling an array, MA1, represents writing to
812 /// memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
813 /// S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
814 /// under consideration.
815 /// 2. There is only one loop-carried true dependency, and it has the
816 /// form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
817 /// loop-carried or anti dependencies.
818 /// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
819 /// reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
820 /// S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
821 /// and all memory accesses of the SCoP that are different from MA1, MA2,
822 /// MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
823 /// of loops i1, i2 and i3.
825 /// @param PartialSchedule The PartialSchedule that contains a SCoP statement
827 /// @D The SCoP dependencies.
828 /// @MMI Parameters of the matrix multiplication operands.
829 static bool containsMatrMult(__isl_keep isl_map
*PartialSchedule
,
830 const Dependences
*D
, MatMulInfoTy
&MMI
) {
831 auto *InputDimsId
= isl_map_get_tuple_id(PartialSchedule
, isl_dim_in
);
832 auto *Stmt
= static_cast<ScopStmt
*>(isl_id_get_user(InputDimsId
));
833 isl_id_free(InputDimsId
);
834 if (Stmt
->size() <= 1)
836 for (auto *MemA
= Stmt
->end() - 1; MemA
!= Stmt
->begin(); MemA
--) {
837 auto *MemAccessPtr
= *MemA
;
838 if (!MemAccessPtr
->isLatestArrayKind())
840 if (!MemAccessPtr
->isWrite())
842 auto *AccMap
= MemAccessPtr
->getLatestAccessRelation();
843 if (isl_map_n_basic_map(AccMap
) != 1 ||
844 !isMatMulOperandAcc(AccMap
, MMI
.i
, MMI
.j
)) {
845 isl_map_free(AccMap
);
848 isl_map_free(AccMap
);
849 MMI
.WriteToC
= MemAccessPtr
;
853 if (!containsOnlyMatMulDep(PartialSchedule
, D
, MMI
.k
))
856 if (!MMI
.WriteToC
|| !containsOnlyMatrMultAcc(PartialSchedule
, MMI
))
859 if (!MMI
.A
|| !MMI
.B
|| !MMI
.ReadFromC
)
864 /// Permute two dimensions of the band node.
866 /// Permute FirstDim and SecondDim dimensions of the Node.
868 /// @param Node The band node to be modified.
869 /// @param FirstDim The first dimension to be permuted.
870 /// @param SecondDim The second dimension to be permuted.
871 static __isl_give isl_schedule_node
*
872 permuteBandNodeDimensions(__isl_take isl_schedule_node
*Node
, unsigned FirstDim
,
873 unsigned SecondDim
) {
874 assert(isl_schedule_node_get_type(Node
) == isl_schedule_node_band
&&
875 isl_schedule_node_band_n_member(Node
) > std::max(FirstDim
, SecondDim
));
876 auto PartialSchedule
= isl_schedule_node_band_get_partial_schedule(Node
);
877 auto PartialScheduleFirstDim
=
878 isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule
, FirstDim
);
879 auto PartialScheduleSecondDim
=
880 isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule
, SecondDim
);
881 PartialSchedule
= isl_multi_union_pw_aff_set_union_pw_aff(
882 PartialSchedule
, SecondDim
, PartialScheduleFirstDim
);
883 PartialSchedule
= isl_multi_union_pw_aff_set_union_pw_aff(
884 PartialSchedule
, FirstDim
, PartialScheduleSecondDim
);
885 Node
= isl_schedule_node_delete(Node
);
886 Node
= isl_schedule_node_insert_partial_schedule(Node
, PartialSchedule
);
890 __isl_give isl_schedule_node
*ScheduleTreeOptimizer::createMicroKernel(
891 __isl_take isl_schedule_node
*Node
, MicroKernelParamsTy MicroKernelParams
) {
892 applyRegisterTiling(Node
, {MicroKernelParams
.Mr
, MicroKernelParams
.Nr
}, 1);
893 Node
= isl_schedule_node_parent(isl_schedule_node_parent(Node
));
894 Node
= permuteBandNodeDimensions(Node
, 0, 1);
895 return isl_schedule_node_child(isl_schedule_node_child(Node
, 0), 0);
898 __isl_give isl_schedule_node
*ScheduleTreeOptimizer::createMacroKernel(
899 __isl_take isl_schedule_node
*Node
, MacroKernelParamsTy MacroKernelParams
) {
900 assert(isl_schedule_node_get_type(Node
) == isl_schedule_node_band
);
901 if (MacroKernelParams
.Mc
== 1 && MacroKernelParams
.Nc
== 1 &&
902 MacroKernelParams
.Kc
== 1)
904 int DimOutNum
= isl_schedule_node_band_n_member(Node
);
905 std::vector
<int> TileSizes(DimOutNum
, 1);
906 TileSizes
[DimOutNum
- 3] = MacroKernelParams
.Mc
;
907 TileSizes
[DimOutNum
- 2] = MacroKernelParams
.Nc
;
908 TileSizes
[DimOutNum
- 1] = MacroKernelParams
.Kc
;
909 Node
= tileNode(Node
, "1st level tiling", TileSizes
, 1);
910 Node
= isl_schedule_node_parent(isl_schedule_node_parent(Node
));
911 Node
= permuteBandNodeDimensions(Node
, DimOutNum
- 2, DimOutNum
- 1);
912 Node
= permuteBandNodeDimensions(Node
, DimOutNum
- 3, DimOutNum
- 1);
913 return isl_schedule_node_child(isl_schedule_node_child(Node
, 0), 0);
916 /// Get the size of the widest type of the matrix multiplication operands
917 /// in bytes, including alignment padding.
919 /// @param MMI Parameters of the matrix multiplication operands.
920 /// @return The size of the widest type of the matrix multiplication operands
921 /// in bytes, including alignment padding.
922 static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI
) {
923 auto *S
= MMI
.A
->getStatement()->getParent();
924 auto &DL
= S
->getFunction().getParent()->getDataLayout();
925 auto ElementSizeA
= DL
.getTypeAllocSize(MMI
.A
->getElementType());
926 auto ElementSizeB
= DL
.getTypeAllocSize(MMI
.B
->getElementType());
927 auto ElementSizeC
= DL
.getTypeAllocSize(MMI
.WriteToC
->getElementType());
928 return std::max({ElementSizeA
, ElementSizeB
, ElementSizeC
});
931 /// Get the size of the widest type of the matrix multiplication operands
934 /// @param MMI Parameters of the matrix multiplication operands.
935 /// @return The size of the widest type of the matrix multiplication operands
937 static uint64_t getMatMulTypeSize(MatMulInfoTy MMI
) {
938 auto *S
= MMI
.A
->getStatement()->getParent();
939 auto &DL
= S
->getFunction().getParent()->getDataLayout();
940 auto ElementSizeA
= DL
.getTypeSizeInBits(MMI
.A
->getElementType());
941 auto ElementSizeB
= DL
.getTypeSizeInBits(MMI
.B
->getElementType());
942 auto ElementSizeC
= DL
.getTypeSizeInBits(MMI
.WriteToC
->getElementType());
943 return std::max({ElementSizeA
, ElementSizeB
, ElementSizeC
});
946 /// Get parameters of the BLIS micro kernel.
948 /// We choose the Mr and Nr parameters of the micro kernel to be large enough
949 /// such that no stalls caused by the combination of latencies and dependencies
950 /// are introduced during the updates of the resulting matrix of the matrix
951 /// multiplication. However, they should also be as small as possible to
952 /// release more registers for entries of multiplied matrices.
954 /// @param TTI Target Transform Info.
955 /// @param MMI Parameters of the matrix multiplication operands.
956 /// @return The structure of type MicroKernelParamsTy.
957 /// @see MicroKernelParamsTy
958 static struct MicroKernelParamsTy
959 getMicroKernelParams(const llvm::TargetTransformInfo
*TTI
, MatMulInfoTy MMI
) {
960 assert(TTI
&& "The target transform info should be provided.");
962 // Nvec - Number of double-precision floating-point numbers that can be hold
963 // by a vector register. Use 2 by default.
964 long RegisterBitwidth
= VectorRegisterBitwidth
;
966 if (RegisterBitwidth
== -1)
967 RegisterBitwidth
= TTI
->getRegisterBitWidth(true);
968 auto ElementSize
= getMatMulTypeSize(MMI
);
969 assert(ElementSize
> 0 && "The element size of the matrix multiplication "
970 "operands should be greater than zero.");
971 auto Nvec
= RegisterBitwidth
/ ElementSize
;
975 ceil(sqrt(Nvec
* LatencyVectorFma
* ThroughputVectorFma
) / Nvec
) * Nvec
;
976 int Mr
= ceil(Nvec
* LatencyVectorFma
* ThroughputVectorFma
/ Nr
);
980 /// Get parameters of the BLIS macro kernel.
982 /// During the computation of matrix multiplication, blocks of partitioned
983 /// matrices are mapped to different layers of the memory hierarchy.
984 /// To optimize data reuse, blocks should be ideally kept in cache between
985 /// iterations. Since parameters of the macro kernel determine sizes of these
986 /// blocks, there are upper and lower bounds on these parameters.
988 /// @param MicroKernelParams Parameters of the micro-kernel
989 /// to be taken into account.
990 /// @param MMI Parameters of the matrix multiplication operands.
991 /// @return The structure of type MacroKernelParamsTy.
992 /// @see MacroKernelParamsTy
993 /// @see MicroKernelParamsTy
994 static struct MacroKernelParamsTy
995 getMacroKernelParams(const MicroKernelParamsTy
&MicroKernelParams
,
997 // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
998 // it requires information about the first two levels of a cache to determine
999 // all the parameters of a macro-kernel. It also checks that an associativity
1000 // degree of a cache level is greater than two. Otherwise, another algorithm
1001 // for determination of the parameters should be used.
1002 if (!(MicroKernelParams
.Mr
> 0 && MicroKernelParams
.Nr
> 0 &&
1003 FirstCacheLevelSize
> 0 && SecondCacheLevelSize
> 0 &&
1004 FirstCacheLevelAssociativity
> 2 && SecondCacheLevelAssociativity
> 2))
1006 // The quotient should be greater than zero.
1007 if (PollyPatternMatchingNcQuotient
<= 0)
1010 (FirstCacheLevelAssociativity
- 1) /
1011 (1 + static_cast<double>(MicroKernelParams
.Nr
) / MicroKernelParams
.Mr
));
1013 // Car can be computed to be zero since it is floor to int.
1014 // On Mac OS, division by 0 does not raise a signal. This causes negative
1015 // tile sizes to be computed. Prevent division by Cac==0 by early returning
1020 auto ElementSize
= getMatMulAlignTypeSize(MMI
);
1021 assert(ElementSize
> 0 && "The element size of the matrix multiplication "
1022 "operands should be greater than zero.");
1023 int Kc
= (Car
* FirstCacheLevelSize
) /
1024 (MicroKernelParams
.Mr
* FirstCacheLevelAssociativity
* ElementSize
);
1026 static_cast<double>(Kc
* ElementSize
* SecondCacheLevelAssociativity
) /
1027 SecondCacheLevelSize
;
1028 int Mc
= floor((SecondCacheLevelAssociativity
- 2) / Cac
);
1029 int Nc
= PollyPatternMatchingNcQuotient
* MicroKernelParams
.Nr
;
1031 assert(Mc
> 0 && Nc
> 0 && Kc
> 0 &&
1032 "Matrix block sizes should be greater than zero");
1033 return {Mc
, Nc
, Kc
};
1036 /// Create an access relation that is specific to
1037 /// the matrix multiplication pattern.
1039 /// Create an access relation of the following form:
1040 /// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
1041 /// where I is @p FirstDim, J is @p SecondDim.
1043 /// It can be used, for example, to create relations that helps to consequently
1044 /// access elements of operands of a matrix multiplication after creation of
1045 /// the BLIS micro and macro kernels.
1047 /// @see ScheduleTreeOptimizer::createMicroKernel
1048 /// @see ScheduleTreeOptimizer::createMacroKernel
1050 /// Subsequently, the described access relation is applied to the range of
1051 /// @p MapOldIndVar, that is used to map original induction variables to
1052 /// the ones, which are produced by schedule transformations. It helps to
1053 /// define relations using a new space and, at the same time, keep them
1054 /// in the original one.
1056 /// @param MapOldIndVar The relation, which maps original induction variables
1057 /// to the ones, which are produced by schedule
1058 /// transformations.
1059 /// @param FirstDim, SecondDim The input dimensions that are used to define
1060 /// the specified access relation.
1061 /// @return The specified access relation.
1062 __isl_give isl_map
*getMatMulAccRel(__isl_take isl_map
*MapOldIndVar
,
1063 unsigned FirstDim
, unsigned SecondDim
) {
1064 auto *Ctx
= isl_map_get_ctx(MapOldIndVar
);
1065 auto *AccessRelSpace
= isl_space_alloc(Ctx
, 0, 9, 3);
1066 auto *AccessRel
= isl_map_universe(AccessRelSpace
);
1067 AccessRel
= isl_map_equate(AccessRel
, isl_dim_in
, FirstDim
, isl_dim_out
, 0);
1068 AccessRel
= isl_map_equate(AccessRel
, isl_dim_in
, 5, isl_dim_out
, 1);
1069 AccessRel
= isl_map_equate(AccessRel
, isl_dim_in
, SecondDim
, isl_dim_out
, 2);
1070 return isl_map_apply_range(MapOldIndVar
, AccessRel
);
1073 __isl_give isl_schedule_node
*
1074 createExtensionNode(__isl_take isl_schedule_node
*Node
,
1075 __isl_take isl_map
*ExtensionMap
) {
1076 auto *Extension
= isl_union_map_from_map(ExtensionMap
);
1077 auto *NewNode
= isl_schedule_node_from_extension(Extension
);
1078 return isl_schedule_node_graft_before(Node
, NewNode
);
1081 /// Apply the packing transformation.
1083 /// The packing transformation can be described as a data-layout
1084 /// transformation that requires to introduce a new array, copy data
1085 /// to the array, and change memory access locations to reference the array.
1086 /// It can be used to ensure that elements of the new array are read in-stride
1087 /// access, aligned to cache lines boundaries, and preloaded into certain cache
1090 /// As an example let us consider the packing of the array A that would help
1091 /// to read its elements with in-stride access. An access to the array A
1092 /// is represented by an access relation that has the form
1093 /// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
1094 /// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
1095 /// k mod Kc, j mod Nr, i mod Mr].
1097 /// To ensure that elements of the array A are read in-stride access, we add
1098 /// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
1099 /// Scop::createScopArrayInfo, change the access relation
1100 /// S[i, j, k] -> A[i, k] to
1101 /// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
1102 /// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
1103 /// the copy statement created by Scop::addScopStmt.
1105 /// @param Node The schedule node to be optimized.
1106 /// @param MapOldIndVar The relation, which maps original induction variables
1107 /// to the ones, which are produced by schedule
1108 /// transformations.
1109 /// @param MicroParams, MacroParams Parameters of the BLIS kernel
1110 /// to be taken into account.
1111 /// @param MMI Parameters of the matrix multiplication operands.
1112 /// @return The optimized schedule node.
1113 static __isl_give isl_schedule_node
*optimizeDataLayoutMatrMulPattern(
1114 __isl_take isl_schedule_node
*Node
, __isl_take isl_map
*MapOldIndVar
,
1115 MicroKernelParamsTy MicroParams
, MacroKernelParamsTy MacroParams
,
1116 MatMulInfoTy
&MMI
) {
1117 auto InputDimsId
= isl_map_get_tuple_id(MapOldIndVar
, isl_dim_in
);
1118 auto *Stmt
= static_cast<ScopStmt
*>(isl_id_get_user(InputDimsId
));
1119 isl_id_free(InputDimsId
);
1121 // Create a copy statement that corresponds to the memory access to the
1122 // matrix B, the second operand of the matrix multiplication.
1123 Node
= isl_schedule_node_parent(isl_schedule_node_parent(Node
));
1124 Node
= isl_schedule_node_parent(isl_schedule_node_parent(Node
));
1125 Node
= isl_schedule_node_parent(Node
);
1126 Node
= isl_schedule_node_child(isl_schedule_node_band_split(Node
, 2), 0);
1127 auto *AccRel
= getMatMulAccRel(isl_map_copy(MapOldIndVar
), 3, 7);
1128 unsigned FirstDimSize
= MacroParams
.Nc
/ MicroParams
.Nr
;
1129 unsigned SecondDimSize
= MacroParams
.Kc
;
1130 unsigned ThirdDimSize
= MicroParams
.Nr
;
1131 auto *SAI
= Stmt
->getParent()->createScopArrayInfo(
1132 MMI
.B
->getElementType(), "Packed_B",
1133 {FirstDimSize
, SecondDimSize
, ThirdDimSize
});
1134 AccRel
= isl_map_set_tuple_id(AccRel
, isl_dim_out
, SAI
->getBasePtrId());
1135 auto *OldAcc
= MMI
.B
->getLatestAccessRelation();
1136 MMI
.B
->setNewAccessRelation(AccRel
);
1138 isl_map_project_out(isl_map_copy(MapOldIndVar
), isl_dim_out
, 2,
1139 isl_map_dim(MapOldIndVar
, isl_dim_out
) - 2);
1140 ExtMap
= isl_map_reverse(ExtMap
);
1141 ExtMap
= isl_map_fix_si(ExtMap
, isl_dim_out
, MMI
.i
, 0);
1142 auto *Domain
= Stmt
->getDomain();
1144 // Restrict the domains of the copy statements to only execute when also its
1145 // originating statement is executed.
1146 auto *DomainId
= isl_set_get_tuple_id(Domain
);
1147 auto *NewStmt
= Stmt
->getParent()->addScopStmt(
1148 OldAcc
, MMI
.B
->getLatestAccessRelation(), isl_set_copy(Domain
));
1149 ExtMap
= isl_map_set_tuple_id(ExtMap
, isl_dim_out
, isl_id_copy(DomainId
));
1150 ExtMap
= isl_map_intersect_range(ExtMap
, isl_set_copy(Domain
));
1151 ExtMap
= isl_map_set_tuple_id(ExtMap
, isl_dim_out
, NewStmt
->getDomainId());
1152 Node
= createExtensionNode(Node
, ExtMap
);
1154 // Create a copy statement that corresponds to the memory access
1155 // to the matrix A, the first operand of the matrix multiplication.
1156 Node
= isl_schedule_node_child(Node
, 0);
1157 AccRel
= getMatMulAccRel(isl_map_copy(MapOldIndVar
), 4, 6);
1158 FirstDimSize
= MacroParams
.Mc
/ MicroParams
.Mr
;
1159 ThirdDimSize
= MicroParams
.Mr
;
1160 SAI
= Stmt
->getParent()->createScopArrayInfo(
1161 MMI
.A
->getElementType(), "Packed_A",
1162 {FirstDimSize
, SecondDimSize
, ThirdDimSize
});
1163 AccRel
= isl_map_set_tuple_id(AccRel
, isl_dim_out
, SAI
->getBasePtrId());
1164 OldAcc
= MMI
.A
->getLatestAccessRelation();
1165 MMI
.A
->setNewAccessRelation(AccRel
);
1166 ExtMap
= isl_map_project_out(MapOldIndVar
, isl_dim_out
, 3,
1167 isl_map_dim(MapOldIndVar
, isl_dim_out
) - 3);
1168 ExtMap
= isl_map_reverse(ExtMap
);
1169 ExtMap
= isl_map_fix_si(ExtMap
, isl_dim_out
, MMI
.j
, 0);
1170 NewStmt
= Stmt
->getParent()->addScopStmt(
1171 OldAcc
, MMI
.A
->getLatestAccessRelation(), isl_set_copy(Domain
));
1173 // Restrict the domains of the copy statements to only execute when also its
1174 // originating statement is executed.
1175 ExtMap
= isl_map_set_tuple_id(ExtMap
, isl_dim_out
, DomainId
);
1176 ExtMap
= isl_map_intersect_range(ExtMap
, Domain
);
1177 ExtMap
= isl_map_set_tuple_id(ExtMap
, isl_dim_out
, NewStmt
->getDomainId());
1178 Node
= createExtensionNode(Node
, ExtMap
);
1179 Node
= isl_schedule_node_child(isl_schedule_node_child(Node
, 0), 0);
1180 return isl_schedule_node_child(isl_schedule_node_child(Node
, 0), 0);
1183 /// Get a relation mapping induction variables produced by schedule
1184 /// transformations to the original ones.
1186 /// @param Node The schedule node produced as the result of creation
1187 /// of the BLIS kernels.
1188 /// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
1189 /// to be taken into account.
1190 /// @return The relation mapping original induction variables to the ones
1191 /// produced by schedule transformation.
1192 /// @see ScheduleTreeOptimizer::createMicroKernel
1193 /// @see ScheduleTreeOptimizer::createMacroKernel
1194 /// @see getMacroKernelParams
1195 __isl_give isl_map
*
1196 getInductionVariablesSubstitution(__isl_take isl_schedule_node
*Node
,
1197 MicroKernelParamsTy MicroKernelParams
,
1198 MacroKernelParamsTy MacroKernelParams
) {
1199 auto *Child
= isl_schedule_node_get_child(Node
, 0);
1200 auto *UnMapOldIndVar
= isl_schedule_node_get_prefix_schedule_union_map(Child
);
1201 isl_schedule_node_free(Child
);
1202 auto *MapOldIndVar
= isl_map_from_union_map(UnMapOldIndVar
);
1203 if (isl_map_dim(MapOldIndVar
, isl_dim_out
) > 9)
1205 isl_map_project_out(MapOldIndVar
, isl_dim_out
, 0,
1206 isl_map_dim(MapOldIndVar
, isl_dim_out
) - 9);
1207 return MapOldIndVar
;
1210 /// Isolate a set of partial tile prefixes and unroll the isolated part.
1212 /// The set should ensure that it contains only partial tile prefixes that have
1213 /// exactly Mr x Nr iterations of the two innermost loops produced by
1214 /// the optimization of the matrix multiplication. Mr and Nr are parameters of
1215 /// the micro-kernel.
1217 /// In case of parametric bounds, this helps to auto-vectorize the unrolled
1218 /// innermost loops, using the SLP vectorizer.
1220 /// @param Node The schedule node to be modified.
1221 /// @param MicroKernelParams Parameters of the micro-kernel
1222 /// to be taken into account.
1223 /// @return The modified isl_schedule_node.
1224 static isl::schedule_node
1225 isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node
,
1226 struct MicroKernelParamsTy MicroKernelParams
) {
1227 isl::schedule_node Child
= Node
.get_child(0);
1228 isl::union_map UnMapOldIndVar
= Child
.get_prefix_schedule_relation();
1229 isl::set Prefix
= isl::map::from_union_map(UnMapOldIndVar
).range();
1230 unsigned Dims
= Prefix
.dim(isl::dim::set
);
1231 Prefix
= Prefix
.project_out(isl::dim::set
, Dims
- 1, 1);
1232 Prefix
= getPartialTilePrefixes(Prefix
, MicroKernelParams
.Nr
);
1233 Prefix
= getPartialTilePrefixes(Prefix
, MicroKernelParams
.Mr
);
1235 isl::union_set IsolateOption
=
1236 getIsolateOptions(Prefix
.add_dims(isl::dim::set
, 3), 3);
1237 isl::ctx Ctx
= Node
.get_ctx();
1238 isl::union_set AtomicOption
= getAtomicOptions(Ctx
);
1239 isl::union_set Options
= IsolateOption
.unite(AtomicOption
);
1240 Options
= Options
.unite(getUnrollIsolatedSetOptions(Ctx
));
1241 Node
= Node
.band_set_ast_build_options(Options
);
1242 Node
= Node
.parent().parent();
1243 IsolateOption
= getIsolateOptions(Prefix
, 3);
1244 Options
= IsolateOption
.unite(AtomicOption
);
1245 Node
= Node
.band_set_ast_build_options(Options
);
1246 Node
= Node
.child(0).child(0);
1250 /// Mark @p BasePtr with "Inter iteration alias-free" mark node.
1252 /// @param Node The child of the mark node to be inserted.
1253 /// @param BasePtr The pointer to be marked.
1254 /// @return The modified isl_schedule_node.
1255 static isl_schedule_node
*markInterIterationAliasFree(isl_schedule_node
*Node
,
1256 llvm::Value
*BasePtr
) {
1260 auto *Ctx
= isl_schedule_node_get_ctx(Node
);
1261 auto *Id
= isl_id_alloc(Ctx
, "Inter iteration alias-free", BasePtr
);
1262 return isl_schedule_node_child(isl_schedule_node_insert_mark(Node
, Id
), 0);
1265 /// Restore the initial ordering of dimensions of the band node
1267 /// In case the band node represents all the dimensions of the iteration
1268 /// domain, recreate the band node to restore the initial ordering of the
1271 /// @param Node The band node to be modified.
1272 /// @return The modified schedule node.
1274 isl::schedule_node
getBandNodeWithOriginDimOrder(isl::schedule_node Node
) {
1275 assert(isl_schedule_node_get_type(Node
.keep()) == isl_schedule_node_band
);
1276 if (isl_schedule_node_get_type(Node
.child(0).keep()) !=
1277 isl_schedule_node_leaf
)
1279 auto Domain
= isl::manage(isl_schedule_node_get_universe_domain(Node
.keep()));
1280 assert(isl_union_set_n_set(Domain
.keep()) == 1);
1281 if (isl_schedule_node_get_schedule_depth(Node
.keep()) != 0 ||
1282 (isl::set(isl::manage(Domain
.copy())).dim(isl::dim::set
) !=
1283 isl_schedule_node_band_n_member(Node
.keep())))
1285 Node
= isl::manage(isl_schedule_node_delete(Node
.take()));
1286 auto PartialSchedulePwAff
=
1287 isl::manage(isl_union_set_identity_union_pw_multi_aff(Domain
.take()));
1288 auto PartialScheduleMultiPwAff
=
1289 isl::multi_union_pw_aff(PartialSchedulePwAff
);
1290 PartialScheduleMultiPwAff
= isl::manage(isl_multi_union_pw_aff_reset_tuple_id(
1291 PartialScheduleMultiPwAff
.take(), isl_dim_set
));
1292 return isl::manage(isl_schedule_node_insert_partial_schedule(
1293 Node
.take(), PartialScheduleMultiPwAff
.take()));
1297 __isl_give isl_schedule_node
*ScheduleTreeOptimizer::optimizeMatMulPattern(
1298 __isl_take isl_schedule_node
*Node
, const llvm::TargetTransformInfo
*TTI
,
1299 MatMulInfoTy
&MMI
) {
1300 assert(TTI
&& "The target transform info should be provided.");
1301 Node
= markInterIterationAliasFree(
1302 Node
, MMI
.WriteToC
->getLatestScopArrayInfo()->getBasePtr());
1303 int DimOutNum
= isl_schedule_node_band_n_member(Node
);
1304 assert(DimOutNum
> 2 && "In case of the matrix multiplication the loop nest "
1305 "and, consequently, the corresponding scheduling "
1306 "functions have at least three dimensions.");
1307 Node
= getBandNodeWithOriginDimOrder(isl::manage(Node
)).take();
1308 Node
= permuteBandNodeDimensions(Node
, MMI
.i
, DimOutNum
- 3);
1309 int NewJ
= MMI
.j
== DimOutNum
- 3 ? MMI
.i
: MMI
.j
;
1310 int NewK
= MMI
.k
== DimOutNum
- 3 ? MMI
.i
: MMI
.k
;
1311 Node
= permuteBandNodeDimensions(Node
, NewJ
, DimOutNum
- 2);
1312 NewK
= NewK
== DimOutNum
- 2 ? NewJ
: NewK
;
1313 Node
= permuteBandNodeDimensions(Node
, NewK
, DimOutNum
- 1);
1314 auto MicroKernelParams
= getMicroKernelParams(TTI
, MMI
);
1315 auto MacroKernelParams
= getMacroKernelParams(MicroKernelParams
, MMI
);
1316 Node
= createMacroKernel(Node
, MacroKernelParams
);
1317 Node
= createMicroKernel(Node
, MicroKernelParams
);
1318 if (MacroKernelParams
.Mc
== 1 || MacroKernelParams
.Nc
== 1 ||
1319 MacroKernelParams
.Kc
== 1)
1321 auto *MapOldIndVar
= getInductionVariablesSubstitution(
1322 Node
, MicroKernelParams
, MacroKernelParams
);
1326 isolateAndUnrollMatMulInnerLoops(give(Node
), MicroKernelParams
).release();
1327 return optimizeDataLayoutMatrMulPattern(Node
, MapOldIndVar
, MicroKernelParams
,
1328 MacroKernelParams
, MMI
);
1331 bool ScheduleTreeOptimizer::isMatrMultPattern(
1332 __isl_keep isl_schedule_node
*Node
, const Dependences
*D
,
1333 MatMulInfoTy
&MMI
) {
1334 auto *PartialSchedule
=
1335 isl_schedule_node_band_get_partial_schedule_union_map(Node
);
1336 Node
= isl_schedule_node_child(Node
, 0);
1337 auto LeafType
= isl_schedule_node_get_type(Node
);
1338 Node
= isl_schedule_node_parent(Node
);
1339 if (LeafType
!= isl_schedule_node_leaf
||
1340 isl_schedule_node_band_n_member(Node
) < 3 ||
1341 isl_schedule_node_get_schedule_depth(Node
) != 0 ||
1342 isl_union_map_n_map(PartialSchedule
) != 1) {
1343 isl_union_map_free(PartialSchedule
);
1346 auto *NewPartialSchedule
= isl_map_from_union_map(PartialSchedule
);
1347 if (containsMatrMult(NewPartialSchedule
, D
, MMI
)) {
1348 isl_map_free(NewPartialSchedule
);
1351 isl_map_free(NewPartialSchedule
);
1355 __isl_give isl_schedule_node
*
1356 ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node
*Node
,
1358 if (!isTileableBandNode(Node
))
1361 const OptimizerAdditionalInfoTy
*OAI
=
1362 static_cast<const OptimizerAdditionalInfoTy
*>(User
);
1365 if (PMBasedOpts
&& User
&& isMatrMultPattern(Node
, OAI
->D
, MMI
)) {
1366 DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
1367 return optimizeMatMulPattern(Node
, OAI
->TTI
, MMI
);
1370 return standardBandOpts(Node
, User
);
1373 __isl_give isl_schedule
*
1374 ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule
*Schedule
,
1375 const OptimizerAdditionalInfoTy
*OAI
) {
1376 isl_schedule_node
*Root
= isl_schedule_get_root(Schedule
);
1377 Root
= optimizeScheduleNode(Root
, OAI
);
1378 isl_schedule_free(Schedule
);
1379 auto S
= isl_schedule_node_get_schedule(Root
);
1380 isl_schedule_node_free(Root
);
1384 __isl_give isl_schedule_node
*ScheduleTreeOptimizer::optimizeScheduleNode(
1385 __isl_take isl_schedule_node
*Node
, const OptimizerAdditionalInfoTy
*OAI
) {
1386 Node
= isl_schedule_node_map_descendant_bottom_up(
1387 Node
, optimizeBand
, const_cast<void *>(static_cast<const void *>(OAI
)));
1391 bool ScheduleTreeOptimizer::isProfitableSchedule(
1392 Scop
&S
, __isl_keep isl_schedule
*NewSchedule
) {
1393 // To understand if the schedule has been optimized we check if the schedule
1394 // has changed at all.
1395 // TODO: We can improve this by tracking if any necessarily beneficial
1396 // transformations have been performed. This can e.g. be tiling, loop
1397 // interchange, or ...) We can track this either at the place where the
1398 // transformation has been performed or, in case of automatic ILP based
1399 // optimizations, by comparing (yet to be defined) performance metrics
1400 // before/after the scheduling optimizer
1401 // (e.g., #stride-one accesses)
1402 if (S
.containsExtensionNode(NewSchedule
))
1404 auto *NewScheduleMap
= isl_schedule_get_map(NewSchedule
);
1405 isl_union_map
*OldSchedule
= S
.getSchedule();
1406 assert(OldSchedule
&& "Only IslScheduleOptimizer can insert extension nodes "
1407 "that make Scop::getSchedule() return nullptr.");
1408 bool changed
= !isl_union_map_is_equal(OldSchedule
, NewScheduleMap
);
1409 isl_union_map_free(OldSchedule
);
1410 isl_union_map_free(NewScheduleMap
);
/// ScopPass that computes a new, optimized schedule tree for a SCoP and
/// installs it via Scop::setScheduleTree (see runOnScop below).
///
/// NOTE(review): this extract is truncated — the access specifier(s)
/// (`public:`), the `static char ID;` declaration (its out-of-line definition
/// appears below) and the closing `};` of the class did not survive
/// extraction; restore them before compiling.
class IslScheduleOptimizer : public ScopPass {
  // Start with no cached schedule; runOnScop fills it in.
  explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }

  ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    isl_schedule_free(LastSchedule);
    LastSchedule = nullptr;
  }

  // The most recently computed schedule, owned by this pass and printed by
  // printScop.
  isl_schedule *LastSchedule;
// Pass identification, replacement for typeid.
char IslScheduleOptimizer::ID = 0;
1444 bool IslScheduleOptimizer::runOnScop(Scop
&S
) {
1446 // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
1447 if (S
.isToBeSkipped())
1450 // Skip empty SCoPs but still allow code generation as it will delete the
1451 // loops present but not needed.
1452 if (S
.getSize() == 0) {
1453 S
.markAsOptimized();
1457 const Dependences
&D
=
1458 getAnalysis
<DependenceInfo
>().getDependences(Dependences::AL_Statement
);
1460 if (!D
.hasValidDependences())
1463 isl_schedule_free(LastSchedule
);
1464 LastSchedule
= nullptr;
1466 // Build input data.
1468 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
1471 if (OptimizeDeps
== "all")
1473 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
1474 else if (OptimizeDeps
== "raw")
1475 ProximityKinds
= Dependences::TYPE_RAW
;
1477 errs() << "Do not know how to optimize for '" << OptimizeDeps
<< "'"
1478 << " Falling back to optimizing all dependences.\n";
1480 Dependences::TYPE_RAW
| Dependences::TYPE_WAR
| Dependences::TYPE_WAW
;
1483 isl::union_set Domain
= give(S
.getDomains());
1488 isl::union_map Validity
= give(D
.getDependences(ValidityKinds
));
1489 isl::union_map Proximity
= give(D
.getDependences(ProximityKinds
));
1491 // Simplify the dependences by removing the constraints introduced by the
1492 // domains. This can speed up the scheduling time significantly, as large
1493 // constant coefficients will be removed from the dependences. The
1494 // introduction of some additional dependences reduces the possible
1495 // transformations, but in most cases, such transformation do not seem to be
1496 // interesting anyway. In some cases this option may stop the scheduler to
1497 // find any schedule.
1498 if (SimplifyDeps
== "yes") {
1499 Validity
= Validity
.gist_domain(Domain
);
1500 Validity
= Validity
.gist_range(Domain
);
1501 Proximity
= Proximity
.gist_domain(Domain
);
1502 Proximity
= Proximity
.gist_range(Domain
);
1503 } else if (SimplifyDeps
!= "no") {
1504 errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
1505 "or 'no'. Falling back to default: 'yes'\n";
1508 DEBUG(dbgs() << "\n\nCompute schedule from: ");
1509 DEBUG(dbgs() << "Domain := " << Domain
<< ";\n");
1510 DEBUG(dbgs() << "Proximity := " << Proximity
<< ";\n");
1511 DEBUG(dbgs() << "Validity := " << Validity
<< ";\n");
1513 unsigned IslSerializeSCCs
;
1515 if (FusionStrategy
== "max") {
1516 IslSerializeSCCs
= 0;
1517 } else if (FusionStrategy
== "min") {
1518 IslSerializeSCCs
= 1;
1520 errs() << "warning: Unknown fusion strategy. Falling back to maximal "
1522 IslSerializeSCCs
= 0;
1525 int IslMaximizeBands
;
1527 if (MaximizeBandDepth
== "yes") {
1528 IslMaximizeBands
= 1;
1529 } else if (MaximizeBandDepth
== "no") {
1530 IslMaximizeBands
= 0;
1532 errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
1533 " or 'no'. Falling back to default: 'yes'\n";
1534 IslMaximizeBands
= 1;
1537 int IslOuterCoincidence
;
1539 if (OuterCoincidence
== "yes") {
1540 IslOuterCoincidence
= 1;
1541 } else if (OuterCoincidence
== "no") {
1542 IslOuterCoincidence
= 0;
1544 errs() << "warning: Option -polly-opt-outer-coincidence should either be "
1545 "'yes' or 'no'. Falling back to default: 'no'\n";
1546 IslOuterCoincidence
= 0;
1549 isl_ctx
*Ctx
= S
.getIslCtx();
1551 isl_options_set_schedule_outer_coincidence(Ctx
, IslOuterCoincidence
);
1552 isl_options_set_schedule_serialize_sccs(Ctx
, IslSerializeSCCs
);
1553 isl_options_set_schedule_maximize_band_depth(Ctx
, IslMaximizeBands
);
1554 isl_options_set_schedule_max_constant_term(Ctx
, MaxConstantTerm
);
1555 isl_options_set_schedule_max_coefficient(Ctx
, MaxCoefficient
);
1556 isl_options_set_tile_scale_tile_loops(Ctx
, 0);
1558 auto OnErrorStatus
= isl_options_get_on_error(Ctx
);
1559 isl_options_set_on_error(Ctx
, ISL_ON_ERROR_CONTINUE
);
1561 auto SC
= isl::schedule_constraints::on_domain(Domain
);
1562 SC
= SC
.set_proximity(Proximity
);
1563 SC
= SC
.set_validity(Validity
);
1564 SC
= SC
.set_coincidence(Validity
);
1565 isl_schedule
*Schedule
;
1566 Schedule
= SC
.compute_schedule().release();
1567 isl_options_set_on_error(Ctx
, OnErrorStatus
);
  // If the scheduler was not able to optimize the code, we just leave the
  // original schedule untouched.
1575 auto *P
= isl_printer_to_str(Ctx
);
1576 P
= isl_printer_set_yaml_style(P
, ISL_YAML_STYLE_BLOCK
);
1577 P
= isl_printer_print_schedule(P
, Schedule
);
1578 auto *str
= isl_printer_get_str(P
);
1579 dbgs() << "NewScheduleTree: \n" << str
<< "\n";
1581 isl_printer_free(P
);
1584 Function
&F
= S
.getFunction();
1585 auto *TTI
= &getAnalysis
<TargetTransformInfoWrapperPass
>().getTTI(F
);
1586 const OptimizerAdditionalInfoTy OAI
= {TTI
, const_cast<Dependences
*>(&D
)};
1587 isl_schedule
*NewSchedule
=
1588 ScheduleTreeOptimizer::optimizeSchedule(Schedule
, &OAI
);
1590 if (!ScheduleTreeOptimizer::isProfitableSchedule(S
, NewSchedule
)) {
1591 isl_schedule_free(NewSchedule
);
1595 S
.setScheduleTree(NewSchedule
);
1596 S
.markAsOptimized();
1604 void IslScheduleOptimizer::printScop(raw_ostream
&OS
, Scop
&) const {
1608 OS
<< "Calculated schedule:\n";
1610 if (!LastSchedule
) {
1615 p
= isl_printer_to_str(isl_schedule_get_ctx(LastSchedule
));
1616 p
= isl_printer_print_schedule(p
, LastSchedule
);
1617 ScheduleStr
= isl_printer_get_str(p
);
1618 isl_printer_free(p
);
1620 OS
<< ScheduleStr
<< "\n";
1623 void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage
&AU
) const {
1624 ScopPass::getAnalysisUsage(AU
);
1625 AU
.addRequired
<DependenceInfo
>();
1626 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
1629 Pass
*polly::createIslScheduleOptimizerPass() {
1630 return new IslScheduleOptimizer();
1633 INITIALIZE_PASS_BEGIN(IslScheduleOptimizer
, "polly-opt-isl",
1634 "Polly - Optimize schedule of SCoP", false, false);
1635 INITIALIZE_PASS_DEPENDENCY(DependenceInfo
);
1636 INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass
);
1637 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
);
1638 INITIALIZE_PASS_END(IslScheduleOptimizer
, "polly-opt-isl",
1639 "Polly - Optimize schedule of SCoP", false, false)