//===------ polly/ScheduleOptimizer.h - The Schedule Optimizer -*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
12 #ifndef POLLY_SCHEDULE_OPTIMIZER_H
13 #define POLLY_SCHEDULE_OPTIMIZER_H
15 #include "polly/DependenceInfo.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
// Opaque isl schedule-tree node; the full definition lives in the isl
// headers. Forward-declared here so the raw-pointer APIs below compile
// without pulling in isl.
struct isl_schedule_node;
/// Parameters of the micro kernel.
///
/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
/// used in the optimized matrix multiplication.
///
/// NOTE(review): the struct body and closing brace were missing from this
/// chunk (unclosed brace); the field set below was reconstructed from the
/// description above — confirm against upstream Polly.
struct MicroKernelParamsTy {
  int Mr; ///< Number of rows of the rank-1 (outer product) update.
  int Nr; ///< Number of columns of the rank-1 (outer product) update.
};
/// Parameters of the macro kernel.
///
/// Parameters, which determine sizes of blocks of partitioned matrices
/// used in the optimized matrix multiplication.
///
/// NOTE(review): the struct body and closing brace were missing from this
/// chunk (unclosed brace); the field set below was reconstructed from the
/// description above — confirm against upstream Polly.
struct MacroKernelParamsTy {
  int Mc; ///< Block size of the first matrix dimension.
  int Nc; ///< Block size of the second matrix dimension.
  int Kc; ///< Block size of the common (reduction) dimension.
};
46 /// Additional parameters of the schedule optimizer.
48 /// Target Transform Info and the SCoP dependencies used by the schedule
51 struct OptimizerAdditionalInfoTy
{
52 const llvm::TargetTransformInfo
*TTI
;
56 /// Parameters of the matrix multiplication operands.
58 /// Parameters, which describe access relations that represent operands of the
59 /// matrix multiplication.
62 MemoryAccess
*A
= nullptr;
63 MemoryAccess
*B
= nullptr;
64 MemoryAccess
*ReadFromC
= nullptr;
65 MemoryAccess
*WriteToC
= nullptr;
/// Global flag that disables Polly's tiling transformation.
/// NOTE(review): presumably bound to a command-line option in the
/// implementation file — confirm.
extern bool DisablePollyTiling;
75 class ScheduleTreeOptimizer
{
77 /// Apply schedule tree transformations.
79 /// This function takes an (possibly already optimized) schedule tree and
80 /// applies a set of additional optimizations on the schedule tree. The
81 /// transformations applied include:
84 /// - Prevectorization
86 /// @param Schedule The schedule object the transformations will be applied
88 /// @param OAI Target Transform Info and the SCoP dependencies.
89 /// @returns The transformed schedule.
90 static __isl_give isl_schedule
*
91 optimizeSchedule(__isl_take isl_schedule
*Schedule
,
92 const polly::OptimizerAdditionalInfoTy
*OAI
= nullptr);
94 /// Apply schedule tree transformations.
96 /// This function takes a node in an (possibly already optimized) schedule
97 /// tree and applies a set of additional optimizations on this schedule tree
98 /// node and its descendants. The transformations applied include:
101 /// - Prevectorization
103 /// @param Node The schedule object post-transformations will be applied to.
104 /// @param OAI Target Transform Info and the SCoP dependencies.
105 /// @returns The transformed schedule.
106 static __isl_give isl_schedule_node
*
107 optimizeScheduleNode(__isl_take isl_schedule_node
*Node
,
108 const polly::OptimizerAdditionalInfoTy
*OAI
= nullptr);
110 /// Decide if the @p NewSchedule is profitable for @p S.
112 /// @param S The SCoP we optimize.
113 /// @param NewSchedule The new schedule we computed.
115 /// @return True, if we believe @p NewSchedule is an improvement for @p S.
116 static bool isProfitableSchedule(polly::Scop
&S
,
117 __isl_keep isl_schedule
*NewSchedule
);
119 /// Isolate a set of partial tile prefixes.
121 /// This set should ensure that it contains only partial tile prefixes that
122 /// have exactly VectorWidth iterations.
124 /// @param Node A schedule node band, which is a parent of a band node,
125 /// that contains a vector loop.
126 /// @return Modified isl_schedule_node.
127 static isl::schedule_node
isolateFullPartialTiles(isl::schedule_node Node
,
131 /// Tile a schedule node.
133 /// @param Node The node to tile.
134 /// @param Identifier An name that identifies this kind of tiling and
135 /// that is used to mark the tiled loops in the
137 /// @param TileSizes A vector of tile sizes that should be used for
139 /// @param DefaultTileSize A default tile size that is used for dimensions
140 /// that are not covered by the TileSizes vector.
141 static __isl_give isl_schedule_node
*
142 tileNode(__isl_take isl_schedule_node
*Node
, const char *Identifier
,
143 llvm::ArrayRef
<int> TileSizes
, int DefaultTileSize
);
145 /// Tile a schedule node and unroll point loops.
147 /// @param Node The node to register tile.
148 /// @param TileSizes A vector of tile sizes that should be used for
150 /// @param DefaultTileSize A default tile size that is used for dimensions
151 static __isl_give isl_schedule_node
*
152 applyRegisterTiling(__isl_take isl_schedule_node
*Node
,
153 llvm::ArrayRef
<int> TileSizes
, int DefaultTileSize
);
155 /// Apply the BLIS matmul optimization pattern.
157 /// Make the loops containing the matrix multiplication be the innermost
158 /// loops and apply the BLIS matmul optimization pattern. BLIS implements
159 /// gemm as three nested loops around a macro-kernel, plus two packing
160 /// routines. The macro-kernel is implemented in terms of two additional
161 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
162 /// (i.e., outer product) update.
164 /// For a detailed description please see [1].
166 /// The order of the loops defines the data reused in the BLIS implementation
167 /// of gemm ([1]). In particular, elements of the matrix B, the second
168 /// operand of matrix multiplication, are reused between iterations of the
169 /// innermost loop. To keep the reused data in cache, only elements of matrix
170 /// A, the first operand of matrix multiplication, should be evicted during
171 /// an iteration of the innermost loop. To provide such a cache replacement
172 /// policy, elements of the matrix A can, in particular, be loaded first and,
173 /// consequently, be least-recently-used.
175 /// In our case matrices are stored in row-major order instead of
176 /// column-major order used in the BLIS implementation ([1]). It affects only
177 /// on the form of the BLIS micro kernel and the computation of its
178 /// parameters. In particular, reused elements of the matrix B are
179 /// successively multiplied by specific elements of the matrix A.
182 /// [1] - Analytical Modeling is Enough for High Performance BLIS
183 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
184 /// Technical Report, 2014
185 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
187 /// @see ScheduleTreeOptimizer::createMicroKernel
188 /// @see ScheduleTreeOptimizer::createMacroKernel
189 /// @see getMicroKernelParams
190 /// @see getMacroKernelParams
192 /// TODO: Implement the packing transformation.
194 /// @param Node The node that contains a band to be optimized. The node
195 /// is required to successfully pass
196 /// ScheduleTreeOptimizer::isMatrMultPattern.
197 /// @param TTI Target Transform Info.
198 /// @param MMI Parameters of the matrix multiplication operands.
199 /// @returns The transformed schedule.
200 static __isl_give isl_schedule_node
*
201 optimizeMatMulPattern(__isl_take isl_schedule_node
*Node
,
202 const llvm::TargetTransformInfo
*TTI
,
203 polly::MatMulInfoTy
&MMI
);
205 /// Check if this node is a band node we want to tile.
207 /// We look for innermost band nodes where individual dimensions are marked as
210 /// @param Node The node to check.
211 static bool isTileableBandNode(__isl_keep isl_schedule_node
*Node
);
213 /// Pre-vectorizes one scheduling dimension of a schedule band.
215 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
216 /// sinks the resulting point loop.
218 /// Example (DimToVectorize=0, VectorWidth=4):
220 /// | Before transformation:
222 /// | A[i,j] -> [i,j]
224 /// | for (i = 0; i < 128; i++)
225 /// | for (j = 0; j < 128; j++)
228 /// | After transformation:
230 /// | for (it = 0; it < 32; it+=1)
231 /// | for (j = 0; j < 128; j++)
232 /// | for (ip = 0; ip <= 3; ip++)
233 /// | A(4 * it + ip,j);
235 /// The goal of this transformation is to create a trivially vectorizable
236 /// loop. This means a parallel loop at the innermost level that has a
237 /// constant number of iterations corresponding to the target vector width.
239 /// This transformation creates a loop at the innermost level. The loop has
240 /// a constant number of iterations, if the number of loop iterations at
241 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
242 /// currently constant and not yet target specific. This function does not
243 /// reason about parallelism.
244 static __isl_give isl_schedule_node
*
245 prevectSchedBand(__isl_take isl_schedule_node
*Node
, unsigned DimToVectorize
,
248 /// Apply additional optimizations on the bands in the schedule tree.
250 /// We are looking for an innermost band node and apply the following
254 /// - if the band is tileable
255 /// - if the band has more than one loop dimension
257 /// - Prevectorize the schedule of the band (or the point loop in case of
259 /// - if vectorization is enabled
261 /// @param Node The schedule node to (possibly) optimize.
262 /// @param User A pointer to forward some use information
263 /// (currently unused).
264 static isl_schedule_node
*optimizeBand(isl_schedule_node
*Node
, void *User
);
266 /// Apply additional optimizations on the bands in the schedule tree.
268 /// We apply the following
272 /// - Prevectorize the schedule of the band (or the point loop in case of
274 /// - if vectorization is enabled
276 /// @param Node The schedule node to (possibly) optimize.
277 /// @param User A pointer to forward some use information
278 /// (currently unused).
279 static isl_schedule_node
*standardBandOpts(__isl_take isl_schedule_node
*Node
,
282 /// Check if this node contains a partial schedule that could
283 /// probably be optimized with analytical modeling.
285 /// isMatrMultPattern tries to determine whether the following conditions
287 /// 1. the partial schedule contains only one statement.
288 /// 2. there are exactly three input dimensions.
289 /// 3. all memory accesses of the statement will have stride 0 or 1, if we
290 /// interchange loops (switch the variable used in the inner loop to
292 /// 4. all memory accesses of the statement except from the last one, are
293 /// read memory access and the last one is write memory access.
294 /// 5. all subscripts of the last memory access of the statement don't
295 /// contain the variable used in the inner loop.
296 /// If this is the case, we could try to use an approach that is similar to
297 /// the one used to get close-to-peak performance of matrix multiplications.
299 /// @param Node The node to check.
300 /// @param D The SCoP dependencies.
301 /// @param MMI Parameters of the matrix multiplication operands.
302 static bool isMatrMultPattern(__isl_keep isl_schedule_node
*Node
,
303 const polly::Dependences
*D
,
304 polly::MatMulInfoTy
&MMI
);
306 /// Create the BLIS macro-kernel.
308 /// We create the BLIS macro-kernel by applying a combination of tiling
309 /// of dimensions of the band node and interchanging of two innermost
310 /// modified dimensions. The values of of MacroKernelParams's fields are used
313 /// @param Node The schedule node to be modified.
314 /// @param MacroKernelParams Parameters of the macro kernel
315 /// to be used as tile sizes.
316 static __isl_give isl_schedule_node
*
317 createMacroKernel(__isl_take isl_schedule_node
*Node
,
318 MacroKernelParamsTy MacroKernelParams
);
320 /// Create the BLIS macro-kernel.
322 /// We create the BLIS macro-kernel by applying a combination of tiling
323 /// of dimensions of the band node and interchanging of two innermost
324 /// modified dimensions. The values passed in MicroKernelParam are used
327 /// @param Node The schedule node to be modified.
328 /// @param MicroKernelParams Parameters of the micro kernel
329 /// to be used as tile sizes.
330 /// @see MicroKernelParamsTy
331 static __isl_give isl_schedule_node
*
332 createMicroKernel(__isl_take isl_schedule_node
*Node
,
333 MicroKernelParamsTy MicroKernelParams
);