[ScheduleOptimizer] Move isolateFullPartialTiles and isolateAndUnrollMatMulInnerLoops...
[polly-mirror.git] / include / polly / ScheduleOptimizer.h
blob0b59a7d4e272a9ba830d66760a251e08d2895059
1 //===------ polly/ScheduleOptimizer.h - The Schedule Optimizer *- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 //===----------------------------------------------------------------------===//
12 #ifndef POLLY_SCHEDULE_OPTIMIZER_H
13 #define POLLY_SCHEDULE_OPTIMIZER_H
15 #include "polly/DependenceInfo.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "isl/ctx.h"
20 struct isl_schedule;
21 struct isl_schedule_node;
22 struct isl_union_map;
24 /// Parameters of the micro kernel.
25 ///
26 /// Parameters, which determine sizes of rank-1 (i.e., outer product) update
27 /// used in the optimized matrix multiplication.
28 ///
29 struct MicroKernelParamsTy {
30 int Mr;
31 int Nr;
34 /// Parameters of the macro kernel.
35 ///
36 /// Parameters, which determine sizes of blocks of partitioned matrices
37 /// used in the optimized matrix multiplication.
38 ///
39 struct MacroKernelParamsTy {
40 int Mc;
41 int Nc;
42 int Kc;
45 namespace polly {
46 /// Additional parameters of the schedule optimizer.
47 ///
48 /// Target Transform Info and the SCoP dependencies used by the schedule
49 /// optimizer.
50 ///
51 struct OptimizerAdditionalInfoTy {
52 const llvm::TargetTransformInfo *TTI;
53 const Dependences *D;
56 /// Parameters of the matrix multiplication operands.
57 ///
58 /// Parameters, which describe access relations that represent operands of the
59 /// matrix multiplication.
60 ///
61 struct MatMulInfoTy {
62 MemoryAccess *A = nullptr;
63 MemoryAccess *B = nullptr;
64 MemoryAccess *ReadFromC = nullptr;
65 MemoryAccess *WriteToC = nullptr;
66 int i = -1;
67 int j = -1;
68 int k = -1;
71 extern bool DisablePollyTiling;
72 class Scop;
73 } // namespace polly
75 class ScheduleTreeOptimizer {
76 public:
77 /// Apply schedule tree transformations.
78 ///
79 /// This function takes an (possibly already optimized) schedule tree and
80 /// applies a set of additional optimizations on the schedule tree. The
81 /// transformations applied include:
82 ///
83 /// - Tiling
84 /// - Prevectorization
85 ///
86 /// @param Schedule The schedule object the transformations will be applied
87 /// to.
88 /// @param OAI Target Transform Info and the SCoP dependencies.
89 /// @returns The transformed schedule.
90 static __isl_give isl_schedule *
91 optimizeSchedule(__isl_take isl_schedule *Schedule,
92 const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
94 /// Apply schedule tree transformations.
95 ///
96 /// This function takes a node in an (possibly already optimized) schedule
97 /// tree and applies a set of additional optimizations on this schedule tree
98 /// node and its descendants. The transformations applied include:
99 ///
100 /// - Tiling
101 /// - Prevectorization
103 /// @param Node The schedule object post-transformations will be applied to.
104 /// @param OAI Target Transform Info and the SCoP dependencies.
105 /// @returns The transformed schedule.
106 static __isl_give isl_schedule_node *
107 optimizeScheduleNode(__isl_take isl_schedule_node *Node,
108 const polly::OptimizerAdditionalInfoTy *OAI = nullptr);
110 /// Decide if the @p NewSchedule is profitable for @p S.
112 /// @param S The SCoP we optimize.
113 /// @param NewSchedule The new schedule we computed.
115 /// @return True, if we believe @p NewSchedule is an improvement for @p S.
116 static bool isProfitableSchedule(polly::Scop &S,
117 __isl_keep isl_schedule *NewSchedule);
119 /// Isolate a set of partial tile prefixes.
121 /// This set should ensure that it contains only partial tile prefixes that
122 /// have exactly VectorWidth iterations.
124 /// @param Node A schedule node band, which is a parent of a band node,
125 /// that contains a vector loop.
126 /// @return Modified isl_schedule_node.
127 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
128 int VectorWidth);
130 private:
131 /// Tile a schedule node.
133 /// @param Node The node to tile.
134 /// @param Identifier An name that identifies this kind of tiling and
135 /// that is used to mark the tiled loops in the
136 /// generated AST.
137 /// @param TileSizes A vector of tile sizes that should be used for
138 /// tiling.
139 /// @param DefaultTileSize A default tile size that is used for dimensions
140 /// that are not covered by the TileSizes vector.
141 static __isl_give isl_schedule_node *
142 tileNode(__isl_take isl_schedule_node *Node, const char *Identifier,
143 llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
145 /// Tile a schedule node and unroll point loops.
147 /// @param Node The node to register tile.
148 /// @param TileSizes A vector of tile sizes that should be used for
149 /// tiling.
150 /// @param DefaultTileSize A default tile size that is used for dimensions
151 static __isl_give isl_schedule_node *
152 applyRegisterTiling(__isl_take isl_schedule_node *Node,
153 llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
155 /// Apply the BLIS matmul optimization pattern.
157 /// Make the loops containing the matrix multiplication be the innermost
158 /// loops and apply the BLIS matmul optimization pattern. BLIS implements
159 /// gemm as three nested loops around a macro-kernel, plus two packing
160 /// routines. The macro-kernel is implemented in terms of two additional
161 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
162 /// (i.e., outer product) update.
164 /// For a detailed description please see [1].
166 /// The order of the loops defines the data reused in the BLIS implementation
167 /// of gemm ([1]). In particular, elements of the matrix B, the second
168 /// operand of matrix multiplication, are reused between iterations of the
169 /// innermost loop. To keep the reused data in cache, only elements of matrix
170 /// A, the first operand of matrix multiplication, should be evicted during
171 /// an iteration of the innermost loop. To provide such a cache replacement
172 /// policy, elements of the matrix A can, in particular, be loaded first and,
173 /// consequently, be least-recently-used.
175 /// In our case matrices are stored in row-major order instead of
176 /// column-major order used in the BLIS implementation ([1]). It affects only
177 /// on the form of the BLIS micro kernel and the computation of its
178 /// parameters. In particular, reused elements of the matrix B are
179 /// successively multiplied by specific elements of the matrix A.
181 /// Refs.:
182 /// [1] - Analytical Modeling is Enough for High Performance BLIS
183 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
184 /// Technical Report, 2014
185 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
187 /// @see ScheduleTreeOptimizer::createMicroKernel
188 /// @see ScheduleTreeOptimizer::createMacroKernel
189 /// @see getMicroKernelParams
190 /// @see getMacroKernelParams
192 /// TODO: Implement the packing transformation.
194 /// @param Node The node that contains a band to be optimized. The node
195 /// is required to successfully pass
196 /// ScheduleTreeOptimizer::isMatrMultPattern.
197 /// @param TTI Target Transform Info.
198 /// @param MMI Parameters of the matrix multiplication operands.
199 /// @returns The transformed schedule.
200 static __isl_give isl_schedule_node *
201 optimizeMatMulPattern(__isl_take isl_schedule_node *Node,
202 const llvm::TargetTransformInfo *TTI,
203 polly::MatMulInfoTy &MMI);
205 /// Check if this node is a band node we want to tile.
207 /// We look for innermost band nodes where individual dimensions are marked as
208 /// permutable.
210 /// @param Node The node to check.
211 static bool isTileableBandNode(__isl_keep isl_schedule_node *Node);
213 /// Pre-vectorizes one scheduling dimension of a schedule band.
215 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
216 /// sinks the resulting point loop.
218 /// Example (DimToVectorize=0, VectorWidth=4):
220 /// | Before transformation:
221 /// |
222 /// | A[i,j] -> [i,j]
223 /// |
224 /// | for (i = 0; i < 128; i++)
225 /// | for (j = 0; j < 128; j++)
226 /// | A(i,j);
228 /// | After transformation:
229 /// |
230 /// | for (it = 0; it < 32; it+=1)
231 /// | for (j = 0; j < 128; j++)
232 /// | for (ip = 0; ip <= 3; ip++)
233 /// | A(4 * it + ip,j);
235 /// The goal of this transformation is to create a trivially vectorizable
236 /// loop. This means a parallel loop at the innermost level that has a
237 /// constant number of iterations corresponding to the target vector width.
239 /// This transformation creates a loop at the innermost level. The loop has
240 /// a constant number of iterations, if the number of loop iterations at
241 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
242 /// currently constant and not yet target specific. This function does not
243 /// reason about parallelism.
244 static __isl_give isl_schedule_node *
245 prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
246 int VectorWidth);
248 /// Apply additional optimizations on the bands in the schedule tree.
250 /// We are looking for an innermost band node and apply the following
251 /// transformations:
253 /// - Tile the band
254 /// - if the band is tileable
255 /// - if the band has more than one loop dimension
257 /// - Prevectorize the schedule of the band (or the point loop in case of
258 /// tiling).
259 /// - if vectorization is enabled
261 /// @param Node The schedule node to (possibly) optimize.
262 /// @param User A pointer to forward some use information
263 /// (currently unused).
264 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
266 /// Apply additional optimizations on the bands in the schedule tree.
268 /// We apply the following
269 /// transformations:
271 /// - Tile the band
272 /// - Prevectorize the schedule of the band (or the point loop in case of
273 /// tiling).
274 /// - if vectorization is enabled
276 /// @param Node The schedule node to (possibly) optimize.
277 /// @param User A pointer to forward some use information
278 /// (currently unused).
279 static isl_schedule_node *standardBandOpts(__isl_take isl_schedule_node *Node,
280 void *User);
282 /// Check if this node contains a partial schedule that could
283 /// probably be optimized with analytical modeling.
285 /// isMatrMultPattern tries to determine whether the following conditions
286 /// are true:
287 /// 1. the partial schedule contains only one statement.
288 /// 2. there are exactly three input dimensions.
289 /// 3. all memory accesses of the statement will have stride 0 or 1, if we
290 /// interchange loops (switch the variable used in the inner loop to
291 /// the outer loop).
292 /// 4. all memory accesses of the statement except from the last one, are
293 /// read memory access and the last one is write memory access.
294 /// 5. all subscripts of the last memory access of the statement don't
295 /// contain the variable used in the inner loop.
296 /// If this is the case, we could try to use an approach that is similar to
297 /// the one used to get close-to-peak performance of matrix multiplications.
299 /// @param Node The node to check.
300 /// @param D The SCoP dependencies.
301 /// @param MMI Parameters of the matrix multiplication operands.
302 static bool isMatrMultPattern(__isl_keep isl_schedule_node *Node,
303 const polly::Dependences *D,
304 polly::MatMulInfoTy &MMI);
306 /// Create the BLIS macro-kernel.
308 /// We create the BLIS macro-kernel by applying a combination of tiling
309 /// of dimensions of the band node and interchanging of two innermost
310 /// modified dimensions. The values of of MacroKernelParams's fields are used
311 /// as tile sizes.
313 /// @param Node The schedule node to be modified.
314 /// @param MacroKernelParams Parameters of the macro kernel
315 /// to be used as tile sizes.
316 static __isl_give isl_schedule_node *
317 createMacroKernel(__isl_take isl_schedule_node *Node,
318 MacroKernelParamsTy MacroKernelParams);
320 /// Create the BLIS macro-kernel.
322 /// We create the BLIS macro-kernel by applying a combination of tiling
323 /// of dimensions of the band node and interchanging of two innermost
324 /// modified dimensions. The values passed in MicroKernelParam are used
325 /// as tile sizes.
327 /// @param Node The schedule node to be modified.
328 /// @param MicroKernelParams Parameters of the micro kernel
329 /// to be used as tile sizes.
330 /// @see MicroKernelParamsTy
331 static __isl_give isl_schedule_node *
332 createMicroKernel(__isl_take isl_schedule_node *Node,
333 MicroKernelParamsTy MicroKernelParams);
336 #endif