From d8bb18c4ed6f8559d71c6457ffad4c75c71dd8da Mon Sep 17 00:00:00 2001 From: Roman Gareev Date: Wed, 21 Dec 2016 12:51:12 +0000 Subject: [PATCH] Change the determination of parameters of macro-kernel Typically processor architectures do not include an L3 cache, which means that Nc, the parameter of the macro-kernel, is, for all practical purposes, redundant ([1]). However, small values of Nc can cause redundant packing of the same elements of the matrix A, the first operand of the matrix multiplication. At the same time, large values of Nc can cause segmentation faults when the available stack is exceeded. This patch adds an option to specify the parameter Nc as a multiple of the parameter of the micro-kernel Nr. On an Intel Core i7-3820 (Sandy Bridge) with the following options, clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME -march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true -DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8 -mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm -polly-target-latency-vector-fma=8 it improves the performance from 11.303 GFlops/sec (39.247% of theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak). 
Refs.: [1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf Reviewed-by: Tobias Grosser Differential Revision: https://reviews.llvm.org/D28019 git-svn-id: https://llvm.org/svn/llvm-project/polly/trunk@290256 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transform/ScheduleOptimizer.cpp | 11 ++- .../mat_mul_pattern_data_layout.ll | 6 +- .../mat_mul_pattern_data_layout_2.ll | 103 ++++++++++----------- .../pattern-matching-based-opts_3.ll | 91 +++++++++--------- 4 files changed, 109 insertions(+), 102 deletions(-) diff --git a/lib/Transform/ScheduleOptimizer.cpp b/lib/Transform/ScheduleOptimizer.cpp index ae7d7331..54e3d8d0 100644 --- a/lib/Transform/ScheduleOptimizer.cpp +++ b/lib/Transform/ScheduleOptimizer.cpp @@ -185,6 +185,12 @@ static cl::opt RegisterDefaultTileSize( " --polly-register-tile-sizes)"), cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt PollyPatternMatchingNcQuotient( + "polly-pattern-matching-nc-quotient", + cl::desc("Quotient that is obtained by dividing Nc, the parameter of the" + "macro-kernel, by Nr, the parameter of the micro-kernel"), + cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory)); + static cl::list RegisterTileSizes("polly-register-tile-sizes", cl::desc("A tile size for each loop dimension, filled " @@ -610,6 +616,9 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) { CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 && CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2)) return {1, 1, 1}; + // The quotient should be greater than zero. 
+ if (PollyPatternMatchingNcQuotient <= 0) + return {1, 1, 1}; int Car = floor( (CacheLevelAssociativity[0] - 1) / (1 + static_cast(MicroKernelParams.Nr) / MicroKernelParams.Mr)); @@ -618,7 +627,7 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) { double Cac = static_cast(Kc * 8 * CacheLevelAssociativity[1]) / CacheLevelSizes[1]; int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac); - int Nc = floor(1 / Cac); + int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr; return {Mc, Nc, Kc}; } diff --git a/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll index e7a1dc82..4f723c59 100644 --- a/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll +++ b/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll @@ -9,14 +9,14 @@ ; C[i][j] += alpha * A[i][k] * B[k][j]; ; } ; -; CHECK: double Packed_B[ { [] -> [(2)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ]; // Element size 8 +; CHECK: double Packed_B[ { [] -> [(256)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ]; ; CHECK-NEXT: double Packed_A[ { [] -> [(24)] } ][ { [] -> [(256)] } ][ { [] -> [(4)] } ]; // Element size 8 ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] }; ; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 4*floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0 }; ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; -; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 }; +; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 2048*floor((i1)/2048) <= i1 - 8o0 
}; ; ; CHECK: CopyStmt_0 ; CHECK-NEXT: Domain := @@ -25,7 +25,7 @@ ; CHECK-NEXT: ; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; -; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 }; +; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 2048*floor((i1)/2048) <= i1 - 8o0 }; ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; ; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; diff --git a/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll index 9ff8da19..73d48e30 100644 --- a/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ b/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -20,60 +20,59 @@ ; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); ; CHECK-NEXT: } ; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) -; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { -; CHECK-NEXT: for (int c3 = 16 * c0; c3 <= 16 * c0 + 15; c3 += 1) -; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1) -; CHECK-NEXT: CopyStmt_0(0, c3, c4); -; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { -; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) -; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1) -; CHECK-NEXT: CopyStmt_1(c3, 0, c5); -; CHECK-NEXT: // 1st level tiling - Points -; CHECK-NEXT: // Register tiling - Tiles -; CHECK-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) -; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 
1) { -; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points -; CHECK-NEXT: { -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * 
c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: } +; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { +; CHECK-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1) +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) +; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); 
+; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 
* c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); ; CHECK-NEXT: } -; CHECK-NEXT: } +; CHECK-NEXT: } ; CHECK-NEXT: } +; CHECK-NEXT: } ; CHECK-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll index f2117eba..f5b99eee 100644 --- a/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ b/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -73,53 +73,52 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 
16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 
8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 2, 256 
* c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); 
+; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } -; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -- 2.11.4.GIT