Remove debug metadata from copied instruction to prevent GPUModule verification failure
[polly-mirror.git] / lib/CodeGen/BlockGenerators.cpp
blob 55e5ed09e4abb8eb2431a3d14e13182741088385
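The fix: when an instruction is cloned into a different llvm::Module than the one it came from (for example the module Polly builds for the GPU), copyInstScalar now drops the cloned instruction's debug location. Otherwise the attached debug metadata would pull a DICompileUnit into a module whose llvm.dbg.cu does not list it, and module verification (and the subsequent ASM string generation) would fail. In essence, the added hunk is the following (excerpted from BlockGenerator::copyInstScalar below; the surrounding lines are unchanged):

    // Clear the debug location on instructions copied into another module,
    // e.g. the GPU module, so no foreign DICompileUnit is dragged along.
    if (NewInst->getModule() != Inst->getModule())
      NewInst->setDebugLoc(llvm::DebugLoc());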
1 //===--- BlockGenerators.cpp - Generate code for statements -----*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the BlockGenerator and VectorBlockGenerator classes,
11 // which generate sequential code and vectorized code for a polyhedral
12 // statement, respectively.
14 //===----------------------------------------------------------------------===//
16 #include "polly/CodeGen/BlockGenerators.h"
17 #include "polly/CodeGen/CodeGeneration.h"
18 #include "polly/CodeGen/IslExprBuilder.h"
19 #include "polly/CodeGen/RuntimeDebugBuilder.h"
20 #include "polly/Options.h"
21 #include "polly/ScopInfo.h"
22 #include "polly/Support/GICHelper.h"
23 #include "polly/Support/SCEVValidator.h"
24 #include "polly/Support/ScopHelper.h"
25 #include "polly/Support/VirtualInstruction.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/Analysis/RegionInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/IR/IntrinsicInst.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
32 #include "llvm/Transforms/Utils/Local.h"
33 #include "isl/aff.h"
34 #include "isl/ast.h"
35 #include "isl/ast_build.h"
36 #include "isl/set.h"
37 #include <deque>
39 using namespace llvm;
40 using namespace polly;
42 static cl::opt<bool> Aligned("enable-polly-aligned",
43 cl::desc("Assumed aligned memory accesses."),
44 cl::Hidden, cl::init(false), cl::ZeroOrMore,
45 cl::cat(PollyCategory));
47 bool PollyDebugPrinting;
48 static cl::opt<bool, true> DebugPrintingX(
49 "polly-codegen-add-debug-printing",
50 cl::desc("Add printf calls that show the values loaded/stored."),
51 cl::location(PollyDebugPrinting), cl::Hidden, cl::init(false),
52 cl::ZeroOrMore, cl::cat(PollyCategory));
54 BlockGenerator::BlockGenerator(
55 PollyIRBuilder &B, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT,
56 AllocaMapTy &ScalarMap, EscapeUsersAllocaMapTy &EscapeMap,
57 ValueMapT &GlobalMap, IslExprBuilder *ExprBuilder, BasicBlock *StartBlock)
58 : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT),
59 EntryBB(nullptr), ScalarMap(ScalarMap), EscapeMap(EscapeMap),
60 GlobalMap(GlobalMap), StartBlock(StartBlock) {}
62 Value *BlockGenerator::trySynthesizeNewValue(ScopStmt &Stmt, Value *Old,
63 ValueMapT &BBMap,
64 LoopToScevMapT &LTS,
65 Loop *L) const {
66 if (!SE.isSCEVable(Old->getType()))
67 return nullptr;
69 const SCEV *Scev = SE.getSCEVAtScope(Old, L);
70 if (!Scev)
71 return nullptr;
73 if (isa<SCEVCouldNotCompute>(Scev))
74 return nullptr;
76 const SCEV *NewScev = SCEVLoopAddRecRewriter::rewrite(Scev, LTS, SE);
77 ValueMapT VTV;
78 VTV.insert(BBMap.begin(), BBMap.end());
79 VTV.insert(GlobalMap.begin(), GlobalMap.end());
81 Scop &S = *Stmt.getParent();
82 const DataLayout &DL = S.getFunction().getParent()->getDataLayout();
83 auto IP = Builder.GetInsertPoint();
85 assert(IP != Builder.GetInsertBlock()->end() &&
86 "Only instructions can be insert points for SCEVExpander");
87 Value *Expanded =
88 expandCodeFor(S, SE, DL, "polly", NewScev, Old->getType(), &*IP, &VTV,
89 StartBlock->getSinglePredecessor());
91 BBMap[Old] = Expanded;
92 return Expanded;
95 Value *BlockGenerator::getNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap,
96 LoopToScevMapT &LTS, Loop *L) const {
98 auto lookupGlobally = [this](Value *Old) -> Value * {
99 Value *New = GlobalMap.lookup(Old);
100 if (!New)
101 return nullptr;
103 // Required by:
104 // * Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
105 // * Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
106 // * Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
107 // * Isl/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
108 // * Isl/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
109 // * Isl/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
110 // GlobalMap should be a mapping from (value in original SCoP) to (copied
111 // value in generated SCoP), without intermediate mappings, which might
112 // easily require transitiveness as well.
113 if (Value *NewRemapped = GlobalMap.lookup(New))
114 New = NewRemapped;
116 // No test case for this code.
117 if (Old->getType()->getScalarSizeInBits() <
118 New->getType()->getScalarSizeInBits())
119 New = Builder.CreateTruncOrBitCast(New, Old->getType());
121 return New;
124 Value *New = nullptr;
125 auto VUse = VirtualUse::create(&Stmt, L, Old, true);
126 switch (VUse.getKind()) {
127 case VirtualUse::Block:
128 // BasicBlocks are constants, but the BlockGenerator copies them.
129 New = BBMap.lookup(Old);
130 break;
132 case VirtualUse::Constant:
133 // Used by:
134 // * Isl/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
135 // Constants should not be redefined. In this case, the GlobalMap just
136 // contains a mapping to the same constant, which is unnecessary, but
137 // harmless.
138 if ((New = lookupGlobally(Old)))
139 break;
141 assert(!BBMap.count(Old));
142 New = Old;
143 break;
145 case VirtualUse::ReadOnly:
146 assert(!GlobalMap.count(Old));
148 // Required for:
149 // * Isl/CodeGen/MemAccess/create_arrays.ll
150 // * Isl/CodeGen/read-only-scalars.ll
151 // * ScheduleOptimizer/pattern-matching-based-opts_10.ll
152 // For some reason these reload a read-only value. The reloaded value ends
153 // up in BBMap, but its value should be identical.
155 // Required for:
156 // * Isl/CodeGen/OpenMP/single_loop_with_param.ll
157 // The parallel subfunctions need to reference the read-only value from the
158 // parent function; this is done by reloading them locally.
159 if ((New = BBMap.lookup(Old)))
160 break;
162 New = Old;
163 break;
165 case VirtualUse::Synthesizable:
166 // Used by:
167 // * Isl/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
168 // * Isl/CodeGen/OpenMP/recomputed-srem.ll
169 // * Isl/CodeGen/OpenMP/reference-other-bb.ll
170 // * Isl/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
171 // For some reason synthesizable values end up in GlobalMap. Their values
172 // are the same as trySynthesizeNewValue would return. The legacy
173 // implementation prioritized GlobalMap, so this is what we do here as well.
174 // Ideally, synthesizable values should not end up in GlobalMap.
175 if ((New = lookupGlobally(Old)))
176 break;
178 // Required for:
179 // * Isl/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
180 // * Isl/CodeGen/getNumberOfIterations.ll
181 // * Isl/CodeGen/non_affine_float_compare.ll
182 // * ScheduleOptimizer/pattern-matching-based-opts_10.ll
183 // Ideally, synthesizable values are synthesized by trySynthesizeNewValue,
184 // not precomputed (SCEVExpander has its own caching mechanism).
185 // These tests fail without this, but I think trySynthesizeNewValue would
186 // just re-synthesize the same instructions.
187 if ((New = BBMap.lookup(Old)))
188 break;
190 New = trySynthesizeNewValue(Stmt, Old, BBMap, LTS, L);
191 break;
193 case VirtualUse::Hoisted:
194 // TODO: Hoisted invariant loads should be found in GlobalMap only, but not
195 // redefined locally (which will be ignored anyway). That is, the following
196 // assertion should apply: assert(!BBMap.count(Old))
198 New = lookupGlobally(Old);
199 break;
201 case VirtualUse::Intra:
202 case VirtualUse::Inter:
203 assert(!GlobalMap.count(Old) &&
204 "Intra and inter-stmt values are never global");
205 New = BBMap.lookup(Old);
206 break;
208 assert(New && "Unexpected scalar dependence in region!");
209 return New;
212 void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst,
213 ValueMapT &BBMap, LoopToScevMapT &LTS) {
214 // We do not generate debug intrinsics as we did not investigate how to
215 // copy them correctly. In the current state, they just crash the code
216 // generation as the metadata operands are not correctly copied.
217 if (isa<DbgInfoIntrinsic>(Inst))
218 return;
220 Instruction *NewInst = Inst->clone();
222 // Replace old operands with the new ones.
223 for (Value *OldOperand : Inst->operands()) {
224 Value *NewOperand =
225 getNewValue(Stmt, OldOperand, BBMap, LTS, getLoopForStmt(Stmt));
227 if (!NewOperand) {
228 assert(!isa<StoreInst>(NewInst) &&
229 "Store instructions are always needed!");
230 NewInst->deleteValue();
231 return;
234 NewInst->replaceUsesOfWith(OldOperand, NewOperand);
238 Builder.Insert(NewInst);
239 BBMap[Inst] = NewInst;
241 // When copying the instruction onto the Module meant for the GPU,
242 // debug metadata attached to an instruction causes all related
243 // metadata to be pulled into the Module. This includes the DICompileUnit,
244 // which will not be listed in llvm.dbg.cu of the Module since the Module
245 // doesn't contain one. This fails the verification of the Module and the
246 // subsequent generation of the ASM string.
247 if (NewInst->getModule() != Inst->getModule())
248 NewInst->setDebugLoc(llvm::DebugLoc());
250 if (!NewInst->getType()->isVoidTy())
251 NewInst->setName("p_" + Inst->getName());
254 Value *
255 BlockGenerator::generateLocationAccessed(ScopStmt &Stmt, MemAccInst Inst,
256 ValueMapT &BBMap, LoopToScevMapT &LTS,
257 isl_id_to_ast_expr *NewAccesses) {
258 const MemoryAccess &MA = Stmt.getArrayAccessFor(Inst);
259 return generateLocationAccessed(
260 Stmt, getLoopForStmt(Stmt),
261 Inst.isNull() ? nullptr : Inst.getPointerOperand(), BBMap, LTS,
262 NewAccesses, MA.getId().release(), MA.getAccessValue()->getType());
265 Value *BlockGenerator::generateLocationAccessed(
266 ScopStmt &Stmt, Loop *L, Value *Pointer, ValueMapT &BBMap,
267 LoopToScevMapT &LTS, isl_id_to_ast_expr *NewAccesses, __isl_take isl_id *Id,
268 Type *ExpectedType) {
269 isl_ast_expr *AccessExpr = isl_id_to_ast_expr_get(NewAccesses, Id);
271 if (AccessExpr) {
272 AccessExpr = isl_ast_expr_address_of(AccessExpr);
273 auto Address = ExprBuilder->create(AccessExpr);
275 // Cast the address of this memory access to a pointer type that has the
276 // same element type as the original access, but uses the address space of
277 // the newly generated pointer.
278 auto OldPtrTy = ExpectedType->getPointerTo();
279 auto NewPtrTy = Address->getType();
280 OldPtrTy = PointerType::get(OldPtrTy->getElementType(),
281 NewPtrTy->getPointerAddressSpace());
283 if (OldPtrTy != NewPtrTy)
284 Address = Builder.CreateBitOrPointerCast(Address, OldPtrTy);
285 return Address;
287 assert(
288 Pointer &&
289 "If expression was not generated, must use the original pointer value");
290 return getNewValue(Stmt, Pointer, BBMap, LTS, L);
293 Value *
294 BlockGenerator::getImplicitAddress(MemoryAccess &Access, Loop *L,
295 LoopToScevMapT &LTS, ValueMapT &BBMap,
296 __isl_keep isl_id_to_ast_expr *NewAccesses) {
297 if (Access.isLatestArrayKind())
298 return generateLocationAccessed(*Access.getStatement(), L, nullptr, BBMap,
299 LTS, NewAccesses, Access.getId().release(),
300 Access.getAccessValue()->getType());
302 return getOrCreateAlloca(Access);
305 Loop *BlockGenerator::getLoopForStmt(const ScopStmt &Stmt) const {
306 auto *StmtBB = Stmt.getEntryBlock();
307 return LI.getLoopFor(StmtBB);
310 Value *BlockGenerator::generateArrayLoad(ScopStmt &Stmt, LoadInst *Load,
311 ValueMapT &BBMap, LoopToScevMapT &LTS,
312 isl_id_to_ast_expr *NewAccesses) {
313 if (Value *PreloadLoad = GlobalMap.lookup(Load))
314 return PreloadLoad;
316 Value *NewPointer =
317 generateLocationAccessed(Stmt, Load, BBMap, LTS, NewAccesses);
318 Value *ScalarLoad = Builder.CreateAlignedLoad(
319 NewPointer, Load->getAlignment(), Load->getName() + "_p_scalar_");
321 if (PollyDebugPrinting)
322 RuntimeDebugBuilder::createCPUPrinter(Builder, "Load from ", NewPointer,
323 ": ", ScalarLoad, "\n");
325 return ScalarLoad;
328 void BlockGenerator::generateArrayStore(ScopStmt &Stmt, StoreInst *Store,
329 ValueMapT &BBMap, LoopToScevMapT &LTS,
330 isl_id_to_ast_expr *NewAccesses) {
331 MemoryAccess &MA = Stmt.getArrayAccessFor(Store);
332 isl::set AccDom = MA.getAccessRelation().domain();
333 std::string Subject = MA.getId().get_name();
335 generateConditionalExecution(Stmt, AccDom, Subject.c_str(), [&, this]() {
336 Value *NewPointer =
337 generateLocationAccessed(Stmt, Store, BBMap, LTS, NewAccesses);
338 Value *ValueOperand = getNewValue(Stmt, Store->getValueOperand(), BBMap,
339 LTS, getLoopForStmt(Stmt));
341 if (PollyDebugPrinting)
342 RuntimeDebugBuilder::createCPUPrinter(Builder, "Store to ", NewPointer,
343 ": ", ValueOperand, "\n");
345 Builder.CreateAlignedStore(ValueOperand, NewPointer, Store->getAlignment());
349 bool BlockGenerator::canSyntheziseInStmt(ScopStmt &Stmt, Instruction *Inst) {
350 Loop *L = getLoopForStmt(Stmt);
351 return (Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) &&
352 canSynthesize(Inst, *Stmt.getParent(), &SE, L);
355 void BlockGenerator::copyInstruction(ScopStmt &Stmt, Instruction *Inst,
356 ValueMapT &BBMap, LoopToScevMapT &LTS,
357 isl_id_to_ast_expr *NewAccesses) {
358 // Terminator instructions control the control flow. They are explicitly
359 // expressed in the clast and do not need to be copied.
360 if (Inst->isTerminator())
361 return;
363 // Synthesizable statements will be generated on-demand.
364 if (canSyntheziseInStmt(Stmt, Inst))
365 return;
367 if (auto *Load = dyn_cast<LoadInst>(Inst)) {
368 Value *NewLoad = generateArrayLoad(Stmt, Load, BBMap, LTS, NewAccesses);
369 // Compute NewLoad before its insertion in BBMap to make the insertion
370 // deterministic.
371 BBMap[Load] = NewLoad;
372 return;
375 if (auto *Store = dyn_cast<StoreInst>(Inst)) {
376 // Identified as redundant by -polly-simplify.
377 if (!Stmt.getArrayAccessOrNULLFor(Store))
378 return;
380 generateArrayStore(Stmt, Store, BBMap, LTS, NewAccesses);
381 return;
384 if (auto *PHI = dyn_cast<PHINode>(Inst)) {
385 copyPHIInstruction(Stmt, PHI, BBMap, LTS);
386 return;
389 // Skip some special intrinsics for which we do not adjust the semantics to
390 // the new schedule. All others are handled like every other instruction.
391 if (isIgnoredIntrinsic(Inst))
392 return;
394 copyInstScalar(Stmt, Inst, BBMap, LTS);
397 void BlockGenerator::removeDeadInstructions(BasicBlock *BB, ValueMapT &BBMap) {
398 auto NewBB = Builder.GetInsertBlock();
399 for (auto I = NewBB->rbegin(); I != NewBB->rend(); I++) {
400 Instruction *NewInst = &*I;
402 if (!isInstructionTriviallyDead(NewInst))
403 continue;
405 for (auto Pair : BBMap)
406 if (Pair.second == NewInst) {
407 BBMap.erase(Pair.first);
410 NewInst->eraseFromParent();
411 I = NewBB->rbegin();
415 void BlockGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
416 isl_id_to_ast_expr *NewAccesses) {
417 assert(Stmt.isBlockStmt() &&
418 "Only block statements can be copied by the block generator");
420 ValueMapT BBMap;
422 BasicBlock *BB = Stmt.getBasicBlock();
423 copyBB(Stmt, BB, BBMap, LTS, NewAccesses);
424 removeDeadInstructions(BB, BBMap);
427 BasicBlock *BlockGenerator::splitBB(BasicBlock *BB) {
428 BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
429 &*Builder.GetInsertPoint(), &DT, &LI);
430 CopyBB->setName("polly.stmt." + BB->getName());
431 return CopyBB;
434 BasicBlock *BlockGenerator::copyBB(ScopStmt &Stmt, BasicBlock *BB,
435 ValueMapT &BBMap, LoopToScevMapT &LTS,
436 isl_id_to_ast_expr *NewAccesses) {
437 BasicBlock *CopyBB = splitBB(BB);
438 Builder.SetInsertPoint(&CopyBB->front());
439 generateScalarLoads(Stmt, LTS, BBMap, NewAccesses);
441 copyBB(Stmt, BB, CopyBB, BBMap, LTS, NewAccesses);
443 // After a basic block was copied store all scalars that escape this block in
444 // their alloca.
445 generateScalarStores(Stmt, LTS, BBMap, NewAccesses);
446 return CopyBB;
449 void BlockGenerator::copyBB(ScopStmt &Stmt, BasicBlock *BB, BasicBlock *CopyBB,
450 ValueMapT &BBMap, LoopToScevMapT &LTS,
451 isl_id_to_ast_expr *NewAccesses) {
452 EntryBB = &CopyBB->getParent()->getEntryBlock();
454 if (Stmt.isBlockStmt())
455 for (Instruction *Inst : Stmt.getInstructions())
456 copyInstruction(Stmt, Inst, BBMap, LTS, NewAccesses);
457 else
458 for (Instruction &Inst : *BB)
459 copyInstruction(Stmt, &Inst, BBMap, LTS, NewAccesses);
462 Value *BlockGenerator::getOrCreateAlloca(const MemoryAccess &Access) {
463 assert(!Access.isLatestArrayKind() && "Trying to get alloca for array kind");
465 return getOrCreateAlloca(Access.getLatestScopArrayInfo());
468 Value *BlockGenerator::getOrCreateAlloca(const ScopArrayInfo *Array) {
469 assert(!Array->isArrayKind() && "Trying to get alloca for array kind");
471 auto &Addr = ScalarMap[Array];
473 if (Addr) {
474 // Allow allocas to be (temporarily) redirected once by adding a new
475 // old-alloca-addr to new-addr mapping to GlobalMap. This functionality
476 // is used for example by the OpenMP code generation where a first use
477 // of a scalar while still in the host code allocates a normal alloca with
478 // getOrCreateAlloca. When the values of this scalar are accessed during
479 // the generation of the parallel subfunction, these values are copied over
480 // to the parallel subfunction and each request for a scalar alloca slot
481 // must be forwarded to the temporary in-subfunction slot. This mapping is
482 // removed when the subfunction has been generated and again normal host
483 // code is generated. Due to the following reasons it is not possible to
484 // perform the GlobalMap lookup right after creating the alloca below, but
485 // instead we need to check GlobalMap at each call to getOrCreateAlloca:
487 // 1) GlobalMap may be changed multiple times (for each parallel loop),
488 // 2) The temporary mapping is commonly only known after the initial
489 // alloca has already been generated, and
490 // 3) The original alloca value must be restored after leaving the
491 // sub-function.
492 if (Value *NewAddr = GlobalMap.lookup(&*Addr))
493 return NewAddr;
494 return Addr;
497 Type *Ty = Array->getElementType();
498 Value *ScalarBase = Array->getBasePtr();
499 std::string NameExt;
500 if (Array->isPHIKind())
501 NameExt = ".phiops";
502 else
503 NameExt = ".s2a";
505 const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
507 Addr = new AllocaInst(Ty, DL.getAllocaAddrSpace(),
508 ScalarBase->getName() + NameExt);
509 EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
510 Addr->insertBefore(&*EntryBB->getFirstInsertionPt());
512 return Addr;
515 void BlockGenerator::handleOutsideUsers(const Scop &S, ScopArrayInfo *Array) {
516 Instruction *Inst = cast<Instruction>(Array->getBasePtr());
518 // If there are escape users we get the alloca for this instruction and put it
519 // in the EscapeMap for later finalization. Lastly, if the instruction was
520 // copied multiple times we already did this and can exit.
521 if (EscapeMap.count(Inst))
522 return;
524 EscapeUserVectorTy EscapeUsers;
525 for (User *U : Inst->users()) {
528 // A non-instruction user will never escape.
528 Instruction *UI = dyn_cast<Instruction>(U);
529 if (!UI)
530 continue;
532 if (S.contains(UI))
533 continue;
535 EscapeUsers.push_back(UI);
538 // Exit if no escape uses were found.
539 if (EscapeUsers.empty())
540 return;
542 // Get or create an escape alloca for this instruction.
543 auto *ScalarAddr = getOrCreateAlloca(Array);
545 // Remember that this instruction has escape uses and the escape alloca.
546 EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers));
549 void BlockGenerator::generateScalarLoads(
550 ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
551 __isl_keep isl_id_to_ast_expr *NewAccesses) {
552 for (MemoryAccess *MA : Stmt) {
553 if (MA->isOriginalArrayKind() || MA->isWrite())
554 continue;
556 #ifndef NDEBUG
557 auto *StmtDom = Stmt.getDomain();
558 auto *AccDom = isl_map_domain(MA->getAccessRelation().release());
559 assert(isl_set_is_subset(StmtDom, AccDom) &&
560 "Scalar must be loaded in all statement instances");
561 isl_set_free(StmtDom);
562 isl_set_free(AccDom);
563 #endif
565 auto *Address =
566 getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS, BBMap, NewAccesses);
567 assert((!isa<Instruction>(Address) ||
568 DT.dominates(cast<Instruction>(Address)->getParent(),
569 Builder.GetInsertBlock())) &&
570 "Domination violation");
571 BBMap[MA->getAccessValue()] =
572 Builder.CreateLoad(Address, Address->getName() + ".reload");
576 Value *BlockGenerator::buildContainsCondition(ScopStmt &Stmt,
577 const isl::set &Subdomain) {
578 isl::ast_build AstBuild = give(isl_ast_build_copy(Stmt.getAstBuild()));
579 isl::set Domain = give(Stmt.getDomain());
581 isl::union_map USchedule = AstBuild.get_schedule();
582 USchedule = USchedule.intersect_domain(Domain);
584 assert(!USchedule.is_empty());
585 isl::map Schedule = isl::map::from_union_map(USchedule);
587 isl::set ScheduledDomain = Schedule.range();
588 isl::set ScheduledSet = Subdomain.apply(Schedule);
590 isl::ast_build RestrictedBuild = AstBuild.restrict(ScheduledDomain);
592 isl::ast_expr IsInSet = RestrictedBuild.expr_from(ScheduledSet);
593 Value *IsInSetExpr = ExprBuilder->create(IsInSet.copy());
594 IsInSetExpr = Builder.CreateICmpNE(
595 IsInSetExpr, ConstantInt::get(IsInSetExpr->getType(), 0));
597 return IsInSetExpr;
600 void BlockGenerator::generateConditionalExecution(
601 ScopStmt &Stmt, const isl::set &Subdomain, StringRef Subject,
602 const std::function<void()> &GenThenFunc) {
603 isl::set StmtDom = give(Stmt.getDomain());
605 // Don't call GenThenFunc if it is never executed. An ast index expression
606 // might not be defined in this case.
607 if (Subdomain.is_empty())
608 return;
610 // If the condition is a tautology, don't generate a condition around the
611 // code.
612 bool IsPartialWrite =
613 !StmtDom.intersect_params(give(Stmt.getParent()->getContext()))
614 .is_subset(Subdomain);
615 if (!IsPartialWrite) {
616 GenThenFunc();
617 return;
620 // Generate the condition.
621 Value *Cond = buildContainsCondition(Stmt, Subdomain);
622 BasicBlock *HeadBlock = Builder.GetInsertBlock();
623 StringRef BlockName = HeadBlock->getName();
625 // Generate the conditional block.
626 SplitBlockAndInsertIfThen(Cond, &*Builder.GetInsertPoint(), false, nullptr,
627 &DT, &LI);
628 BranchInst *Branch = cast<BranchInst>(HeadBlock->getTerminator());
629 BasicBlock *ThenBlock = Branch->getSuccessor(0);
630 BasicBlock *TailBlock = Branch->getSuccessor(1);
632 // Assign descriptive names.
633 if (auto *CondInst = dyn_cast<Instruction>(Cond))
634 CondInst->setName("polly." + Subject + ".cond");
635 ThenBlock->setName(BlockName + "." + Subject + ".partial");
636 TailBlock->setName(BlockName + ".cont");
638 // Put the client code into the conditional block and continue in the merge
639 // block afterwards.
640 Builder.SetInsertPoint(ThenBlock, ThenBlock->getFirstInsertionPt());
641 GenThenFunc();
642 Builder.SetInsertPoint(TailBlock, TailBlock->getFirstInsertionPt());
645 void BlockGenerator::generateScalarStores(
646 ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
647 __isl_keep isl_id_to_ast_expr *NewAccesses) {
648 Loop *L = LI.getLoopFor(Stmt.getBasicBlock());
650 assert(Stmt.isBlockStmt() &&
651 "Region statements need to use the generateScalarStores() function in "
652 "the RegionGenerator");
654 for (MemoryAccess *MA : Stmt) {
655 if (MA->isOriginalArrayKind() || MA->isRead())
656 continue;
658 isl::set AccDom = MA->getAccessRelation().domain();
659 std::string Subject = MA->getId().get_name();
661 generateConditionalExecution(
662 Stmt, AccDom, Subject.c_str(), [&, this, MA]() {
663 Value *Val = MA->getAccessValue();
664 if (MA->isAnyPHIKind()) {
665 assert(MA->getIncoming().size() >= 1 &&
666 "Block statements have exactly one exiting block, or "
667 "multiple but "
668 "with same incoming block and value");
669 assert(std::all_of(MA->getIncoming().begin(),
670 MA->getIncoming().end(),
671 [&](std::pair<BasicBlock *, Value *> p) -> bool {
672 return p.first == Stmt.getBasicBlock();
673 }) &&
674 "Incoming block must be statement's block");
675 Val = MA->getIncoming()[0].second;
677 auto Address = getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS,
678 BBMap, NewAccesses);
680 Val = getNewValue(Stmt, Val, BBMap, LTS, L);
681 assert((!isa<Instruction>(Val) ||
682 DT.dominates(cast<Instruction>(Val)->getParent(),
683 Builder.GetInsertBlock())) &&
684 "Domination violation");
685 assert((!isa<Instruction>(Address) ||
686 DT.dominates(cast<Instruction>(Address)->getParent(),
687 Builder.GetInsertBlock())) &&
688 "Domination violation");
689 Builder.CreateStore(Val, Address);
695 void BlockGenerator::createScalarInitialization(Scop &S) {
696 BasicBlock *ExitBB = S.getExit();
697 BasicBlock *PreEntryBB = S.getEnteringBlock();
699 Builder.SetInsertPoint(&*StartBlock->begin());
701 for (auto &Array : S.arrays()) {
702 if (Array->getNumberOfDimensions() != 0)
703 continue;
704 if (Array->isPHIKind()) {
705 // For PHI nodes, the only values we need to store are the ones that
706 // reach the PHI node from outside the region. In general there should
707 // only be one such incoming edge and this edge should enter through
708 // 'PreEntryBB'.
709 auto PHI = cast<PHINode>(Array->getBasePtr());
711 for (auto BI = PHI->block_begin(), BE = PHI->block_end(); BI != BE; BI++)
712 if (!S.contains(*BI) && *BI != PreEntryBB)
713 llvm_unreachable("Incoming edges from outside the scop should always "
714 "come from PreEntryBB");
716 int Idx = PHI->getBasicBlockIndex(PreEntryBB);
717 if (Idx < 0)
718 continue;
720 Value *ScalarValue = PHI->getIncomingValue(Idx);
722 Builder.CreateStore(ScalarValue, getOrCreateAlloca(Array));
723 continue;
726 auto *Inst = dyn_cast<Instruction>(Array->getBasePtr());
728 if (Inst && S.contains(Inst))
729 continue;
731 // PHI nodes that are not marked as such in their SAI object are either exit
732 // PHI nodes we model as common scalars but without initialization, or
733 // incoming phi nodes that need to be initialized. Check if the first is the
734 // case for Inst and do not create and initialize memory if so.
735 if (auto *PHI = dyn_cast_or_null<PHINode>(Inst))
736 if (!S.hasSingleExitEdge() && PHI->getBasicBlockIndex(ExitBB) >= 0)
737 continue;
739 Builder.CreateStore(Array->getBasePtr(), getOrCreateAlloca(Array));
743 void BlockGenerator::createScalarFinalization(Scop &S) {
744 // The exit block of the __unoptimized__ region.
745 BasicBlock *ExitBB = S.getExitingBlock();
746 // The merge block __just after__ the region and the optimized region.
747 BasicBlock *MergeBB = S.getExit();
749 // The exit block of the __optimized__ region.
750 BasicBlock *OptExitBB = *(pred_begin(MergeBB));
751 if (OptExitBB == ExitBB)
752 OptExitBB = *(++pred_begin(MergeBB));
754 Builder.SetInsertPoint(OptExitBB->getTerminator());
755 for (const auto &EscapeMapping : EscapeMap) {
756 // Extract the escaping instruction and the escaping users as well as the
757 // alloca the instruction was demoted to.
758 Instruction *EscapeInst = EscapeMapping.first;
759 const auto &EscapeMappingValue = EscapeMapping.second;
760 const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second;
761 Value *ScalarAddr = EscapeMappingValue.first;
763 // Reload the demoted instruction in the optimized version of the SCoP.
764 Value *EscapeInstReload =
765 Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload");
766 EscapeInstReload =
767 Builder.CreateBitOrPointerCast(EscapeInstReload, EscapeInst->getType());
769 // Create the merge PHI that merges the optimized and unoptimized version.
770 PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2,
771 EscapeInst->getName() + ".merge");
772 MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt());
774 // Add the respective values to the merge PHI.
775 MergePHI->addIncoming(EscapeInstReload, OptExitBB);
776 MergePHI->addIncoming(EscapeInst, ExitBB);
778 // The information of scalar evolution about the escaping instruction needs
779 // to be revoked so the new merged instruction will be used.
780 if (SE.isSCEVable(EscapeInst->getType()))
781 SE.forgetValue(EscapeInst);
783 // Replace all uses of the demoted instruction with the merge PHI.
784 for (Instruction *EUser : EscapeUsers)
785 EUser->replaceUsesOfWith(EscapeInst, MergePHI);
789 void BlockGenerator::findOutsideUsers(Scop &S) {
790 for (auto &Array : S.arrays()) {
792 if (Array->getNumberOfDimensions() != 0)
793 continue;
795 if (Array->isPHIKind())
796 continue;
798 auto *Inst = dyn_cast<Instruction>(Array->getBasePtr());
800 if (!Inst)
801 continue;
803 // Scop invariant hoisting moves some of the base pointers out of the scop.
804 // We can ignore these, as the invariant load hoisting already registers the
805 // relevant outside users.
806 if (!S.contains(Inst))
807 continue;
809 handleOutsideUsers(S, Array);
813 void BlockGenerator::createExitPHINodeMerges(Scop &S) {
814 if (S.hasSingleExitEdge())
815 return;
817 auto *ExitBB = S.getExitingBlock();
818 auto *MergeBB = S.getExit();
819 auto *AfterMergeBB = MergeBB->getSingleSuccessor();
820 BasicBlock *OptExitBB = *(pred_begin(MergeBB));
821 if (OptExitBB == ExitBB)
822 OptExitBB = *(++pred_begin(MergeBB));
824 Builder.SetInsertPoint(OptExitBB->getTerminator());
826 for (auto &SAI : S.arrays()) {
827 auto *Val = SAI->getBasePtr();
829 // Only Value-like scalars need a merge PHI. Exit block PHIs receive either
830 // the original PHI's value or the reloaded incoming values from the
831 // generated code. An llvm::Value is merged between the original code's
832 // value and the generated one.
833 if (!SAI->isExitPHIKind())
834 continue;
836 PHINode *PHI = dyn_cast<PHINode>(Val);
837 if (!PHI)
838 continue;
840 if (PHI->getParent() != AfterMergeBB)
841 continue;
843 std::string Name = PHI->getName();
844 Value *ScalarAddr = getOrCreateAlloca(SAI);
845 Value *Reload = Builder.CreateLoad(ScalarAddr, Name + ".ph.final_reload");
846 Reload = Builder.CreateBitOrPointerCast(Reload, PHI->getType());
847 Value *OriginalValue = PHI->getIncomingValueForBlock(MergeBB);
848 assert((!isa<Instruction>(OriginalValue) ||
849 cast<Instruction>(OriginalValue)->getParent() != MergeBB) &&
850 "Original value must no be one we just generated.");
851 auto *MergePHI = PHINode::Create(PHI->getType(), 2, Name + ".ph.merge");
852 MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt());
853 MergePHI->addIncoming(Reload, OptExitBB);
854 MergePHI->addIncoming(OriginalValue, ExitBB);
855 int Idx = PHI->getBasicBlockIndex(MergeBB);
856 PHI->setIncomingValue(Idx, MergePHI);
860 void BlockGenerator::invalidateScalarEvolution(Scop &S) {
861 for (auto &Stmt : S)
862 if (Stmt.isCopyStmt())
863 continue;
864 else if (Stmt.isBlockStmt())
865 for (auto &Inst : *Stmt.getBasicBlock())
866 SE.forgetValue(&Inst);
867 else if (Stmt.isRegionStmt())
868 for (auto *BB : Stmt.getRegion()->blocks())
869 for (auto &Inst : *BB)
870 SE.forgetValue(&Inst);
871 else
872 llvm_unreachable("Unexpected statement type found");
874 // Invalidate SCEV of loops surrounding the EscapeUsers.
875 for (const auto &EscapeMapping : EscapeMap) {
876 const EscapeUserVectorTy &EscapeUsers = EscapeMapping.second.second;
877 for (Instruction *EUser : EscapeUsers) {
878 if (Loop *L = LI.getLoopFor(EUser->getParent()))
879 while (L) {
880 SE.forgetLoop(L);
881 L = L->getParentLoop();
887 void BlockGenerator::finalizeSCoP(Scop &S) {
888 findOutsideUsers(S);
889 createScalarInitialization(S);
890 createExitPHINodeMerges(S);
891 createScalarFinalization(S);
892 invalidateScalarEvolution(S);
895 VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen,
896 std::vector<LoopToScevMapT> &VLTS,
897 isl_map *Schedule)
898 : BlockGenerator(BlockGen), VLTS(VLTS), Schedule(Schedule) {
899 assert(Schedule && "No statement domain provided");
902 Value *VectorBlockGenerator::getVectorValue(ScopStmt &Stmt, Value *Old,
903 ValueMapT &VectorMap,
904 VectorValueMapT &ScalarMaps,
905 Loop *L) {
906 if (Value *NewValue = VectorMap.lookup(Old))
907 return NewValue;
909 int Width = getVectorWidth();
911 Value *Vector = UndefValue::get(VectorType::get(Old->getType(), Width));
913 for (int Lane = 0; Lane < Width; Lane++)
914 Vector = Builder.CreateInsertElement(
915 Vector, getNewValue(Stmt, Old, ScalarMaps[Lane], VLTS[Lane], L),
916 Builder.getInt32(Lane));
918 VectorMap[Old] = Vector;
920 return Vector;
923 Type *VectorBlockGenerator::getVectorPtrTy(const Value *Val, int Width) {
924 PointerType *PointerTy = dyn_cast<PointerType>(Val->getType());
925 assert(PointerTy && "PointerType expected");
927 Type *ScalarType = PointerTy->getElementType();
928 VectorType *VectorType = VectorType::get(ScalarType, Width);
930 return PointerType::getUnqual(VectorType);
933 Value *VectorBlockGenerator::generateStrideOneLoad(
934 ScopStmt &Stmt, LoadInst *Load, VectorValueMapT &ScalarMaps,
935 __isl_keep isl_id_to_ast_expr *NewAccesses, bool NegativeStride = false) {
936 unsigned VectorWidth = getVectorWidth();
937 auto *Pointer = Load->getPointerOperand();
938 Type *VectorPtrType = getVectorPtrTy(Pointer, VectorWidth);
939 unsigned Offset = NegativeStride ? VectorWidth - 1 : 0;
941 Value *NewPointer = generateLocationAccessed(Stmt, Load, ScalarMaps[Offset],
942 VLTS[Offset], NewAccesses);
943 Value *VectorPtr =
944 Builder.CreateBitCast(NewPointer, VectorPtrType, "vector_ptr");
945 LoadInst *VecLoad =
946 Builder.CreateLoad(VectorPtr, Load->getName() + "_p_vec_full");
947 if (!Aligned)
948 VecLoad->setAlignment(8);
950 if (NegativeStride) {
951 SmallVector<Constant *, 16> Indices;
952 for (int i = VectorWidth - 1; i >= 0; i--)
953 Indices.push_back(ConstantInt::get(Builder.getInt32Ty(), i));
954 Constant *SV = llvm::ConstantVector::get(Indices);
955 Value *RevVecLoad = Builder.CreateShuffleVector(
956 VecLoad, VecLoad, SV, Load->getName() + "_reverse");
957 return RevVecLoad;
960 return VecLoad;
963 Value *VectorBlockGenerator::generateStrideZeroLoad(
964 ScopStmt &Stmt, LoadInst *Load, ValueMapT &BBMap,
965 __isl_keep isl_id_to_ast_expr *NewAccesses) {
966 auto *Pointer = Load->getPointerOperand();
967 Type *VectorPtrType = getVectorPtrTy(Pointer, 1);
968 Value *NewPointer =
969 generateLocationAccessed(Stmt, Load, BBMap, VLTS[0], NewAccesses);
970 Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
971 Load->getName() + "_p_vec_p");
972 LoadInst *ScalarLoad =
973 Builder.CreateLoad(VectorPtr, Load->getName() + "_p_splat_one");
975 if (!Aligned)
976 ScalarLoad->setAlignment(8);
978 Constant *SplatVector = Constant::getNullValue(
979 VectorType::get(Builder.getInt32Ty(), getVectorWidth()));
981 Value *VectorLoad = Builder.CreateShuffleVector(
982 ScalarLoad, ScalarLoad, SplatVector, Load->getName() + "_p_splat");
983 return VectorLoad;
986 Value *VectorBlockGenerator::generateUnknownStrideLoad(
987 ScopStmt &Stmt, LoadInst *Load, VectorValueMapT &ScalarMaps,
988 __isl_keep isl_id_to_ast_expr *NewAccesses) {
989 int VectorWidth = getVectorWidth();
990 auto *Pointer = Load->getPointerOperand();
991 VectorType *VectorType = VectorType::get(
992 dyn_cast<PointerType>(Pointer->getType())->getElementType(), VectorWidth);
994 Value *Vector = UndefValue::get(VectorType);
996 for (int i = 0; i < VectorWidth; i++) {
997 Value *NewPointer = generateLocationAccessed(Stmt, Load, ScalarMaps[i],
998 VLTS[i], NewAccesses);
999 Value *ScalarLoad =
1000 Builder.CreateLoad(NewPointer, Load->getName() + "_p_scalar_");
1001 Vector = Builder.CreateInsertElement(
1002 Vector, ScalarLoad, Builder.getInt32(i), Load->getName() + "_p_vec_");
1005 return Vector;
1008 void VectorBlockGenerator::generateLoad(
1009 ScopStmt &Stmt, LoadInst *Load, ValueMapT &VectorMap,
1010 VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
1011 if (Value *PreloadLoad = GlobalMap.lookup(Load)) {
1012 VectorMap[Load] = Builder.CreateVectorSplat(getVectorWidth(), PreloadLoad,
1013 Load->getName() + "_p");
1014 return;
1017 if (!VectorType::isValidElementType(Load->getType())) {
1018 for (int i = 0; i < getVectorWidth(); i++)
1019 ScalarMaps[i][Load] =
1020 generateArrayLoad(Stmt, Load, ScalarMaps[i], VLTS[i], NewAccesses);
1021 return;
1024 const MemoryAccess &Access = Stmt.getArrayAccessFor(Load);
1026 // Make sure we have scalar values available to access the pointer to
1027 // the data location.
1028 extractScalarValues(Load, VectorMap, ScalarMaps);
1030 Value *NewLoad;
1031 if (Access.isStrideZero(isl::manage(isl_map_copy(Schedule))))
1032 NewLoad = generateStrideZeroLoad(Stmt, Load, ScalarMaps[0], NewAccesses);
1033 else if (Access.isStrideOne(isl::manage(isl_map_copy(Schedule))))
1034 NewLoad = generateStrideOneLoad(Stmt, Load, ScalarMaps, NewAccesses);
1035 else if (Access.isStrideX(isl::manage(isl_map_copy(Schedule)), -1))
1036 NewLoad = generateStrideOneLoad(Stmt, Load, ScalarMaps, NewAccesses, true);
1037 else
1038 NewLoad = generateUnknownStrideLoad(Stmt, Load, ScalarMaps, NewAccesses);
1040 VectorMap[Load] = NewLoad;
1043 void VectorBlockGenerator::copyUnaryInst(ScopStmt &Stmt, UnaryInstruction *Inst,
1044 ValueMapT &VectorMap,
1045 VectorValueMapT &ScalarMaps) {
1046 int VectorWidth = getVectorWidth();
1047 Value *NewOperand = getVectorValue(Stmt, Inst->getOperand(0), VectorMap,
1048 ScalarMaps, getLoopForStmt(Stmt));
1050 assert(isa<CastInst>(Inst) && "Can not generate vector code for instruction");
1052 const CastInst *Cast = dyn_cast<CastInst>(Inst);
1053 VectorType *DestType = VectorType::get(Inst->getType(), VectorWidth);
1054 VectorMap[Inst] = Builder.CreateCast(Cast->getOpcode(), NewOperand, DestType);
1057 void VectorBlockGenerator::copyBinaryInst(ScopStmt &Stmt, BinaryOperator *Inst,
1058 ValueMapT &VectorMap,
1059 VectorValueMapT &ScalarMaps) {
1060 Loop *L = getLoopForStmt(Stmt);
1061 Value *OpZero = Inst->getOperand(0);
1062 Value *OpOne = Inst->getOperand(1);
1064 Value *NewOpZero, *NewOpOne;
1065 NewOpZero = getVectorValue(Stmt, OpZero, VectorMap, ScalarMaps, L);
1066 NewOpOne = getVectorValue(Stmt, OpOne, VectorMap, ScalarMaps, L);
1068 Value *NewInst = Builder.CreateBinOp(Inst->getOpcode(), NewOpZero, NewOpOne,
1069 Inst->getName() + "p_vec");
1070 VectorMap[Inst] = NewInst;
1073 void VectorBlockGenerator::copyStore(
1074 ScopStmt &Stmt, StoreInst *Store, ValueMapT &VectorMap,
1075 VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
1076 const MemoryAccess &Access = Stmt.getArrayAccessFor(Store);
1078 auto *Pointer = Store->getPointerOperand();
1079 Value *Vector = getVectorValue(Stmt, Store->getValueOperand(), VectorMap,
1080 ScalarMaps, getLoopForStmt(Stmt));
1082 // Make sure we have scalar values available to access the pointer to
1083 // the data location.
1084 extractScalarValues(Store, VectorMap, ScalarMaps);
1086 if (Access.isStrideOne(isl::manage(isl_map_copy(Schedule)))) {
1087 Type *VectorPtrType = getVectorPtrTy(Pointer, getVectorWidth());
1088 Value *NewPointer = generateLocationAccessed(Stmt, Store, ScalarMaps[0],
1089 VLTS[0], NewAccesses);
1091 Value *VectorPtr =
1092 Builder.CreateBitCast(NewPointer, VectorPtrType, "vector_ptr");
1093 StoreInst *Store = Builder.CreateStore(Vector, VectorPtr);
1095 if (!Aligned)
1096 Store->setAlignment(8);
1097 } else {
1098 for (unsigned i = 0; i < ScalarMaps.size(); i++) {
1099 Value *Scalar = Builder.CreateExtractElement(Vector, Builder.getInt32(i));
1100 Value *NewPointer = generateLocationAccessed(Stmt, Store, ScalarMaps[i],
1101 VLTS[i], NewAccesses);
1102 Builder.CreateStore(Scalar, NewPointer);
1107 bool VectorBlockGenerator::hasVectorOperands(const Instruction *Inst,
1108 ValueMapT &VectorMap) {
1109 for (Value *Operand : Inst->operands())
1110 if (VectorMap.count(Operand))
1111 return true;
1112 return false;
1115 bool VectorBlockGenerator::extractScalarValues(const Instruction *Inst,
1116 ValueMapT &VectorMap,
1117 VectorValueMapT &ScalarMaps) {
1118 bool HasVectorOperand = false;
1119 int VectorWidth = getVectorWidth();
1121 for (Value *Operand : Inst->operands()) {
1122 ValueMapT::iterator VecOp = VectorMap.find(Operand);
1124 if (VecOp == VectorMap.end())
1125 continue;
1127 HasVectorOperand = true;
1128 Value *NewVector = VecOp->second;
1130 for (int i = 0; i < VectorWidth; ++i) {
1131 ValueMapT &SM = ScalarMaps[i];
1133 // If there is one scalar extracted, all scalar elements should have
1134 // already been extracted by the code here. So no need to check for the
1135 // existence of all of them.
1136 if (SM.count(Operand))
1137 break;
1139 SM[Operand] =
1140 Builder.CreateExtractElement(NewVector, Builder.getInt32(i));
1144 return HasVectorOperand;
1147 void VectorBlockGenerator::copyInstScalarized(
1148 ScopStmt &Stmt, Instruction *Inst, ValueMapT &VectorMap,
1149 VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
1150 bool HasVectorOperand;
1151 int VectorWidth = getVectorWidth();
1153 HasVectorOperand = extractScalarValues(Inst, VectorMap, ScalarMaps);
1155 for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
1156 BlockGenerator::copyInstruction(Stmt, Inst, ScalarMaps[VectorLane],
1157 VLTS[VectorLane], NewAccesses);
1159 if (!VectorType::isValidElementType(Inst->getType()) || !HasVectorOperand)
1160 return;
1162 // Make the result available as vector value.
1163 VectorType *VectorType = VectorType::get(Inst->getType(), VectorWidth);
1164 Value *Vector = UndefValue::get(VectorType);
1166 for (int i = 0; i < VectorWidth; i++)
1167 Vector = Builder.CreateInsertElement(Vector, ScalarMaps[i][Inst],
1168 Builder.getInt32(i));
1170 VectorMap[Inst] = Vector;
1173 int VectorBlockGenerator::getVectorWidth() { return VLTS.size(); }
1175 void VectorBlockGenerator::copyInstruction(
1176 ScopStmt &Stmt, Instruction *Inst, ValueMapT &VectorMap,
1177 VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
1178 // Terminator instructions control the control flow. They are explicitly
1179 // expressed in the clast and do not need to be copied.
1180 if (Inst->isTerminator())
1181 return;
1183 if (canSyntheziseInStmt(Stmt, Inst))
1184 return;
1186 if (auto *Load = dyn_cast<LoadInst>(Inst)) {
1187 generateLoad(Stmt, Load, VectorMap, ScalarMaps, NewAccesses);
1188 return;
1191 if (hasVectorOperands(Inst, VectorMap)) {
1192 if (auto *Store = dyn_cast<StoreInst>(Inst)) {
1193 // Identified as redundant by -polly-simplify.
1194 if (!Stmt.getArrayAccessOrNULLFor(Store))
1195 return;
1197 copyStore(Stmt, Store, VectorMap, ScalarMaps, NewAccesses);
1198 return;
1201 if (auto *Unary = dyn_cast<UnaryInstruction>(Inst)) {
1202 copyUnaryInst(Stmt, Unary, VectorMap, ScalarMaps);
1203 return;
1206 if (auto *Binary = dyn_cast<BinaryOperator>(Inst)) {
1207 copyBinaryInst(Stmt, Binary, VectorMap, ScalarMaps);
1208 return;
1211 // Fallthrough: We generate scalar instructions, if we don't know how to
1212 // generate vector code.
1215 copyInstScalarized(Stmt, Inst, VectorMap, ScalarMaps, NewAccesses);
1218 void VectorBlockGenerator::generateScalarVectorLoads(
1219 ScopStmt &Stmt, ValueMapT &VectorBlockMap) {
1220 for (MemoryAccess *MA : Stmt) {
1221 if (MA->isArrayKind() || MA->isWrite())
1222 continue;
1224 auto *Address = getOrCreateAlloca(*MA);
1225 Type *VectorPtrType = getVectorPtrTy(Address, 1);
1226 Value *VectorPtr = Builder.CreateBitCast(Address, VectorPtrType,
1227 Address->getName() + "_p_vec_p");
1228 auto *Val = Builder.CreateLoad(VectorPtr, Address->getName() + ".reload");
1229 Constant *SplatVector = Constant::getNullValue(
1230 VectorType::get(Builder.getInt32Ty(), getVectorWidth()));
1232 Value *VectorVal = Builder.CreateShuffleVector(
1233 Val, Val, SplatVector, Address->getName() + "_p_splat");
1234 VectorBlockMap[MA->getAccessValue()] = VectorVal;
1238 void VectorBlockGenerator::verifyNoScalarStores(ScopStmt &Stmt) {
1239 for (MemoryAccess *MA : Stmt) {
1240 if (MA->isArrayKind() || MA->isRead())
1241 continue;
1243 llvm_unreachable("Scalar stores not expected in vector loop");
1247 void VectorBlockGenerator::copyStmt(
1248 ScopStmt &Stmt, __isl_keep isl_id_to_ast_expr *NewAccesses) {
1249 assert(Stmt.isBlockStmt() &&
1250 "TODO: Only block statements can be copied by the vector block "
1251 "generator");
1253 BasicBlock *BB = Stmt.getBasicBlock();
1254 BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
1255 &*Builder.GetInsertPoint(), &DT, &LI);
1256 CopyBB->setName("polly.stmt." + BB->getName());
1257 Builder.SetInsertPoint(&CopyBB->front());
1259 // Create two maps that store the mapping from the original instructions of
1260 // the old basic block to their copies in the new basic block. Those maps
1261 // are basic block local.
1263 // As vector code generation is supported there is one map for scalar values
1264 // and one for vector values.
1266 // In case we just do scalar code generation, the vectorMap is not used and
1267 // the scalarMap has just one dimension, which contains the mapping.
1269 // In case vector code generation is done, an instruction may either appear
1270 // in the vector map once (as it is calculating >vectorwidth< values at a
1271 // time), or (if the values are calculated using scalar operations), it
1272 // appears once in every dimension of the scalarMap.
1273 VectorValueMapT ScalarBlockMap(getVectorWidth());
1274 ValueMapT VectorBlockMap;
1276 generateScalarVectorLoads(Stmt, VectorBlockMap);
1278 for (Instruction &Inst : *BB)
1279 copyInstruction(Stmt, &Inst, VectorBlockMap, ScalarBlockMap, NewAccesses);
1281 verifyNoScalarStores(Stmt);
1284 BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB,
1285 BasicBlock *BBCopy) {
1287 BasicBlock *BBIDom = DT.getNode(BB)->getIDom()->getBlock();
1288 BasicBlock *BBCopyIDom = EndBlockMap.lookup(BBIDom);
1290 if (BBCopyIDom)
1291 DT.changeImmediateDominator(BBCopy, BBCopyIDom);
1293 return StartBlockMap.lookup(BBIDom);
1296 // This is to determine whether an llvm::Value (defined in @p BB) is usable when
1297 // leaving a subregion. The straight-forward DT.dominates(BB, R->getExitBlock())
1298 // does not work in cases where the exit block has edges from outside the
1299 // region. In that case the llvm::Value would never be usable in the exit
1300 // block. The RegionGenerator however creates a new exit block ('ExitBBCopy')
1301 // for the subregion's exiting edges only. We need to determine whether an
1302 // llvm::Value is usable in there. We do this by checking whether it dominates
1303 // all exiting blocks individually.
1304 static bool isDominatingSubregionExit(const DominatorTree &DT, Region *R,
1305 BasicBlock *BB) {
1306 for (auto ExitingBB : predecessors(R->getExit())) {
1307 // Check for non-subregion incoming edges.
1308 if (!R->contains(ExitingBB))
1309 continue;
1311 if (!DT.dominates(BB, ExitingBB))
1312 return false;
1315 return true;
1318 // Find the direct dominator of the subregion's exit block if the subregion was
1319 // simplified.
1320 static BasicBlock *findExitDominator(DominatorTree &DT, Region *R) {
1321 BasicBlock *Common = nullptr;
1322 for (auto ExitingBB : predecessors(R->getExit())) {
1323 // Check for non-subregion incoming edges.
1324 if (!R->contains(ExitingBB))
1325 continue;
1327 // First exiting edge.
1328 if (!Common) {
1329 Common = ExitingBB;
1330 continue;
1333 Common = DT.findNearestCommonDominator(Common, ExitingBB);
1336 assert(Common && R->contains(Common));
1337 return Common;
1340 void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
1341 isl_id_to_ast_expr *IdToAstExp) {
1342 assert(Stmt.isRegionStmt() &&
1343 "Only region statements can be copied by the region generator");
1345 // Forget all old mappings.
1346 StartBlockMap.clear();
1347 EndBlockMap.clear();
1348 RegionMaps.clear();
1349 IncompletePHINodeMap.clear();
1351 // Collection of all values related to this subregion.
1352 ValueMapT ValueMap;
1354 // The region represented by the statement.
1355 Region *R = Stmt.getRegion();
1357 // Create a dedicated entry for the region where we can reload all demoted
1358 // inputs.
1359 BasicBlock *EntryBB = R->getEntry();
1360 BasicBlock *EntryBBCopy = SplitBlock(Builder.GetInsertBlock(),
1361 &*Builder.GetInsertPoint(), &DT, &LI);
1362 EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry");
1363 Builder.SetInsertPoint(&EntryBBCopy->front());
1365 ValueMapT &EntryBBMap = RegionMaps[EntryBBCopy];
1366 generateScalarLoads(Stmt, LTS, EntryBBMap, IdToAstExp);
1368 for (auto PI = pred_begin(EntryBB), PE = pred_end(EntryBB); PI != PE; ++PI)
1369 if (!R->contains(*PI)) {
1370 StartBlockMap[*PI] = EntryBBCopy;
1371 EndBlockMap[*PI] = EntryBBCopy;
1374 // Iterate over all blocks in the region in a breadth-first search.
1375 std::deque<BasicBlock *> Blocks;
1376 SmallSetVector<BasicBlock *, 8> SeenBlocks;
1377 Blocks.push_back(EntryBB);
1378 SeenBlocks.insert(EntryBB);
1380 while (!Blocks.empty()) {
1381 BasicBlock *BB = Blocks.front();
1382 Blocks.pop_front();
1384 // First split the block and update dominance information.
1385 BasicBlock *BBCopy = splitBB(BB);
1386 BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy);
1388 // Get the mapping for this block and initialize it with either the scalar
1389 // loads from the generated entering block (which dominates all blocks of
1390 // this subregion) or the maps of the immediate dominator, if part of the
1391 // subregion. The latter necessarily includes the former.
1392 ValueMapT *InitBBMap;
1393 if (BBCopyIDom) {
1394 assert(RegionMaps.count(BBCopyIDom));
1395 InitBBMap = &RegionMaps[BBCopyIDom];
1396 } else
1397 InitBBMap = &EntryBBMap;
1398 auto Inserted = RegionMaps.insert(std::make_pair(BBCopy, *InitBBMap));
1399 ValueMapT &RegionMap = Inserted.first->second;
1401 // Copy the block with the BlockGenerator.
1402 Builder.SetInsertPoint(&BBCopy->front());
1403 copyBB(Stmt, BB, BBCopy, RegionMap, LTS, IdToAstExp);
1405 // In order to remap PHI nodes we also store basic block mappings.
1406 StartBlockMap[BB] = BBCopy;
1407 EndBlockMap[BB] = Builder.GetInsertBlock();
1409 // Add values to incomplete PHI nodes waiting for this block to be copied.
1410 for (const PHINodePairTy &PHINodePair : IncompletePHINodeMap[BB])
1411 addOperandToPHI(Stmt, PHINodePair.first, PHINodePair.second, BB, LTS);
1412 IncompletePHINodeMap[BB].clear();
1414 // And continue with new successors inside the region.
1415 for (auto SI = succ_begin(BB), SE = succ_end(BB); SI != SE; SI++)
1416 if (R->contains(*SI) && SeenBlocks.insert(*SI))
1417 Blocks.push_back(*SI);
1419 // Remember value in case it is visible after this subregion.
1420 if (isDominatingSubregionExit(DT, R, BB))
1421 ValueMap.insert(RegionMap.begin(), RegionMap.end());
1424 // Now create a new dedicated region exit block and add it to the region map.
1425 BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(),
1426 &*Builder.GetInsertPoint(), &DT, &LI);
1427 ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit");
1428 StartBlockMap[R->getExit()] = ExitBBCopy;
1429 EndBlockMap[R->getExit()] = ExitBBCopy;
1431 BasicBlock *ExitDomBBCopy = EndBlockMap.lookup(findExitDominator(DT, R));
1432 assert(ExitDomBBCopy &&
1433 "Common exit dominator must be within region; at least the entry node "
1434 "must match");
1435 DT.changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);
1437 // As the block generator doesn't handle control flow we need to add the
1438 // region control flow by hand after all blocks have been copied.
1439 for (BasicBlock *BB : SeenBlocks) {
1441 BasicBlock *BBCopyStart = StartBlockMap[BB];
1442 BasicBlock *BBCopyEnd = EndBlockMap[BB];
1443 TerminatorInst *TI = BB->getTerminator();
1444 if (isa<UnreachableInst>(TI)) {
1445 while (!BBCopyEnd->empty())
1446 BBCopyEnd->begin()->eraseFromParent();
1447 new UnreachableInst(BBCopyEnd->getContext(), BBCopyEnd);
1448 continue;
1451 Instruction *BICopy = BBCopyEnd->getTerminator();
1453 ValueMapT &RegionMap = RegionMaps[BBCopyStart];
1454 RegionMap.insert(StartBlockMap.begin(), StartBlockMap.end());
1456 Builder.SetInsertPoint(BICopy);
1457 copyInstScalar(Stmt, TI, RegionMap, LTS);
1458 BICopy->eraseFromParent();
1461 // Add counting PHI nodes to all loops in the region that can be used as
1462 // replacement for SCEVs referring to the old loop.
1463 for (BasicBlock *BB : SeenBlocks) {
1464 Loop *L = LI.getLoopFor(BB);
1465 if (L == nullptr || L->getHeader() != BB || !R->contains(L))
1466 continue;
1468 BasicBlock *BBCopy = StartBlockMap[BB];
1469 Value *NullVal = Builder.getInt32(0);
1470 PHINode *LoopPHI =
1471 PHINode::Create(Builder.getInt32Ty(), 2, "polly.subregion.iv");
1472 Instruction *LoopPHIInc = BinaryOperator::CreateAdd(
1473 LoopPHI, Builder.getInt32(1), "polly.subregion.iv.inc");
1474 LoopPHI->insertBefore(&BBCopy->front());
1475 LoopPHIInc->insertBefore(BBCopy->getTerminator());
1477 for (auto *PredBB : make_range(pred_begin(BB), pred_end(BB))) {
1478 if (!R->contains(PredBB))
1479 continue;
1480 if (L->contains(PredBB))
1481 LoopPHI->addIncoming(LoopPHIInc, EndBlockMap[PredBB]);
1482 else
1483 LoopPHI->addIncoming(NullVal, EndBlockMap[PredBB]);
1486 for (auto *PredBBCopy : make_range(pred_begin(BBCopy), pred_end(BBCopy)))
1487 if (LoopPHI->getBasicBlockIndex(PredBBCopy) < 0)
1488 LoopPHI->addIncoming(NullVal, PredBBCopy);
1490 LTS[L] = SE.getUnknown(LoopPHI);
1493 // Continue generating code in the exit block.
1494 Builder.SetInsertPoint(&*ExitBBCopy->getFirstInsertionPt());
1496 // Write values visible to other statements.
1497 generateScalarStores(Stmt, LTS, ValueMap, IdToAstExp);
1498 StartBlockMap.clear();
1499 EndBlockMap.clear();
1500 RegionMaps.clear();
1501 IncompletePHINodeMap.clear();
1504 PHINode *RegionGenerator::buildExitPHI(MemoryAccess *MA, LoopToScevMapT &LTS,
1505 ValueMapT &BBMap, Loop *L) {
1506 ScopStmt *Stmt = MA->getStatement();
1507 Region *SubR = Stmt->getRegion();
1508 auto Incoming = MA->getIncoming();
1510 PollyIRBuilder::InsertPointGuard IPGuard(Builder);
1511 PHINode *OrigPHI = cast<PHINode>(MA->getAccessInstruction());
1512 BasicBlock *NewSubregionExit = Builder.GetInsertBlock();
1514 // This can happen if the subregion is simplified after the ScopStmts
1515 // have been created; simplification happens as part of CodeGeneration.
1516 if (OrigPHI->getParent() != SubR->getExit()) {
1517 BasicBlock *FormerExit = SubR->getExitingBlock();
1518 if (FormerExit)
1519 NewSubregionExit = StartBlockMap.lookup(FormerExit);
1522 PHINode *NewPHI = PHINode::Create(OrigPHI->getType(), Incoming.size(),
1523 "polly." + OrigPHI->getName(),
1524 NewSubregionExit->getFirstNonPHI());
1526 // Add the incoming values to the PHI.
1527 for (auto &Pair : Incoming) {
1528 BasicBlock *OrigIncomingBlock = Pair.first;
1529 BasicBlock *NewIncomingBlockStart = StartBlockMap.lookup(OrigIncomingBlock);
1530 BasicBlock *NewIncomingBlockEnd = EndBlockMap.lookup(OrigIncomingBlock);
1531 Builder.SetInsertPoint(NewIncomingBlockEnd->getTerminator());
1532 assert(RegionMaps.count(NewIncomingBlockStart));
1533 assert(RegionMaps.count(NewIncomingBlockEnd));
1534 ValueMapT *LocalBBMap = &RegionMaps[NewIncomingBlockStart];
1536 Value *OrigIncomingValue = Pair.second;
1537 Value *NewIncomingValue =
1538 getNewValue(*Stmt, OrigIncomingValue, *LocalBBMap, LTS, L);
1539 NewPHI->addIncoming(NewIncomingValue, NewIncomingBlockEnd);
1542 return NewPHI;
1545 Value *RegionGenerator::getExitScalar(MemoryAccess *MA, LoopToScevMapT &LTS,
1546 ValueMapT &BBMap) {
1547 ScopStmt *Stmt = MA->getStatement();
1549 // TODO: Add some test cases that ensure this is really the right choice.
1550 Loop *L = LI.getLoopFor(Stmt->getRegion()->getExit());
1552 if (MA->isAnyPHIKind()) {
1553 auto Incoming = MA->getIncoming();
1554 assert(!Incoming.empty() &&
1555 "PHI WRITEs must have originate from at least one incoming block");
1557 // If there is only one incoming value, we do not need to create a PHI.
1558 if (Incoming.size() == 1) {
1559 Value *OldVal = Incoming[0].second;
1560 return getNewValue(*Stmt, OldVal, BBMap, LTS, L);
1563 return buildExitPHI(MA, LTS, BBMap, L);
1566 // MemoryKind::Value accesses leaving the subregion must dominate the exit
1567 // block; just pass the copied value.
1568 Value *OldVal = MA->getAccessValue();
1569 return getNewValue(*Stmt, OldVal, BBMap, LTS, L);
1572 void RegionGenerator::generateScalarStores(
1573 ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
1574 __isl_keep isl_id_to_ast_expr *NewAccesses) {
1575 assert(Stmt.getRegion() &&
1576 "Block statements need to use the generateScalarStores() "
1577 "function in the BlockGenerator");
1579 for (MemoryAccess *MA : Stmt) {
1580 if (MA->isOriginalArrayKind() || MA->isRead())
1581 continue;
1583 isl::set AccDom = MA->getAccessRelation().domain();
1584 std::string Subject = MA->getId().get_name();
1585 generateConditionalExecution(
1586 Stmt, AccDom, Subject.c_str(), [&, this, MA]() {
1588 Value *NewVal = getExitScalar(MA, LTS, BBMap);
1589 Value *Address = getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS,
1590 BBMap, NewAccesses);
1591 assert((!isa<Instruction>(NewVal) ||
1592 DT.dominates(cast<Instruction>(NewVal)->getParent(),
1593 Builder.GetInsertBlock())) &&
1594 "Domination violation");
1595 assert((!isa<Instruction>(Address) ||
1596 DT.dominates(cast<Instruction>(Address)->getParent(),
1597 Builder.GetInsertBlock())) &&
1598 "Domination violation");
1599 Builder.CreateStore(NewVal, Address);
1604 void RegionGenerator::addOperandToPHI(ScopStmt &Stmt, PHINode *PHI,
1605 PHINode *PHICopy, BasicBlock *IncomingBB,
1606 LoopToScevMapT &LTS) {
1607 // If the incoming block was not yet copied, mark this PHI as incomplete.
1608 // Once the block is copied, the incoming value will be added.
1609 BasicBlock *BBCopyStart = StartBlockMap[IncomingBB];
1610 BasicBlock *BBCopyEnd = EndBlockMap[IncomingBB];
1611 if (!BBCopyStart) {
1612 assert(!BBCopyEnd);
1613 assert(Stmt.represents(IncomingBB) &&
1614 "Bad incoming block for PHI in non-affine region");
1615 IncompletePHINodeMap[IncomingBB].push_back(std::make_pair(PHI, PHICopy));
1616 return;
1619 assert(RegionMaps.count(BBCopyStart) &&
1620 "Incoming PHI block did not have a BBMap");
1621 ValueMapT &BBCopyMap = RegionMaps[BBCopyStart];
1623 Value *OpCopy = nullptr;
1625 if (Stmt.represents(IncomingBB)) {
1626 Value *Op = PHI->getIncomingValueForBlock(IncomingBB);
1628 // If the current insert block is different from the PHI's incoming block,
1629 // change it; otherwise do not.
1630 auto IP = Builder.GetInsertPoint();
1631 if (IP->getParent() != BBCopyEnd)
1632 Builder.SetInsertPoint(BBCopyEnd->getTerminator());
1633 OpCopy = getNewValue(Stmt, Op, BBCopyMap, LTS, getLoopForStmt(Stmt));
1634 if (IP->getParent() != BBCopyEnd)
1635 Builder.SetInsertPoint(&*IP);
1636 } else {
1637 // All edges from outside the non-affine region become a single edge
1638 // in the new copy of the non-affine region. Make sure to only add the
1639 // corresponding edge the first time we encounter a basic block from
1640 // outside the non-affine region.
1641 if (PHICopy->getBasicBlockIndex(BBCopyEnd) >= 0)
1642 return;
1644 // Get the reloaded value.
1645 OpCopy = getNewValue(Stmt, PHI, BBCopyMap, LTS, getLoopForStmt(Stmt));
1648 assert(OpCopy && "Incoming PHI value was not copied properly");
1649 PHICopy->addIncoming(OpCopy, BBCopyEnd);
1652 void RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, PHINode *PHI,
1653 ValueMapT &BBMap,
1654 LoopToScevMapT &LTS) {
1655 unsigned NumIncoming = PHI->getNumIncomingValues();
1656 PHINode *PHICopy =
1657 Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName());
1658 PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI());
1659 BBMap[PHI] = PHICopy;
1661 for (BasicBlock *IncomingBB : PHI->blocks())
1662 addOperandToPHI(Stmt, PHI, PHICopy, IncomingBB, LTS);