//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
/// ARMAllocLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
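///
/// A typical transformation (illustrative):
///   ldr r4, [r0]
///   ldr r5, [r0, #4]   =>   ldmia r0, {r4, r5, r6}
///   ldr r6, [r0, #8]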
namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Reg;
      bool isKill;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
                      MachineBasicBlock::iterator i)
        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    ++NumLDMGened;
    return ARM::LDM;
  case ARM::STR:
    ++NumSTMGened;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    return ARM::t2STM;
  case ARM::VLDRS:
    ++NumVLDMGened;
    return ARM::VLDMS;
  case ARM::VSTRS:
    ++NumVSTMGened;
    return ARM::VSTMS;
  case ARM::VLDRD:
    ++NumVLDMGened;
    return ARM::VLDMD;
  case ARM::VSTRD:
    ++NumVSTMGened;
    return ARM::VSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}
static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDR || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STR || isT2i32Store(Opc);
}
/// MergeOps - Create and insert an LDM or STM with Base as the base register
/// and the registers in Regs as the register operands that would be loaded /
/// stored. It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;
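
  // The four load/store-multiple submodes address an N-register transfer
  // from base rn as follows (illustrative):
  //   ia: [rn .. rn+4*(N-1)]      ib: [rn+4 .. rn+4*N]
  //   da: [rn-4*(N-1) .. rn]      db: [rn-4*N .. rn-4]
  // The checks below pick the submode whose first address matches Offset.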
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2;
  if (Offset == 4 && haveIBAndDA)
    Mode = ARM_AM::ib;
  else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA)
    Mode = ARM_AM::da;
  else if (Offset == -4 * (int)NumRegs && isNotVFP)
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  else if (Offset != 0) {
    // If the starting offset isn't zero, insert an MI to materialize a new
    // base. But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }
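
  // For example (illustrative), merging three loads that start at [r0, #8]
  // first materializes the new base:
  //   ldr r4, [r0, #8]              add   r6, r0, #8
  //   ldr r5, [r0, #12]     =>      ldmia r6, {r4, r5, r6}
  //   ldr r6, [r0, #16]
  // For loads, the last destination register (r6 here) doubles as the new
  // base, so no scratch register is needed.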
  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                Opcode == ARM::VLDRD);
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg);
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
// MergeOpsUpdate - Call MergeOps and update MemOps and Merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                     MemOpQueue &memOps,
                                     unsigned memOpsBegin, unsigned memOpsEnd,
                                     unsigned insertAfter, int Offset,
                                     unsigned Base, bool BaseKill,
                                     int Opcode,
                                     ARMCC::CondCodes Pred, unsigned PredReg,
                                     unsigned Scratch,
                                     DebugLoc dl,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  const unsigned insertPos = memOps[insertAfter].Position;

  SmallSet<unsigned, 4> UnavailRegs;
  SmallSet<unsigned, 4> KilledRegs;
  DenseMap<unsigned, unsigned> Killer;
  for (unsigned i = 0; i < memOpsBegin; ++i) {
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      if (memOps[i].Merged)
        UnavailRegs.insert(Reg);
      else {
        KilledRegs.insert(Reg);
        Killer[Reg] = i;
      }
    }
  }
  for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) {
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      KilledRegs.insert(Reg);
      Killer[Reg] = i;
    }
  }

  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    unsigned Reg = memOps[i].Reg;
    if (UnavailRegs.count(Reg))
      // The register is killed earlier, and it's not easy / possible to update
      // the kill marker on already-merged instructions. Abort.
      return;

    // If we are inserting the merged operation after an unmerged operation that
    // uses the same register, make sure to transfer any kill flag.
    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
    Regs.push_back(std::make_pair(Reg, isKill));
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  ++Loc;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(prior(Loc));
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any unmerged memops that come before insertPos.
    if (Regs[i-memOpsBegin].second) {
      unsigned Reg = Regs[i-memOpsBegin].first;
      if (KilledRegs.count(Reg)) {
        unsigned j = Killer[Reg];
        memOps[j].MBBI->getOperand(0).setIsKill(false);
        memOps[j].isKill = false;
      }
    }
    MBB.erase(memOps[i].MBBI);
    memOps[i].Merged = true;
  }
}
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
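///
/// Register numbers must ascend with the addresses, so e.g. (illustrative)
/// "ldr r2, [r0]; ldr r1, [r0, #4]" cannot become an ldm; such a run is
/// split and each mergeable subsequence is handled recursively.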
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX
    : ARMRegisterInfo::getRegisterNumbering(PReg);
  unsigned Count = 1;

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX
      : ARMRegisterInfo::getRegisterNumbering(Reg);
    // Register numbers must be in ascending order. For VFP, the registers
    // must also be consecutive and there is a limit of 16 double-word
    // registers per instruction.
    if (Reg != ARM::SP &&
        NewOffset == Offset + (int)Size &&
        ((isNotVFP && RegNum > PRegNum)
         || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) {
      Offset += Size;
      PRegNum = RegNum;
      ++Count;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
  return;
}
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}
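
// For example (illustrative), with Base = r0 and Bytes = 12 this accepts
// "sub r0, r0, #12" (same register as source and destination, matching
// predicate); tSUBspi immediates are scaled by 4 before comparing.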

static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}
static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
  case ARM::VLDMS:
  case ARM::VSTMS:
  case ARM::VLDMD:
  case ARM::VSTMD:
    return (MI->getNumOperands() - 4) * 4;
  }
}
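
// In the multiple-transfer cases above, the first four operands are fixed
// (base register, addressing-mode immediate, predicate, predicate register)
// as these instructions are built in this pass; every remaining operand is
// one transferred register, hence (NumOperands - 4) * 4 bytes.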

static unsigned getUpdatingLSMultipleOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDM: return ARM::LDM_UPD;
  case ARM::STM: return ARM::STM_UPD;
  case ARM::t2LDM: return ARM::t2LDM_UPD;
  case ARM::t2STM: return ARM::t2STM_UPD;
  case ARM::VLDMS: return ARM::VLDMS_UPD;
  case ARM::VLDMD: return ARM::VLDMD_UPD;
  case ARM::VSTMS: return ARM::VSTMS_UPD;
  case ARM::VSTMD: return ARM::VSTMD_UPD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}
/// MergeBaseUpdateLSMultiple - Fold a preceding/trailing inc/dec of the base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                                bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  bool BaseKill = MI->getOperand(0).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = ARM_AM::ia;

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).getReg() == Base)
      return false;
  }

  Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(ARM_AM::getAM4ModeImm(Mode))
    .addImm(Pred).addReg(PredReg);
  // Transfer the rest of operands.
  for (unsigned OpNum = 4, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));
  // Transfer memoperands.
  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_PRE;
  case ARM::STR: return ARM::STR_PRE;
  case ARM::VLDRS: return ARM::VLDMS_UPD;
  case ARM::VLDRD: return ARM::VLDMD_UPD;
  case ARM::VSTRS: return ARM::VSTMS_UPD;
  case ARM::VSTRD: return ARM::VSTMD_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::VLDRS: return ARM::VLDMS_UPD;
  case ARM::VLDRD: return ARM::VLDMD_UPD;
  case ARM::VSTRS: return ARM::VSTMS_UPD;
  case ARM::VSTRD: return ARM::VSTMD_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}
/// MergeBaseUpdateLoadStore - Fold a preceding/trailing inc/dec of the base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
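///
/// For example (illustrative):
///   ldr r3, [r0]
///   add r0, r0, #4     =>   ldr r3, [r0], #4
/// and a preceding "sub r0, r0, #4" folds to the pre-indexed
/// "ldr r3, [r0, #-4]!".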
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDR || Opcode == ARM::STR);
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM4ModeImm(AddSub == ARM_AM::sub ?
                                   ARM_AM::db : ARM_AM::ia);
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Offset)
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2)
      // LDR_PRE, LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}
/// isMemoryOp - Returns true if the instruction is a memory operation (that
/// this pass is capable of operating on).
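/// For example (illustrative), a volatile access, an access with alignment
/// below 4 bytes, or an ldr/str with an undef register operand is rejected,
/// while a plain immediate-offset "ldr r1, [r0, #4]" qualifies.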
static bool isMemoryOp(const MachineInstr *MI) {
  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI->hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI->memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO->isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO->getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
      MI->getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
      MI->getOperand(1).isUndef())
    return false;

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::LDR:
  case ARM::STR:
    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}
/// AdvanceRS - Advance the register scavenger to just before the earliest
/// memory op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}
static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
    return OffField;

  int Offset = isAM2
    ? ARM_AM::getAM2Offset(OffField)
    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
             : ARM_AM::getAM5Offset(OffField) * 4);
  if (isAM2) {
    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}
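
// Thumb2 offsets are stored directly in bytes and pass through unchanged
// above. The ARM addressing modes instead encode a magnitude plus an add/sub
// flag, and AM5 (VLDR/VSTR) magnitudes are in words, hence the "* 4" and the
// sign fix-ups.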

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int OffImm, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          unsigned OffReg, bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  int Offset = OffImm;
  if (!isT2) {
    if (OffImm < 0)
      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
    else
      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
  }
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    MachineBasicBlock::iterator NewBBI = MBBI;
    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to an
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
      NewBBI = llvm::prior(MBBI);
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load. Make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBB.erase(MI);
    MBBI = NewBBI;
    return true;
  }
  return false;
}
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
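///
/// The block is scanned linearly; loads / stores sharing a base register,
/// opcode, and predicate are collected into a MemOpQueue sorted by offset,
/// and the queue is flushed through MergeLDR_STR whenever the chain is
/// broken, e.g. (illustrative) by a redefinition of the base register.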
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      bool isKill = MO.isDef() ? false : MO.isKill();
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
        ++NumMemOps;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
                                             Position, MBBI));
            ++NumMemOps;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
                                                 Position, MBBI));
                ++NumMemOps;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (MBBI->isDebugValue()) {
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // instructions that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If the iterator hasn't been advanced and this is not a memory op, skip
      // it. It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}
/// MergeReturnIntoLDM - If this is an exit BB, try merging the return ops
/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
/// restores the value of LR directly into pc.
/// ldmfd sp!, {..., lr}
/// bx lr
/// or
/// ldmfd sp!, {..., lr}
/// mov pc, lr
/// =>
/// ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineInstr *PrevMI = prior(MBBI);
    if (PrevMI->getOpcode() == ARM::LDM_UPD ||
        PrevMI->getOpcode() == ARM::t2LDM_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}
/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.
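///
/// For example (illustrative), the second load is hoisted next to the first:
///   ldr r2, [r0]                 ldr r2, [r0]
///   add r7, r7, #1       =>      ldr r3, [r0, #4]
///   ldr r3, [r0, #4]             add r7, r7, #1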
namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD  = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias information
  // some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
    return false;

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}
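
// For example (illustrative), two adjacent word loads
//   ldr r0, [r2]
//   ldr r1, [r2, #4]
// can become "ldrd r0, r1, [r2]" when the target has V5TE ops, the address
// is suitably aligned, and the offset fits; the register allocation hints
// set in RescheduleOps below encourage assigning an even/odd pair.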

bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative: if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form a register pair.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }
  return RetVal;
}
/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}