From 3110f6b18ee13c1a73bb86497381a1fddf7795e4 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Sep 2019 16:00:07 +0000
Subject: [PATCH] [CGP] Ensure sinking multiple instructions does not
 invalidate dominance checks

In MVE, as of rL371218, we are attempting to sink chains of instructions such as:
  %l1 = insertelement <8 x i8> undef, i8 %l0, i32 0
  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
In certain situations though, we can end up breaking the dominance relations of
instructions. This happens when we sink the instruction into a loop, but cannot
remove the originals. The Use is updated, which might in fact be a Use from the
second instruction to the first.

This attempts to fix that by reversing the order of instruction that are sunk,
and ensuring that we update the uses on new instructions if they have already
been sunk, not the old ones.

Differential Revision: https://reviews.llvm.org/D67366


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@371743 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp                     |  32 ++++--
 .../ARM/sink-add-mul-shufflevector.ll              |   7 +-
 test/Transforms/CodeGenPrepare/ARM/sinkchain.ll    | 107 +++++++++++++++++++++
 3 files changed, 135 insertions(+), 11 deletions(-)
 create mode 100644 test/Transforms/CodeGenPrepare/ARM/sinkchain.ll
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index aebfa2e1c11..b3a6e284f4f 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -6191,35 +6191,49 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
 
   // OpsToSink can contain multiple uses in a use chain (e.g.
   // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
-  // uses must come first, which means they are sunk first, temporarily creating
-  // invalid IR. This will be fixed once their dominated users are sunk and
-  // updated.
+  // uses must come first, so we process the ops in reverse order so as to not
+  // create invalid IR.
   BasicBlock *TargetBB = I->getParent();
   bool Changed = false;
   SmallVector<Use *, 4> ToReplace;
-  for (Use *U : OpsToSink) {
+  for (Use *U : reverse(OpsToSink)) {
     auto *UI = cast<Instruction>(U->get());
     if (UI->getParent() == TargetBB || isa<PHINode>(UI))
       continue;
     ToReplace.push_back(U);
   }
 
-  SmallPtrSet<Instruction *, 4> MaybeDead;
+  SetVector<Instruction *> MaybeDead;
+  DenseMap<Instruction *, Instruction *> NewInstructions;
+  Instruction *InsertPoint = I;
   for (Use *U : ToReplace) {
     auto *UI = cast<Instruction>(U->get());
     Instruction *NI = UI->clone();
+    NewInstructions[UI] = NI;
     MaybeDead.insert(UI);
     LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
-    NI->insertBefore(I);
+    NI->insertBefore(InsertPoint);
+    InsertPoint = NI;
     InsertedInsts.insert(NI);
-    U->set(NI);
+
+    // Update the use for the new instruction, making sure that we update the
+    // sunk instruction uses, if it is part of a chain that has already been
+    // sunk.
+    Instruction *OldI = cast<Instruction>(U->getUser());
+    if (NewInstructions.count(OldI))
+      NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
+    else
+      U->set(NI);
     Changed = true;
   }
 
   // Remove instructions that are dead after sinking.
-  for (auto *I : MaybeDead)
-    if (!I->hasNUsesOrMore(1))
+  for (auto *I : MaybeDead) {
+    if (!I->hasNUsesOrMore(1)) {
+      LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
       I->eraseFromParent();
+    }
+  }
 
   return Changed;
 }
diff --git a/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll b/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
index 739877b3f0c..cb0737ffaeb 100644
--- a/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
+++ b/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
@@ -45,9 +45,12 @@ define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2
 ; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
 ; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK:    vector.body:
-; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 %x, i32 0
 ; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK:      [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      mul nsw <4 x i32> %wide.load, [[TMP3]]
+; CHECK:      [[TMP2b:%.*]] = insertelement <4 x i32> undef, i32 %x, i32 0
+; CHECK:      [[TMP3b:%.*]] = shufflevector <4 x i32> [[TMP2b]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      mul nsw <4 x i32> %wide.load18, [[TMP3b]]
 ;
 entry:
   %cmp13 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll b/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll
new file mode 100644
index 00000000000..c0da3eb5666
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
+
+; Sink the shufflevector/insertelement pair, followed by the trunc. The sunk instruction end up dead.
+define signext i8 @dead(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) {
+; CHECK-LABEL: @dead(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N:%.*]], -8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[L6:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L7:%.*]] = bitcast i16* [[L6]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
+; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
+; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[L9]], <8 x i8>* [[L14]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[L15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[L15]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+entry:
+  %n.vec = and i32 %n, -8
+  %l0 = trunc i16 %x to i8
+  %l1 = insertelement <8 x i8> undef, i8 %l0, i32 0
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %l6 = getelementptr inbounds i16, i16* %s1, i32 %index
+  %l7 = bitcast i16* %l6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %l7, align 2
+  %l8 = trunc <8 x i16> %wide.load to <8 x i8>
+  %l9 = mul <8 x i8> %broadcast.splat26, %l8
+  %l13 = getelementptr inbounds i8, i8* %d, i32 %index
+  %l14 = bitcast i8* %l13 to <8 x i8>*
+  store <8 x i8> %l9, <8 x i8>* %l14, align 1
+  %index.next = add i32 %index, 8
+  %l15 = icmp eq i32 %index.next, %n.vec
+  br i1 %l15, label %exit, label %vector.body
+
+exit:                                     ; preds = %vector.body
+  ret i8 0
+}
+
+; Same as above, but the shuffle has an extra use meaning it shouldnt be deleted
+define signext i8 @alive(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) {
+; CHECK-LABEL: @alive(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N:%.*]], -8
+; CHECK-NEXT:    [[L0:%.*]] = trunc i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[L1:%.*]] = insertelement <8 x i8> undef, i8 [[L0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[L1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L2:%.*]] = sub <8 x i8> zeroinitializer, [[BROADCAST_SPLAT26]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i16 [[X]] to i8
+; CHECK-NEXT:    [[L6:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L7:%.*]] = bitcast i16* [[L6]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
+; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
+; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[L9]], <8 x i8>* [[L14]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[L15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[L15]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+entry:
+  %n.vec = and i32 %n, -8
+  %l0 = trunc i16 %x to i8
+  %l1 = insertelement <8 x i8> undef, i8 %l0, i32 0
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %l2 = sub <8 x i8> zeroinitializer, %broadcast.splat26
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %l6 = getelementptr inbounds i16, i16* %s1, i32 %index
+  %l7 = bitcast i16* %l6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %l7, align 2
+  %l8 = trunc <8 x i16> %wide.load to <8 x i8>
+  %l9 = mul <8 x i8> %broadcast.splat26, %l8
+  %l13 = getelementptr inbounds i8, i8* %d, i32 %index
+  %l14 = bitcast i8* %l13 to <8 x i8>*
+  store <8 x i8> %l9, <8 x i8>* %l14, align 1
+  %index.next = add i32 %index, 8
+  %l15 = icmp eq i32 %index.next, %n.vec
+  br i1 %l15, label %exit, label %vector.body
+
+exit:                                     ; preds = %vector.body
+  ret i8 0
+}
-- 
2.11.4.GIT