test/CodeGen/ARM/loop-indexing.ll

   1 ; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
   2 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
   3 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
   4 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
   5 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
   6 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
   7
   8 ; Tests to check that post increment addressing modes are used instead of
   9 ; updating base pointers with add instructions.
  10
  11 ; TODO: It should be possible to use post-increment addressing with VLDM
  12 ; instructions.
  13 ; CHECK-LABEL: test_fma
  14 ; CHECK: @ %loop
  15
  16 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
  17 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
  18 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
  19 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
  20
  21 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
  22 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
  23 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
  24 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
  25
  26 define float @test_fma(float* %a, float* %b, i32 %N) {
     ; Adds a[idx]*b[idx] + a[idx|1]*b[idx|1] to the running sum %res on every
     ; trip of %loop (two elements per trip, written as separate fmul/fadd
     ; pairs) and returns the accumulated value from %exit.
  27 entry:
  28   br label %loop
  29
  30 loop:
  31   %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] ; loop counter, stepped by -2
  32   %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] ; element index, stepped by +2
  33   %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ] ; running sum
  34   %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
  35   %a.1 = load float, float* %gep.a.1
  36   %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
  37   %b.1 = load float, float* %gep.b.1
  38   %fmul.1 = fmul float %a.1, %b.1
  39   %fma.1 = fadd float %fmul.1, %res
  40   %idx.2 = or i32 %idx.1, 1 ; idx.1 is always even (0, 2, 4, ...), so this equals idx.1 + 1
  41   %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
  42   %a.2 = load float, float* %gep.a.2
  43   %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
  44   %b.2 = load float, float* %gep.b.2
  45   %fmul.2 = fmul float %a.2, %b.2
  46   %fma.2 = fadd float %fmul.2, %fma.1
     ; NOTE(review): %i starts at 0 and steps by -2, so %i.next is 0xFFFFFFFE
     ; after the first trip and the unsigned compare below only holds for
     ; %N == 0xFFFFFFFF. Confirm this counter/flag combination is intended.
  47   %i.next = add nsw nuw i32 %i, -2
  48   %idx.next = add nsw nuw i32 %idx.1, 2
  49   %cmp = icmp ult i32 %i.next, %N
  50   br i1 %cmp, label %loop, label %exit
  51
  52 exit:
  53   ret float %fma.2
  54 }
  55
  56 ; CHECK-LABEL: convolve_16bit
  57 ; TODO: Both arrays should use indexing
  58 ; CHECK-DEFAULT: ldr{{.*}}, #8]!
  59 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
  60
  61 ; CHECK-COMPLEX: ldr{{.*}}, #8]!
  62 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
  63
  64 ; DISABLED-NOT: ldr{{.*}}]!
  65 ; DISABLED-NOT: str{{.*}}]!
  66
  67 define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
  68                             i32 %filter_dim, i32 %out_width, i32 %out_height,
  69                             i32** nocapture readonly %convolved) {
     ; For every output row res_y (< %out_height) and column res_x (< %out_width),
     ; sums sext(filter[fy][fx]) * sext(input_image[res_y + fy][res_x + fx]) over
     ; the filter rows fy and columns fx, and stores the i32 result to
     ; convolved[res_y][res_x]. The innermost loop (%for.body12.us.us) handles
     ; four consecutive fx values per trip.
     ; NOTE(review): the innermost trip count is filter_dim - (filter_dim & 3);
     ; no remainder loop and no guard for a zero trip count are present here,
     ; presumably trimmed from the original unrolled code. Confirm.
  70 entry:
  71   %cmp92 = icmp eq i32 %out_height, 0
  72   br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
  73
  74 for.cond1.preheader.lr.ph:                        ; preds = %entry
  75   %xtraiter = and i32 %filter_dim, 3
  76   %unroll_iter = sub i32 %filter_dim, %xtraiter
  77   br label %for.cond1.preheader
  78
  79 for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
  80   %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
  81   %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
  82   %tmp3 = load i32*, i32** %arrayidx22, align 4
  83   br label %for.cond9.preheader.us.us.preheader
  84
  85 for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond1.preheader
  86   %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
  87   br label %for.cond9.preheader.us.us
  88
  89 for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, %for.cond9.preheader.us.us.preheader
  90   %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  91   %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  92   %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
  93   %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
  94   %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
  95   %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
  96   %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
  97   br label %for.body12.us.us
  98
  99 for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
 100   %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
 101   %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
 102   %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
     ; Element 0 of this trip: filter column filter_x.
 103   %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
 104   %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
 105   %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
 106   %conv.us.us = sext i16 %tmp9 to i32
 107   %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
 108   %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
 109   %conv17.us.us = sext i16 %tmp10 to i32
 110   %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
 111   %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
     ; Element 1: filter_x is a multiple of 4 (starts at 0, steps by 4), so the
     ; `or` with 1, 2 and 3 below yields filter_x + 1, + 2 and + 3.
 112   %inc.us.us = or i32 %filter_x.053.us.us, 1
 113   %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
 114   %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
 115   %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
 116   %conv.us.us.1 = sext i16 %tmp11 to i32
 117   %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
 118   %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
 119   %conv17.us.us.1 = sext i16 %tmp12 to i32
 120   %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
 121   %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
     ; Element 2.
 122   %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
 123   %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
 124   %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
 125   %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
 126   %conv.us.us.2 = sext i16 %tmp13 to i32
 127   %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
 128   %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
 129   %conv17.us.us.2 = sext i16 %tmp14 to i32
 130   %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
 131   %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
     ; Element 3.
 132   %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
 133   %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
 134   %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
 135   %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
 136   %conv.us.us.3 = sext i16 %tmp15 to i32
 137   %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
 138   %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
 139   %conv17.us.us.3 = sext i16 %tmp16 to i32
 140   %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
 141   %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
     ; Advance by four columns and count the remaining trips down to zero.
 142   %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
 143   %niter.nsub.3 = add i32 %niter, -4
 144   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 145   br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
 146
 147 for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us
 148   %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
 149   %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
 150   br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
 151
 152 for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa
 153   %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
 154   store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
 155   %add25.us = add nuw i32 %res_x.060.us, 1
 156   %exitcond99 = icmp eq i32 %add25.us, %out_width
 157   br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
 158
 159 for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us
 160   %add28 = add nuw i32 %res_y.093, 1
 161   %exitcond100 = icmp eq i32 %add28, %out_height
 162   br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
 163
 164 for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
 165   ret void
 166 }
 167
 168 ; CHECK-LABEL: mul_8x8
 169 ; CHECK: @ %for.body
 170
 171 ; CHECK-DEFAULT: str{{.*}}, #16]!
 172 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 173 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 174
 175 ; CHECK-COMPLEX: str{{.*}}, #16]!
 176 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 177 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 178
 179 ; DISABLED-NOT: ldr{{.*}}]!
 180 ; DISABLED-NOT: str{{.*}}]!
 181
 182 ; CHECK-T2: @ %for.body.epil
 183 ; CHECK-T2: ldrb{{.*}}, #1]!
 184 ; CHECK-T2: ldrb{{.*}}, #1]!
 185 ; CHECK-T2: str{{.*}}, #4]!
 186
 187 define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
     ; Writes C[i] = zext(A[i]) * zext(B[i]) for i in [0, N). %for.body is the
     ; main loop handling four elements per trip; %for.body.epil handles the
     ; remaining N & 3 elements one at a time.
 188 entry:
 189   %cmp9 = icmp eq i32 %N, 0
 190   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
 191
 192 for.body.preheader:                               ; preds = %entry
 193   %tmp = add i32 %N, -1
 194   %xtraiter = and i32 %N, 3 ; number of leftover elements for the epilogue
 195   %tmp1 = icmp ult i32 %tmp, 3 ; N is non-zero here, so this is true exactly when N < 4
 196   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
 197
 198 for.body.preheader.new:                           ; preds = %for.body.preheader
 199   %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of 4
 200   br label %for.body
 201
 202 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
 203   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
 204   %lcmp.mod = icmp eq i32 %xtraiter, 0
 205   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
 206
 207 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
 208   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
 209   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
 210   %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
 211   %tmp2 = load i8, i8* %arrayidx.epil, align 1
 212   %conv.epil = zext i8 %tmp2 to i32
 213   %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
 214   %tmp3 = load i8, i8* %arrayidx1.epil, align 1
 215   %conv2.epil = zext i8 %tmp3 to i32
 216   %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
 217   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
 218   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
 219   %inc.epil = add nuw i32 %i.010.epil, 1
 220   %epil.iter.sub = add i32 %epil.iter, -1
 221   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 222   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
 223
 224 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
 225   ret void
 226
 227 for.body:                                         ; preds = %for.body, %for.body.preheader.new
 228   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
 229   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
     ; Element i.
 230   %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
 231   %tmp4 = load i8, i8* %arrayidx, align 1
 232   %conv = zext i8 %tmp4 to i32
 233   %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
 234   %tmp5 = load i8, i8* %arrayidx1, align 1
 235   %conv2 = zext i8 %tmp5 to i32
 236   %mul = mul nuw nsw i32 %conv2, %conv
 237   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
 238   store i32 %mul, i32* %arrayidx3, align 4
     ; Elements i+1 .. i+3: i is a multiple of 4 (starts at 0, steps by 4), so
     ; the `or` with 1, 2 and 3 is the same as adding them.
 239   %inc = or i32 %i.010, 1
 240   %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
 241   %tmp6 = load i8, i8* %arrayidx.1, align 1
 242   %conv.1 = zext i8 %tmp6 to i32
 243   %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
 244   %tmp7 = load i8, i8* %arrayidx1.1, align 1
 245   %conv2.1 = zext i8 %tmp7 to i32
 246   %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
 247   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
 248   store i32 %mul.1, i32* %arrayidx3.1, align 4
 249   %inc.1 = or i32 %i.010, 2
 250   %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
 251   %tmp8 = load i8, i8* %arrayidx.2, align 1
 252   %conv.2 = zext i8 %tmp8 to i32
 253   %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
 254   %tmp9 = load i8, i8* %arrayidx1.2, align 1
 255   %conv2.2 = zext i8 %tmp9 to i32
 256   %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
 257   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
 258   store i32 %mul.2, i32* %arrayidx3.2, align 4
 259   %inc.2 = or i32 %i.010, 3
 260   %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
 261   %tmp10 = load i8, i8* %arrayidx.3, align 1
 262   %conv.3 = zext i8 %tmp10 to i32
 263   %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
 264   %tmp11 = load i8, i8* %arrayidx1.3, align 1
 265   %conv2.3 = zext i8 %tmp11 to i32
 266   %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
 267   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
 268   store i32 %mul.3, i32* %arrayidx3.3, align 4
     ; Advance by four elements and count the remaining trips down to zero.
 269   %inc.3 = add i32 %i.010, 4
 270   %niter.nsub.3 = add i32 %niter, -4
 271   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 272   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
 273 }
 274
 275 ; CHECK-LABEL: mul_16x8
 276 ; CHECK: @ %for.body
 277
 278 ; CHECK-DEFAULT: str{{.*}}, #16]!
 279 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 280
 281 ; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
 282 ; CHECK-COMPLEX: str{{.*}}, #16]!
 283 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 284
 285 ; DISABLED-NOT: ldr{{.*}}]!
 286 ; DISABLED-NOT: str{{.*}}]!
 287
 288 ; CHECK-T2: @ %for.body.epil
 289 ; CHECK-T2: ldrsh{{.*}}, #2]!
 290 ; CHECK-T2: ldrb{{.*}}, #1]!
 291 ; CHECK-T2: str{{.*}}, #4]!
 292
 293 define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
     ; Writes C[i] = sext(A[i]) * zext(B[i]) for i in [0, N), with i16 elements
     ; in A and i8 elements in B. %for.body is the main loop handling four
     ; elements per trip; %for.body.epil handles the remaining N & 3 elements.
 294 entry:
 295   %cmp9 = icmp eq i32 %N, 0
 296   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
 297
 298 for.body.preheader:                               ; preds = %entry
 299   %tmp = add i32 %N, -1
 300   %xtraiter = and i32 %N, 3 ; number of leftover elements for the epilogue
 301   %tmp1 = icmp ult i32 %tmp, 3 ; N is non-zero here, so this is true exactly when N < 4
 302   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
 303
 304 for.body.preheader.new:                           ; preds = %for.body.preheader
 305   %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of 4
 306   br label %for.body
 307
 308 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
 309   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
 310   %lcmp.mod = icmp eq i32 %xtraiter, 0
 311   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
 312
 313 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
 314   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
 315   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
 316   %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
 317   %tmp2 = load i16, i16* %arrayidx.epil, align 2
 318   %conv.epil = sext i16 %tmp2 to i32
 319   %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
 320   %tmp3 = load i8, i8* %arrayidx1.epil, align 1
 321   %conv2.epil = zext i8 %tmp3 to i32
 322   %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
 323   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
 324   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
 325   %inc.epil = add nuw i32 %i.010.epil, 1
 326   %epil.iter.sub = add i32 %epil.iter, -1
 327   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 328   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
 329
 330 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
 331   ret void
 332
 333 for.body:                                         ; preds = %for.body, %for.body.preheader.new
 334   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
 335   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
     ; Element i.
 336   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
 337   %tmp4 = load i16, i16* %arrayidx, align 2
 338   %conv = sext i16 %tmp4 to i32
 339   %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
 340   %tmp5 = load i8, i8* %arrayidx1, align 1
 341   %conv2 = zext i8 %tmp5 to i32
 342   %mul = mul nsw i32 %conv2, %conv
 343   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
 344   store i32 %mul, i32* %arrayidx3, align 4
     ; Elements i+1 .. i+3: i is a multiple of 4 (starts at 0, steps by 4), so
     ; the `or` with 1, 2 and 3 is the same as adding them.
 345   %inc = or i32 %i.010, 1
 346   %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
 347   %tmp6 = load i16, i16* %arrayidx.1, align 2
 348   %conv.1 = sext i16 %tmp6 to i32
 349   %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
 350   %tmp7 = load i8, i8* %arrayidx1.1, align 1
 351   %conv2.1 = zext i8 %tmp7 to i32
 352   %mul.1 = mul nsw i32 %conv2.1, %conv.1
 353   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
 354   store i32 %mul.1, i32* %arrayidx3.1, align 4
 355   %inc.1 = or i32 %i.010, 2
 356   %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
 357   %tmp8 = load i16, i16* %arrayidx.2, align 2
 358   %conv.2 = sext i16 %tmp8 to i32
 359   %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
 360   %tmp9 = load i8, i8* %arrayidx1.2, align 1
 361   %conv2.2 = zext i8 %tmp9 to i32
 362   %mul.2 = mul nsw i32 %conv2.2, %conv.2
 363   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
 364   store i32 %mul.2, i32* %arrayidx3.2, align 4
 365   %inc.2 = or i32 %i.010, 3
 366   %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
 367   %tmp10 = load i16, i16* %arrayidx.3, align 2
 368   %conv.3 = sext i16 %tmp10 to i32
 369   %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
 370   %tmp11 = load i8, i8* %arrayidx1.3, align 1
 371   %conv2.3 = zext i8 %tmp11 to i32
 372   %mul.3 = mul nsw i32 %conv2.3, %conv.3
 373   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
 374   store i32 %mul.3, i32* %arrayidx3.3, align 4
     ; Advance by four elements and count the remaining trips down to zero.
 375   %inc.3 = add i32 %i.010, 4
 376   %niter.nsub.3 = add i32 %niter, -4
 377   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 378   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
 379 }
 380
 381 ; CHECK-LABEL: mul_16x16
 382 ; CHECK: @ %for.body
 383
 384 ; TODO: pre-indexed loads
 385 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
 386 ; CHECK-DEFAULT: str{{.*}}, #16]!
 387 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
 388
 389 ; CHECK-COMPLEX: ldrsh{{.*}}]!
 390 ; CHECK-COMPLEX: ldrsh{{.*}}]!
 391 ; CHECK-COMPLEX: str{{.*}}]!
 392
 393 ; DISABLED-NOT: ldr{{.*}}]!
 394 ; DISABLED-NOT: str{{.*}}]!
 395
 396 ; CHECK-T2: @ %for.body.epil
 397 ; CHECK-T2: ldrsh{{.*}}, #2]!
 398 ; CHECK-T2: ldrsh{{.*}}, #2]!
 399 ; CHECK-T2: str{{.*}}, #4]!
 400
 401 define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
     ; Writes C[i] = sext(A[i]) * sext(B[i]) for i in [0, N), with i16 elements
     ; in both A and B. %for.body is the main loop handling four elements per
     ; trip; %for.body.epil handles the remaining N & 3 elements.
 402 entry:
 403   %cmp9 = icmp eq i32 %N, 0
 404   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
 405
 406 for.body.preheader:                               ; preds = %entry
 407   %tmp = add i32 %N, -1
 408   %xtraiter = and i32 %N, 3 ; number of leftover elements for the epilogue
 409   %tmp1 = icmp ult i32 %tmp, 3 ; N is non-zero here, so this is true exactly when N < 4
 410   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
 411
 412 for.body.preheader.new:                           ; preds = %for.body.preheader
 413   %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of 4
 414   br label %for.body
 415
 416 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
 417   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
 418   %lcmp.mod = icmp eq i32 %xtraiter, 0
 419   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
 420
 421 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
 422   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
 423   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
 424   %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
 425   %tmp2 = load i16, i16* %arrayidx.epil, align 2
 426   %conv.epil = sext i16 %tmp2 to i32
 427   %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
 428   %tmp3 = load i16, i16* %arrayidx1.epil, align 2
 429   %conv2.epil = sext i16 %tmp3 to i32
 430   %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
 431   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
 432   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
 433   %inc.epil = add nuw i32 %i.010.epil, 1
 434   %epil.iter.sub = add i32 %epil.iter, -1
 435   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 436   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
 437
 438 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
 439   ret void
 440
 441 for.body:                                         ; preds = %for.body, %for.body.preheader.new
 442   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
 443   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
     ; Element i.
 444   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
 445   %tmp4 = load i16, i16* %arrayidx, align 2
 446   %conv = sext i16 %tmp4 to i32
 447   %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
 448   %tmp5 = load i16, i16* %arrayidx1, align 2
 449   %conv2 = sext i16 %tmp5 to i32
 450   %mul = mul nsw i32 %conv2, %conv
 451   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
 452   store i32 %mul, i32* %arrayidx3, align 4
     ; Elements i+1 .. i+3: i is a multiple of 4 (starts at 0, steps by 4), so
     ; the `or` with 1, 2 and 3 is the same as adding them.
 453   %inc = or i32 %i.010, 1
 454   %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
 455   %tmp6 = load i16, i16* %arrayidx.1, align 2
 456   %conv.1 = sext i16 %tmp6 to i32
 457   %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
 458   %tmp7 = load i16, i16* %arrayidx1.1, align 2
 459   %conv2.1 = sext i16 %tmp7 to i32
 460   %mul.1 = mul nsw i32 %conv2.1, %conv.1
 461   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
 462   store i32 %mul.1, i32* %arrayidx3.1, align 4
 463   %inc.1 = or i32 %i.010, 2
 464   %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
 465   %tmp8 = load i16, i16* %arrayidx.2, align 2
 466   %conv.2 = sext i16 %tmp8 to i32
 467   %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
 468   %tmp9 = load i16, i16* %arrayidx1.2, align 2
 469   %conv2.2 = sext i16 %tmp9 to i32
 470   %mul.2 = mul nsw i32 %conv2.2, %conv.2
 471   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
 472   store i32 %mul.2, i32* %arrayidx3.2, align 4
 473   %inc.2 = or i32 %i.010, 3
 474   %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
 475   %tmp10 = load i16, i16* %arrayidx.3, align 2
 476   %conv.3 = sext i16 %tmp10 to i32
 477   %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
 478   %tmp11 = load i16, i16* %arrayidx1.3, align 2
 479   %conv2.3 = sext i16 %tmp11 to i32
 480   %mul.3 = mul nsw i32 %conv2.3, %conv.3
 481   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
 482   store i32 %mul.3, i32* %arrayidx3.3, align 4
     ; Advance by four elements and count the remaining trips down to zero.
 483   %inc.3 = add i32 %i.010, 4
 484   %niter.nsub.3 = add i32 %niter, -4
 485   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 486   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
 487 }
 488
 489 ; CHECK-LABEL: mul_8x8_2d
 490 ; CHECK: @ %for.body4.us
 491
 492 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
 493 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 494
 495 ; DISABLED-NOT: ldr{{.*}}]!
 496 ; DISABLED-NOT: str{{.*}}]!
 497
 498 ; CHECK-T2: @ %for.body4.us.epil
 499 ; CHECK-T2: ldrb{{.*}}, #1]!
 500 ; CHECK-T2: ldr{{.*}}, #4]!
 501
 502 define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
     ; For i in [0, N) and j in [0, M): C[i][j] += zext(A[i]) * zext(B[i][j]).
     ; The row pointers B[i] and C[i] are loaded once per outer trip, while A[i]
     ; is reloaded for every element. %for.body4.us is the inner loop handling
     ; four columns per trip; %for.body4.us.epil handles the remaining M & 3
     ; columns.
 503 entry:
 504   %cmp24 = icmp eq i32 %N, 0
 505   %cmp222 = icmp eq i32 %M, 0
 506   %or.cond = or i1 %cmp24, %cmp222 ; nothing to do if either dimension is zero
 507   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 508
 509 for.cond1.preheader.us.preheader:                 ; preds = %entry
 510   %tmp = add i32 %M, -1
 511   %xtraiter = and i32 %M, 3 ; number of leftover columns for the epilogue
 512   %tmp1 = icmp ult i32 %tmp, 3 ; M is non-zero here, so this is true exactly when M < 4
 513   %unroll_iter = sub i32 %M, %xtraiter ; M rounded down to a multiple of 4
 514   %lcmp.mod = icmp eq i32 %xtraiter, 0
 515   br label %for.cond1.preheader.us
 516
 517 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
 518   %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
 519   %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
 520   %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
 521   %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
 522   %.pre = load i8*, i8** %arrayidx5.us, align 4
 523   %.pre30 = load i32*, i32** %arrayidx8.us, align 4
 524   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 525
 526 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
 527   %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
 528   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
     ; Column j.
 529   %tmp2 = load i8, i8* %arrayidx.us, align 1
 530   %conv.us = zext i8 %tmp2 to i32
 531   %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
 532   %tmp3 = load i8, i8* %arrayidx6.us, align 1
 533   %conv7.us = zext i8 %tmp3 to i32
 534   %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
 535   %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
 536   %tmp4 = load i32, i32* %arrayidx9.us, align 4
 537   %add.us = add nsw i32 %tmp4, %mul.us
 538   store i32 %add.us, i32* %arrayidx9.us, align 4
     ; Columns j+1 .. j+3: j is a multiple of 4 (starts at 0, steps by 4), so
     ; the `or` with 1, 2 and 3 is the same as adding them.
 539   %inc.us = or i32 %j.023.us, 1
 540   %tmp5 = load i8, i8* %arrayidx.us, align 1
 541   %conv.us.1 = zext i8 %tmp5 to i32
 542   %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
 543   %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
 544   %conv7.us.1 = zext i8 %tmp6 to i32
 545   %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
 546   %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
 547   %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
 548   %add.us.1 = add nsw i32 %tmp7, %mul.us.1
 549   store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
 550   %inc.us.1 = or i32 %j.023.us, 2
 551   %tmp8 = load i8, i8* %arrayidx.us, align 1
 552   %conv.us.2 = zext i8 %tmp8 to i32
 553   %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
 554   %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
 555   %conv7.us.2 = zext i8 %tmp9 to i32
 556   %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
 557   %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
 558   %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
 559   %add.us.2 = add nsw i32 %tmp10, %mul.us.2
 560   store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
 561   %inc.us.2 = or i32 %j.023.us, 3
 562   %tmp11 = load i8, i8* %arrayidx.us, align 1
 563   %conv.us.3 = zext i8 %tmp11 to i32
 564   %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
 565   %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
 566   %conv7.us.3 = zext i8 %tmp12 to i32
 567   %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
 568   %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
 569   %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
 570   %add.us.3 = add nsw i32 %tmp13, %mul.us.3
 571   store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
     ; Advance by four columns and count the remaining trips down to zero.
 572   %inc.us.3 = add i32 %j.023.us, 4
 573   %niter.nsub.3 = add i32 %niter, -4
 574   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 575   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 576
 577 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
 578   %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
 579   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 580
 581 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 582   %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 583   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 584   %tmp14 = load i8, i8* %arrayidx.us, align 1
 585   %conv.us.epil = zext i8 %tmp14 to i32
 586   %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
 587   %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
 588   %conv7.us.epil = zext i8 %tmp15 to i32
 589   %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
 590   %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
 591   %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
 592   %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
 593   store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
 594   %inc.us.epil = add nuw i32 %j.023.us.epil, 1
 595   %epil.iter.sub = add i32 %epil.iter, -1
 596   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 597   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 598
 599 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 600   %inc11.us = add nuw i32 %i.025.us, 1
 601   %exitcond28 = icmp eq i32 %inc11.us, %N
 602   br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
 603
 604 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 605   ret void
 606 }
 607
 608 ; CHECK-LABEL: mul_16x16_2d
 609 ; CHECK: @ %for.body4.us
 610
 611 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
 612 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 613
 614 ; DISABLED-NOT: ldr{{.*}}]!
 615 ; DISABLED-NOT: str{{.*}}]!
 616
 617 ; CHECK-T2: @ %for.body4.us.epil
 618 ; CHECK-T2: ldrsh{{.*}}, #2]!
 619 ; CHECK-T2: ldr{{.*}}, #4]!
 620
 621 define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
     ; Nested loop computing C[i][j] += sext(A[i]) * sext(B[i][j]) for i in [0, N)
     ; and j in [0, M). A and B[i] hold i16 elements, C[i] holds i32 elements.
     ; The inner loop is present in 4x-unrolled form (%for.body4.us) followed by a
     ; one-element-per-iteration remainder loop (%for.body4.us.epil).
 622 entry:
       ; Nothing to do when either trip count is zero.
 623   %cmp24 = icmp eq i32 %N, 0
 624   %cmp222 = icmp eq i32 %M, 0
 625   %or.cond = or i1 %cmp24, %cmp222
 626   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 627
 628 for.cond1.preheader.us.preheader:                 ; preds = %entry
       ; Loop-invariant values shared by every outer iteration:
       ;   %xtraiter    = M & 3, the trip count of the remainder loop
       ;   %tmp1        = (M - 1) <u 3; M is non-zero here, so this means M < 4 and
       ;                  the unrolled loop must be skipped
       ;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
       ;   %lcmp.mod    = true when there is no remainder to process
 629   %tmp = add i32 %M, -1
 630   %xtraiter = and i32 %M, 3
 631   %tmp1 = icmp ult i32 %tmp, 3
 632   %unroll_iter = sub i32 %M, %xtraiter
 633   %lcmp.mod = icmp eq i32 %xtraiter, 0
 634   br label %for.cond1.preheader.us
 635
 636 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
       ; Outer loop header: load A[i] (sign-extended), the row pointer B[i] and the
       ; row pointer C[i] once per outer iteration.
 637   %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
 638   %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
 639   %tmp2 = load i16, i16* %arrayidx.us, align 2
 640   %conv.us = sext i16 %tmp2 to i32
 641   %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
 642   %tmp3 = load i16*, i16** %arrayidx5.us, align 4
 643   %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
 644   %tmp4 = load i32*, i32** %arrayidx8.us, align 4
 645   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 646
 647 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
       ; Unrolled inner loop: four elements (j, j|1, j|2, j|3) per iteration.
       ; %j.023.us starts at 0 and advances by 4, so it is always a multiple of 4
       ; and the `or` with 1, 2, 3 addresses the next three elements.
       ; %niter counts down from %unroll_iter in steps of 4.
 648   %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
 649   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
       ; Element j: C[i][j] += B[i][j] * A[i]
 650   %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
 651   %tmp5 = load i16, i16* %arrayidx6.us, align 2
 652   %conv7.us = sext i16 %tmp5 to i32
 653   %mul.us = mul nsw i32 %conv7.us, %conv.us
 654   %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
 655   %tmp6 = load i32, i32* %arrayidx9.us, align 4
 656   %add.us = add nsw i32 %tmp6, %mul.us
 657   store i32 %add.us, i32* %arrayidx9.us, align 4
       ; Element j|1
 658   %inc.us = or i32 %j.023.us, 1
 659   %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
 660   %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
 661   %conv7.us.1 = sext i16 %tmp7 to i32
 662   %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
 663   %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
 664   %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
 665   %add.us.1 = add nsw i32 %tmp8, %mul.us.1
 666   store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
       ; Element j|2
 667   %inc.us.1 = or i32 %j.023.us, 2
 668   %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
 669   %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
 670   %conv7.us.2 = sext i16 %tmp9 to i32
 671   %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
 672   %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
 673   %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
 674   %add.us.2 = add nsw i32 %tmp10, %mul.us.2
 675   store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
       ; Element j|3
 676   %inc.us.2 = or i32 %j.023.us, 3
 677   %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
 678   %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
 679   %conv7.us.3 = sext i16 %tmp11 to i32
 680   %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
 681   %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
 682   %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
 683   %add.us.3 = add nsw i32 %tmp12, %mul.us.3
 684   store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
       ; Advance j by 4 and leave once %niter reaches zero.
 685   %inc.us.3 = add i32 %j.023.us, 4
 686   %niter.nsub.3 = add i32 %niter, -4
 687   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 688   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 689
 690 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
       ; %j.023.us.unr is the first index not handled by the unrolled loop (0 when
       ; that loop was skipped). Skip the remainder loop when M is a multiple of 4.
 691   %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
 692   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 693
 694 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Remainder loop: one element per iteration, %epil.iter counts down from
       ; %xtraiter to zero.
 695   %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 696   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 697   %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
 698   %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
 699   %conv7.us.epil = sext i16 %tmp13 to i32
 700   %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
 701   %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
 702   %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
 703   %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
 704   store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
 705   %inc.us.epil = add nuw i32 %j.023.us.epil, 1
 706   %epil.iter.sub = add i32 %epil.iter, -1
 707   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 708   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 709
 710 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Outer loop latch: advance i and exit once it reaches N.
 711   %inc11.us = add nuw i32 %i.025.us, 1
 712   %exitcond28 = icmp eq i32 %inc11.us, %N
 713   br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
 714
 715 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 716   ret void
 717 }
 718
 719 ; CHECK-LABEL: mac_8x8_2d
 720 ; CHECK: @ %for.body4.us
 721
 722 ; TODO: Both input arrays could use pre-indexed loads.
 723 ; TODO: pre-indexed stores.
 724 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 725 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 726 ; CHECK-DEFAULT-NOT: str{{.*}}]!
 727
 728 ; TODO: Increased complexity shouldn't prevent indexed accesses.
 729 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 730 ; CHECK-COMPLEX-NOT: str{{.*}}]!
 731
 732 ; DISABLED-NOT: ldr{{.*}}]!
 733 ; DISABLED-NOT: str{{.*}}]!
 734
 735 ; CHECK-T2: @ %for.body4.us.epil
 736 ; CHECK-T2: ldrb{{.*}}, #1]!
 737
 738 define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
     ; Nested multiply-accumulate: for i in [0, N) and j in [0, M),
     ; C[i] += zext(A[i]) * zext(B[i][j]). A and B[i] hold i8 elements, C holds i32.
     ; In this form A[i] is reloaded before every multiply and the running sum is
     ; stored back to C[i] after every add. The inner loop is present in
     ; 4x-unrolled form (%for.body4.us) plus a remainder loop (%for.body4.us.epil).
 739 entry:
       ; Nothing to do when either trip count is zero.
 740   %cmp22 = icmp eq i32 %N, 0
 741   %cmp220 = icmp eq i32 %M, 0
 742   %or.cond = or i1 %cmp22, %cmp220
 743   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 744
 745 for.cond1.preheader.us.preheader:                 ; preds = %entry
       ; Loop-invariant values shared by every outer iteration:
       ;   %xtraiter    = M & 3, the trip count of the remainder loop
       ;   %tmp1        = (M - 1) <u 3; M is non-zero here, so this means M < 4 and
       ;                  the unrolled loop must be skipped
       ;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
       ;   %lcmp.mod    = true when there is no remainder to process
 746   %tmp = add i32 %M, -1
 747   %xtraiter = and i32 %M, 3
 748   %tmp1 = icmp ult i32 %tmp, 3
 749   %unroll_iter = sub i32 %M, %xtraiter
 750   %lcmp.mod = icmp eq i32 %xtraiter, 0
 751   br label %for.cond1.preheader.us
 752
 753 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
       ; Outer loop header: form &A[i], &B[i] and &C[i], then load the row pointer
       ; B[i] (%.pre) and the initial accumulator value C[i] (%.pre28).
 754   %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
 755   %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
 756   %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
 757   %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
 758   %.pre = load i8*, i8** %arrayidx5.us, align 4
 759   %.pre28 = load i32, i32* %arrayidx8.us, align 4
 760   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 761
 762 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
       ; Unrolled inner loop: four elements (j, j|1, j|2, j|3) per iteration.
       ; %tmp2 carries the running sum between iterations; %j.021.us starts at 0
       ; and advances by 4, so the `or` with 1, 2, 3 addresses the next elements.
 763   %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
 764   %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
 765   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
       ; Element j: sum += A[i] * B[i][j]; C[i] = sum
 766   %tmp3 = load i8, i8* %arrayidx.us, align 1
 767   %conv.us = zext i8 %tmp3 to i32
 768   %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
 769   %tmp4 = load i8, i8* %arrayidx6.us, align 1
 770   %conv7.us = zext i8 %tmp4 to i32
 771   %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
 772   %add.us = add nsw i32 %mul.us, %tmp2
 773   store i32 %add.us, i32* %arrayidx8.us, align 4
       ; Element j|1
 774   %inc.us = or i32 %j.021.us, 1
 775   %tmp5 = load i8, i8* %arrayidx.us, align 1
 776   %conv.us.1 = zext i8 %tmp5 to i32
 777   %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
 778   %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
 779   %conv7.us.1 = zext i8 %tmp6 to i32
 780   %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
 781   %add.us.1 = add nsw i32 %mul.us.1, %add.us
 782   store i32 %add.us.1, i32* %arrayidx8.us, align 4
       ; Element j|2
 783   %inc.us.1 = or i32 %j.021.us, 2
 784   %tmp7 = load i8, i8* %arrayidx.us, align 1
 785   %conv.us.2 = zext i8 %tmp7 to i32
 786   %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
 787   %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
 788   %conv7.us.2 = zext i8 %tmp8 to i32
 789   %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
 790   %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
 791   store i32 %add.us.2, i32* %arrayidx8.us, align 4
       ; Element j|3
 792   %inc.us.2 = or i32 %j.021.us, 3
 793   %tmp9 = load i8, i8* %arrayidx.us, align 1
 794   %conv.us.3 = zext i8 %tmp9 to i32
 795   %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
 796   %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
 797   %conv7.us.3 = zext i8 %tmp10 to i32
 798   %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
 799   %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
 800   store i32 %add.us.3, i32* %arrayidx8.us, align 4
       ; Advance j by 4 and leave once %niter reaches zero.
 801   %inc.us.3 = add i32 %j.021.us, 4
 802   %niter.nsub.3 = add i32 %niter, -4
 803   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 804   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 805
 806 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
       ; Merge the running sum and next index from either the unrolled loop or,
       ; when it was skipped, the outer loop header. Skip the remainder loop when
       ; M is a multiple of 4.
 807   %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
 808   %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
 809   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 810
 811 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Remainder loop: one element per iteration, %epil.iter counts down from
       ; %xtraiter to zero.
 812   %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 813   %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 814   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 815   %tmp12 = load i8, i8* %arrayidx.us, align 1
 816   %conv.us.epil = zext i8 %tmp12 to i32
 817   %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
 818   %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
 819   %conv7.us.epil = zext i8 %tmp13 to i32
 820   %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
 821   %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
 822   store i32 %add.us.epil, i32* %arrayidx8.us, align 4
 823   %inc.us.epil = add nuw i32 %j.021.us.epil, 1
 824   %epil.iter.sub = add i32 %epil.iter, -1
 825   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 826   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 827
 828 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Outer loop latch: advance i and exit once it reaches N.
 829   %inc10.us = add nuw i32 %i.023.us, 1
 830   %exitcond26 = icmp eq i32 %inc10.us, %N
 831   br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
 832
 833 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 834   ret void
 835 }
 836
 837 ; CHECK-LABEL: mac_16x16_2d
 838 ; CHECK: @ %for.body4.us
 839
 840 ; TODO: pre-indexed loads for both input arrays.
 841 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 842 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 843
 844 ; TODO: increased complexity should lead to better codegen.
 845 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 846
 847 ; DISABLED-NOT: ldr{{.*}}]!
 848
 849 ; CHECK-T2: @ %for.body4.us.epil
 850 ; CHECK-T2: ldrsh{{.*}}, #2]!
 851
 852 define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
     ; Nested multiply-accumulate: for i in [0, N) and j in [0, M),
     ; C[i] += sext(A[i]) * sext(B[i][j]). A and B[i] hold i16 elements, C holds
     ; i32. Here the accumulator is kept in an SSA value across the inner loop:
     ; C[i] is loaded once before it and stored once after it. The inner loop is
     ; present in 4x-unrolled form (%for.body4.us) plus a remainder loop
     ; (%for.body4.us.epil).
 853 entry:
       ; Nothing to do when either trip count is zero.
 854   %cmp23 = icmp eq i32 %N, 0
 855   %cmp220 = icmp eq i32 %M, 0
 856   %or.cond = or i1 %cmp23, %cmp220
 857   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 858
 859 for.cond1.preheader.us.preheader:                 ; preds = %entry
       ; Loop-invariant values shared by every outer iteration:
       ;   %xtraiter    = M & 3, the trip count of the remainder loop
       ;   %tmp1        = (M - 1) <u 3; M is non-zero here, so this means M < 4 and
       ;                  the unrolled loop must be skipped
       ;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
       ;   %lcmp.mod    = true when there is no remainder to process
 860   %tmp = add i32 %M, -1
 861   %xtraiter = and i32 %M, 3
 862   %tmp1 = icmp ult i32 %tmp, 3
 863   %unroll_iter = sub i32 %M, %xtraiter
 864   %lcmp.mod = icmp eq i32 %xtraiter, 0
 865   br label %for.cond1.preheader.us
 866
 867 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
       ; Outer loop header: load A[i] (sign-extended), the row pointer B[i], and
       ; the initial accumulator value C[i].
 868   %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
 869   %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
 870   %tmp2 = load i16, i16* %arrayidx.us, align 2
 871   %conv.us = sext i16 %tmp2 to i32
 872   %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
 873   %tmp3 = load i16*, i16** %arrayidx5.us, align 4
 874   %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
 875   %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
 876   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 877
 878 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
       ; Unrolled inner loop: four elements (j, j|1, j|2, j|3) per iteration, with
       ; no stores. %add22.us carries the running sum; %j.021.us starts at 0 and
       ; advances by 4, so the `or` with 1, 2, 3 addresses the next elements.
 879   %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
 880   %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
 881   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
       ; Element j: sum += B[i][j] * A[i]
 882   %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
 883   %tmp4 = load i16, i16* %arrayidx6.us, align 2
 884   %conv7.us = sext i16 %tmp4 to i32
 885   %mul.us = mul nsw i32 %conv7.us, %conv.us
 886   %add.us = add nsw i32 %mul.us, %add22.us
       ; Element j|1
 887   %inc.us = or i32 %j.021.us, 1
 888   %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
 889   %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
 890   %conv7.us.1 = sext i16 %tmp5 to i32
 891   %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
 892   %add.us.1 = add nsw i32 %mul.us.1, %add.us
       ; Element j|2
 893   %inc.us.1 = or i32 %j.021.us, 2
 894   %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
 895   %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
 896   %conv7.us.2 = sext i16 %tmp6 to i32
 897   %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
 898   %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
       ; Element j|3
 899   %inc.us.2 = or i32 %j.021.us, 3
 900   %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
 901   %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
 902   %conv7.us.3 = sext i16 %tmp7 to i32
 903   %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
 904   %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
       ; Advance j by 4 and leave once %niter reaches zero.
 905   %inc.us.3 = add i32 %j.021.us, 4
 906   %niter.nsub.3 = add i32 %niter, -4
 907   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 908   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 909
 910 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
       ; %add.us.lcssa.ph is the value stored when the remainder loop is skipped.
       ; Its `undef` input is on the edge that bypasses the unrolled loop; that
       ; edge is taken only when M is 1..3, in which case %xtraiter is non-zero
       ; and the remainder loop runs, so the undef value is never the one stored.
 911   %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
 912   %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
 913   %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
 914   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 915
 916 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Remainder loop: one element per iteration, %epil.iter counts down from
       ; %xtraiter to zero.
 917   %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 918   %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 919   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 920   %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
 921   %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
 922   %conv7.us.epil = sext i16 %tmp8 to i32
 923   %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
 924   %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
 925   %inc.us.epil = add nuw i32 %j.021.us.epil, 1
 926   %epil.iter.sub = add i32 %epil.iter, -1
 927   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 928   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 929
 930 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
       ; Outer loop latch: write the final sum back to C[i], advance i and exit
       ; once it reaches N.
 931   %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
 932   store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
 933   %inc10.us = add nuw i32 %i.024.us, 1
 934   %exitcond27 = icmp eq i32 %inc10.us, %N
 935   br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
 936
 937 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 938   ret void
 939 }
 940
 941 ; CHECK-LABEL: mul32x32_backwards
 942 ; CHECK: @ %for.body
 943
 944 ; TODO: post increments for decreasing addresses
 945 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 946 ; CHECK-DEFAULT-NOT: str{{.*}}]!
 947
 948 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 949 ; CHECK-COMPLEX-NOT: str{{.*}}]!
 950
 951 define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
     ; Element-wise i32 multiply walking the arrays from the last element down:
     ; a[i] = b[i] * c[i] for i = N-1, N-2, ..., 0. The loop is present as a
     ; prologue loop (%for.body.prol) that handles N & 3 elements one at a time,
     ; followed by a 4x-unrolled main loop (%for.body).
 952 entry:
       ; %i.08 = N - 1 is the starting index; return when it is negative (signed).
 953   %i.08 = add i32 %N, -1
 954   %cmp9 = icmp sgt i32 %i.08, -1
 955   br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
 956
 957 for.body.preheader:                               ; preds = %entry
       ; %xtraiter = N & 3 is the prologue trip count; skip the prologue when zero.
 958   %xtraiter = and i32 %N, 3
 959   %lcmp.mod = icmp eq i32 %xtraiter, 0
 960   br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
 961
 962 for.body.prol:                                    ; preds = %for.body.prol, %for.body.preheader
       ; Prologue loop: one element per iteration, index decreasing by 1,
       ; %prol.iter counting down from %xtraiter to zero.
 963   %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
 964   %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
 965   %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
 966   %tmp = load i32, i32* %arrayidx.prol, align 4
 967   %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
 968   %tmp1 = load i32, i32* %arrayidx1.prol, align 4
 969   %mul.prol = mul nsw i32 %tmp1, %tmp
 970   %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
 971   store i32 %mul.prol, i32* %arrayidx2.prol, align 4
 972   %i.0.prol = add i32 %i.010.prol, -1
 973   %prol.iter.sub = add i32 %prol.iter, -1
 974   %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
 975   br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
 976
 977 for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader
       ; %i.010.unr is the index the main loop starts from. When (N - 1) <u 3 the
       ; total count is below 4, so there is nothing left for the unrolled loop.
 978   %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
 979   %tmp2 = icmp ult i32 %i.08, 3
 980   br i1 %tmp2, label %for.cond.cleanup, label %for.body
 981
 982 for.cond.cleanup:                                 ; preds = %for.body, %for.body.prol.loopexit, %entry
 983   ret void
 984
 985 for.body:                                         ; preds = %for.body, %for.body.prol.loopexit
       ; Unrolled main loop: processes indices i, i-1, i-2, i-3 per iteration and
       ; then steps i down by 4; continues while the new index is still > -1.
 986   %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
       ; Index i
 987   %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
 988   %tmp3 = load i32, i32* %arrayidx, align 4
 989   %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
 990   %tmp4 = load i32, i32* %arrayidx1, align 4
 991   %mul = mul nsw i32 %tmp4, %tmp3
 992   %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
 993   store i32 %mul, i32* %arrayidx2, align 4
       ; Index i - 1
 994   %i.0 = add i32 %i.010, -1
 995   %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
 996   %tmp5 = load i32, i32* %arrayidx.1, align 4
 997   %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
 998   %tmp6 = load i32, i32* %arrayidx1.1, align 4
 999   %mul.1 = mul nsw i32 %tmp6, %tmp5
1000   %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
1001   store i32 %mul.1, i32* %arrayidx2.1, align 4
       ; Index i - 2
1002   %i.0.1 = add i32 %i.010, -2
1003   %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
1004   %tmp7 = load i32, i32* %arrayidx.2, align 4
1005   %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
1006   %tmp8 = load i32, i32* %arrayidx1.2, align 4
1007   %mul.2 = mul nsw i32 %tmp8, %tmp7
1008   %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
1009   store i32 %mul.2, i32* %arrayidx2.2, align 4
       ; Index i - 3
1010   %i.0.2 = add i32 %i.010, -3
1011   %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
1012   %tmp9 = load i32, i32* %arrayidx.3, align 4
1013   %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
1014   %tmp10 = load i32, i32* %arrayidx1.3, align 4
1015   %mul.3 = mul nsw i32 %tmp10, %tmp9
1016   %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
1017   store i32 %mul.3, i32* %arrayidx2.3, align 4
1018   %i.0.3 = add i32 %i.010, -4
1019   %cmp.3 = icmp sgt i32 %i.0.3, -1
1020   br i1 %cmp.3, label %for.body, label %for.cond.cleanup
1021 }
1022
1023 ; CHECK-LABEL: mul32x32_forwards
1024 ; CHECK: @ %for.body
1025
1026 ; TODO: It would be good if the complexity limit didn't have to be increased to
1027 ; enable the pre-indexed accesses.
1028
1029 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
1030 ; CHECK-DEFAULT-NOT: str{{.*}}]!
1031
1032 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1033 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1034 ; CHECK-COMPLEX: str{{.*}}, #16]!
1035
1036 ; CHECK-T2: @ %for.body.epil
1037 ; CHECK-T2: ldr{{.*}}, #4]!
1038 ; CHECK-T2: ldr{{.*}}, #4]!
1039 ; CHECK-T2: str{{.*}}, #4]!
1040
1041 define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
     ; Element-wise i32 multiply walking the arrays upwards:
     ; a[i] = b[i] * c[i] for i in [0, N). The loop is present as a 4x-unrolled
     ; main loop (%for.body) followed by a remainder loop (%for.body.epil) that
     ; handles the last N & 3 elements one at a time.
1042 entry:
       ; Nothing to do for an empty range.
1043   %cmp8 = icmp eq i32 %N, 0
1044   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1045
1046 for.body.preheader:                               ; preds = %entry
       ; %xtraiter = N & 3 is the remainder trip count. N is non-zero here, so
       ; (N - 1) <u 3 means N < 4 and the unrolled loop is skipped.
1047   %tmp = add i32 %N, -1
1048   %xtraiter = and i32 %N, 3
1049   %tmp1 = icmp ult i32 %tmp, 3
1050   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1051
1052 for.body.preheader.new:                           ; preds = %for.body.preheader
       ; Element count for the unrolled loop: N rounded down to a multiple of 4.
1053   %unroll_iter = sub i32 %N, %xtraiter
1054   br label %for.body
1055
1056 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
       ; %i.09.unr is the first index not handled by the unrolled loop (0 when
       ; that loop was skipped). Skip the remainder loop when N is a multiple of 4.
1057   %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1058   %lcmp.mod = icmp eq i32 %xtraiter, 0
1059   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1060
1061 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
       ; Remainder loop: one element per iteration, %epil.iter counts down from
       ; %xtraiter to zero.
1062   %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1063   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1064   %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1065   %tmp2 = load i32, i32* %arrayidx.epil, align 4
1066   %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
1067   %tmp3 = load i32, i32* %arrayidx1.epil, align 4
1068   %mul.epil = mul nsw i32 %tmp3, %tmp2
1069   %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1070   store i32 %mul.epil, i32* %arrayidx2.epil, align 4
1071   %inc.epil = add nuw nsw i32 %i.09.epil, 1
1072   %epil.iter.sub = add i32 %epil.iter, -1
1073   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1074   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1075
1076 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1077   ret void
1078
1079 for.body:                                         ; preds = %for.body, %for.body.preheader.new
       ; Unrolled main loop: four elements (i, i|1, i|2, i|3) per iteration.
       ; %i.09 starts at 0 and advances by 4, so it is always a multiple of 4 and
       ; the `or` with 1, 2, 3 addresses the next three elements. %niter counts
       ; down from %unroll_iter in steps of 4.
1080   %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1081   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
       ; Element i
1082   %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
1083   %tmp4 = load i32, i32* %arrayidx, align 4
1084   %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
1085   %tmp5 = load i32, i32* %arrayidx1, align 4
1086   %mul = mul nsw i32 %tmp5, %tmp4
1087   %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
1088   store i32 %mul, i32* %arrayidx2, align 4
       ; Element i|1
1089   %inc = or i32 %i.09, 1
1090   %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1091   %tmp6 = load i32, i32* %arrayidx.1, align 4
1092   %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
1093   %tmp7 = load i32, i32* %arrayidx1.1, align 4
1094   %mul.1 = mul nsw i32 %tmp7, %tmp6
1095   %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1096   store i32 %mul.1, i32* %arrayidx2.1, align 4
       ; Element i|2
1097   %inc.1 = or i32 %i.09, 2
1098   %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1099   %tmp8 = load i32, i32* %arrayidx.2, align 4
1100   %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
1101   %tmp9 = load i32, i32* %arrayidx1.2, align 4
1102   %mul.2 = mul nsw i32 %tmp9, %tmp8
1103   %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1104   store i32 %mul.2, i32* %arrayidx2.2, align 4
       ; Element i|3
1105   %inc.2 = or i32 %i.09, 3
1106   %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1107   %tmp10 = load i32, i32* %arrayidx.3, align 4
1108   %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
1109   %tmp11 = load i32, i32* %arrayidx1.3, align 4
1110   %mul.3 = mul nsw i32 %tmp11, %tmp10
1111   %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1112   store i32 %mul.3, i32* %arrayidx2.3, align 4
       ; Advance i by 4 and leave once %niter reaches zero.
1113   %inc.3 = add nuw nsw i32 %i.09, 4
1114   %niter.nsub.3 = add i32 %niter, -4
1115   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1116   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1117 }