/* { dg-do compile { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-mtune=cortex-m55" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
10 /* Using a >=1 condition. */
11 void test1 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
15 mve_pred16_t p
= vctp32q (n
);
16 int32x4_t va
= vldrwq_z_s32 (a
, p
);
17 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
18 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
19 vstrwq_p_s32 (c
, vc
, p
);
30 ** vldrw.32 q[0-9]+, \[r0\], #16
31 ** vldrw.32 q[0-9]+, \[r1\], #16
32 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
33 ** vstrw.32 \1, \[r2\], #16
38 /* Test a for loop format of decrementing to zero */
39 int32_t a
[] = {0, 1, 2, 3, 4, 5, 6, 7};
40 void test2 (int32_t *b
, int num_elems
)
42 for (int i
= num_elems
; i
> 0; i
-= 4)
44 mve_pred16_t p
= vctp32q (i
);
45 int32x4_t va
= vldrwq_z_s32 (&(a
[i
]), p
);
46 vstrwq_p_s32 (b
+ i
, va
, p
);
54 ** vldrw.32 (q[0-9]+), \[r3\], #-16
55 ** vstrw.32 \1, \[r0\], #-16
60 /* Iteration counter counting up to num_iter. */
61 void test3 (uint8_t *a
, uint8_t *b
, uint8_t *c
, unsigned n
)
63 int num_iter
= (n
+ 15)/16;
64 for (int i
= 0; i
< num_iter
; i
++)
66 mve_pred16_t p
= vctp8q (n
);
67 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
68 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
69 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
70 vstrbq_p_u8 (c
, vc
, p
);
83 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
84 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
86 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
87 ** vstrb.8 \3, \[(r[0-9]+|ip)\]
93 /* Iteration counter counting down from num_iter. */
94 void test4 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
96 int num_iter
= (n
+ 15)/16;
97 for (int i
= num_iter
; i
> 0; i
--)
99 mve_pred16_t p
= vctp8q (n
);
100 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
101 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
102 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
103 vstrbq_p_u8 (c
, vc
, p
);
115 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
116 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
118 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
119 ** vstrb.8 \3, \[(r[0-9]+|ip)\]
125 /* Using an unpredicated arithmetic instruction within the loop. */
126 void test5 (uint8_t *a
, uint8_t *b
, uint8_t *c
, uint8_t *d
, int n
)
130 mve_pred16_t p
= vctp8q (n
);
131 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
132 uint8x16_t vb
= vldrbq_u8 (b
);
133 /* Is affected by implicit predication, because vb also
134 came from an unpredicated load, but there is no functional
135 problem, because the result is used in a predicated store. */
136 uint8x16_t vc
= vaddq_u8 (va
, vb
);
137 uint8x16_t vd
= vaddq_x_u8 (va
, vb
, p
);
138 vstrbq_p_u8 (c
, vc
, p
);
139 vstrbq_p_u8 (d
, vd
, p
);
150 ** dlstp.8 lr, r[0-9]+
152 ** vldrb.8 q[0-9]+, \[r1\]
153 ** vldrb.8 q[0-9]+, \[r2\]
155 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
157 ** vstrb.8 \1, \[r2\]
158 ** vstrb.8 \1, \[r3\]
163 /* Using a different VPR value for one instruction in the loop. */
164 void test6 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
168 mve_pred16_t p
= vctp32q (n
);
169 int32x4_t va
= vldrwq_z_s32 (a
, p
);
170 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
171 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
172 vstrwq_p_s32 (c
, vc
, p
);
184 ** vldrw.32 q[0-9]+, \[r0\], #16
186 ** vldrwt.32 q[0-9]+, \[r1\], #16
187 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
188 ** vstrw.32 \1, \[r2\], #16
193 /* Generating and using another VPR value in the loop, with a vctp.
194 The doloop logic will always try to do the transform on the first
195 vctp it encounters, so this is still expected to work. */
196 void test7 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, int g
)
200 mve_pred16_t p
= vctp32q (n
);
201 int32x4_t va
= vldrwq_z_s32 (a
, p
);
202 mve_pred16_t p1
= vctp32q (g
);
203 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
204 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
205 vstrwq_p_s32 (c
, vc
, p
);
216 ** vldrw.32 q[0-9]+, \[r0\], #16
218 ** vldrwt.32 q[0-9]+, \[r1\], #16
219 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
220 ** vstrw.32 \1, \[r2\], #16
225 /* Generating and using a different VPR value in the loop, with a vctp,
226 but this time the p1 will also change in every loop (still fine) */
227 void test8 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, int g
)
231 mve_pred16_t p
= vctp32q (n
);
232 int32x4_t va
= vldrwq_z_s32 (a
, p
);
233 mve_pred16_t p1
= vctp32q (g
);
234 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
235 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
236 vstrwq_p_s32 (c
, vc
, p
);
249 ** vldrw.32 q[0-9]+, \[r0\], #16
252 ** vldrwt.32 q[0-9]+, \[r1\], #16
254 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
255 ** vstrw.32 \1, \[r2\], #16
260 /* Generating and using a different VPR value in the loop, with a vctp_m
261 that is independent of the loop vctp VPR. */
262 void test9 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
266 mve_pred16_t p
= vctp32q (n
);
267 int32x4_t va
= vldrwq_z_s32 (a
, p
);
268 mve_pred16_t p2
= vctp32q_m (n
, p1
);
269 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
270 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
271 vstrwq_p_s32 (c
, vc
, p
);
283 ** vldrw.32 q[0-9]+, \[r0\], #16
284 ** vmsr p0, (r[0-9]+) @ movhi
287 ** vmrs (r[0-9]+), p0 @ movhi
288 ** vmsr p0, \1 @ movhi
290 ** vldrwt.32 q[0-9]+, \[r1\], #16
291 ** vmsr p0, \2 @ movhi
293 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
295 ** vstrw.32 \3, \[r2\], #16
300 /* Generating and using a different VPR value in the loop,
301 with a vctp_m that is tied to the base vctp VPR. This
302 is still fine, because the vctp_m will be transformed
303 into a vctp and be implicitly predicated. */
304 void test10 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
308 mve_pred16_t p
= vctp32q (n
);
309 int32x4_t va
= vldrwq_z_s32 (a
, p
);
310 mve_pred16_t p1
= vctp32q_m (n
, p
);
311 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
312 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p1
);
313 vstrwq_p_s32 (c
, vc
, p
);
321 We don't need that extra vctp in the loop, but we currently do not optimize
322 it away, however, it is not wrong to use it...
329 ** vldrw.32 q[0-9]+, \[r0\], #16
332 ** vldrwt.32 q[0-9]+, \[r1\], #16
334 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
335 ** vstrw.32 \1, \[r2\], #16
340 /* Generating and using a different VPR value in the loop, with a vcmp. */
341 void test11 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
345 mve_pred16_t p
= vctp32q (n
);
346 int32x4_t va
= vldrwq_z_s32 (a
, p
);
347 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
348 mve_pred16_t p1
= vcmpeqq_s32 (va
, vb
);
349 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p1
);
350 vstrwq_p_s32 (c
, vc
, p
);
362 ** vldrw.32 q[0-9]+, \[r0\], #16
363 ** vldrw.32 q[0-9]+, \[r1\], #16
364 ** vcmp.i32 eq, q[0-9]+, q[0-9]+
366 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
367 ** vstrw.32 \1, \[r2\], #16
372 /* Generating and using a different VPR value in the loop, with a vcmp_m. */
373 void test12 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
377 mve_pred16_t p
= vctp32q (n
);
378 int32x4_t va
= vldrwq_z_s32 (a
, p
);
379 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
380 mve_pred16_t p2
= vcmpeqq_m_s32 (va
, vb
, p1
);
381 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
382 vstrwq_p_s32 (c
, vc
, p
);
394 ** vldrw.32 q[0-9]+, \[r0\], #16
395 ** vldrw.32 q[0-9]+, \[r1\], #16
396 ** vmsr p0, (r[0-9]+|ip) @ movhi
398 ** vcmpt.i32 eq, q[0-9]+, q[0-9]+
400 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
401 ** vstrw.32 \2, \[r2\], #16
406 /* Generating and using a different VPR value in the loop, with a vcmp_m
407 that is tied to the base vctp VPR (same as above, this will be turned
408 into a vcmp and be implicitly predicated). */
409 void test13 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
413 mve_pred16_t p
= vctp32q (n
);
414 int32x4_t va
= vldrwq_z_s32 (a
, p
);
415 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
416 mve_pred16_t p2
= vcmpeqq_m_s32 (va
, vb
, p
);
417 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
418 vstrwq_p_s32 (c
, vc
, p
);
430 ** vldrw.32 q[0-9]+, \[r0\], #16
431 ** vldrw.32 q[0-9]+, \[r1\], #16
432 ** vcmp.i32 eq, q[0-9]+, q[0-9]+
434 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
435 ** vstrw.32 \1, \[r2\], #16
440 /* Similar to test27 in dsltp-invalid-asm.c, but use a predicated load to make
441 it safe to implicitly predicate the vaddv. */
442 void test14 (int32_t *a
, int32_t *c
, int n
)
447 mve_pred16_t p
= vctp32q (n
);
448 int32x4_t va
= vldrwq_z_s32 (a
, p
);
449 res
+= vaddvq_s32 (va
);
450 int32x4_t vc
= vdupq_n_s32 (res
);
451 vstrwq_p_s32 (c
, vc
, p
);
461 ** vldrw.32 (q[0-9]+), \[r0\], #16
462 ** vaddv.s32 (r[0-9]+|ip), \1
463 ** add (r[0-9]+|ip), \3, \2
464 ** vdup.32 (q[0-9]+), \3
465 ** vstrw.32 \4, \[r1\]
470 uint8_t test15 (uint8_t *a
, uint8_t *b
, int n
)
473 uint8x16_t vc
= vdupq_n_u8 (0);
476 mve_pred16_t p
= vctp8q (n
);
477 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
478 uint8x16_t vb
= vldrbq_u8 (b
);
479 vc
= vaddq_m (vc
, va
, vc
, p
);
480 res
= vgetq_lane (vc
, 5);
494 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
496 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
499 ** vmov.u8 r[0-9]+, \2\[5\]
503 uint8_t test16 (uint8_t *a
, uint8_t *b
, int n
)
506 uint8x16_t vc
= vdupq_n_u8 (0);
509 mve_pred16_t p
= vctp8q (n
);
510 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
511 uint8x16_t vb
= vldrbq_u8 (b
);
513 vc
= vaddq_m (vc
, va
, vc
, p
);
514 res
= vgetq_lane (vc
, 5);
528 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
530 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
531 ** vadd.i8 \2, q[0-9]+, q[0-9]+
533 ** vmov.u8 r[0-9]+, \2\[5\]
539 /* Using an across-vector unpredicated instruction in a valid way.
540 This tests that "vc" has correctly masked the risky "vb". */
541 uint16_t test18 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
543 uint16x8_t vb
= vldrhq_u16 (b
);
547 mve_pred16_t p
= vctp16q (n
);
548 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
549 uint16x8_t vc
= vaddq_m_u16 (va
, va
, vb
, p
);
550 res
+= vaddvq_u16 (vc
);
563 ** vldrh.16 (q[0-9]+), \[r2\], #16
564 ** vadd.i16 \1, q[0-9]+, q[0-9]+
565 ** vaddv.u16 (r[0-9]+|ip), \1
566 ** add (r[0-9]+|ip), \3, \2
572 /* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. */
573 uint16_t test19 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
575 uint16x8_t vb
= vldrhq_u16 (b
);
579 mve_pred16_t p
= vctp16q (n
);
580 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
581 uint16x8_t vc
= vaddq_m_u16 (va
, va
, vb
, p
);
582 res
= vaddvaq_u16 (res
, vc
);
595 ** vldrh.16 (q[0-9]+), \[r2\], #16
596 ** vadd.i16 \1, q[0-9]+, q[0-9]+
597 ** vaddva.u16 (r[0-9]+|ip), \1
604 /* Using an across-vector predicated instruction in a valid way. */
605 uint16_t test20 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
610 mve_pred16_t p
= vctp16q (n
);
611 uint16x8_t va
= vldrhq_u16 (a
);
612 res
= vaddvaq_p_u16 (res
, va
, p
);
621 /* The uxth could be moved outside the loop. */
626 ** vldrh.16 (q[0-9]+), \[r2\], #16
627 ** vaddva.u16 (r[0-9]+|ip), \1
633 /* Using an across-vector predicated instruction in a valid way. */
634 uint16_t test21 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
639 mve_pred16_t p
= vctp16q (n
);
640 uint16x8_t va
= vldrhq_u16 (a
);
642 res
= vaddvaq_p_u16 (res
, va
, p
);
651 /* Also think it'd be safe to move uxth outside of the loop here. */
656 ** vldrh.16 (q[0-9]+), \[r2\], #16
657 ** adds (r[0-9]+|ip), \2, #1
665 int test22 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
670 mve_pred16_t p
= vctp8q (n
);
671 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
672 res
= vmaxvq (res
, va
);
684 ** vldrb.8 (q[0-9]+), \[r[0-9]+\]
686 ** vmaxv.u8 (r[0-9]+|ip), \1
692 int test23 (int8_t *a
, int8_t *b
, int8_t *c
, int n
)
697 mve_pred16_t p
= vctp8q (n
);
698 int8x16_t va
= vldrbq_z_s8 (a
, p
);
699 res
= vmaxavq (res
, va
);
711 ** vldrb.8 (q[0-9]+), \[r3\]
713 ** vmaxav.s8 (r[0-9]+|ip), \1
719 /* Like test1, but update n before vctp, meaning we should only iterate for n-4
721 void test24 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
726 mve_pred16_t p
= vctp32q (n
);
727 int32x4_t va
= vldrwq_z_s32 (a
, p
);
728 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
729 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
730 vstrwq_p_s32 (c
, vc
, p
);
742 ** vldrw.32 q[0-9]+, \[r0\], #16
743 ** vldrw.32 q[0-9]+, \[r1\], #16
744 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
745 ** vstrw.32 \1, \[r2\], #16