/* { dg-do compile { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-mtune=cortex-m55" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
10 /* Using a >=1 condition. */
11 void test1 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
15 mve_pred16_t p
= vctp32q (n
);
16 int32x4_t va
= vldrwq_z_s32 (a
, p
);
17 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
18 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
19 vstrwq_p_s32 (c
, vc
, p
);
30 ** vldrw.32 q[0-9]+, \[r0\], #16
31 ** vldrw.32 q[0-9]+, \[r1\], #16
32 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
33 ** vstrw.32 \1, \[r2\], #16
38 /* Test a for loop format of decrementing to zero */
39 int32_t a
[] = {0, 1, 2, 3, 4, 5, 6, 7};
40 void test2 (int32_t *b
, int num_elems
)
42 for (int i
= num_elems
; i
> 0; i
-= 4)
44 mve_pred16_t p
= vctp32q (i
);
45 int32x4_t va
= vldrwq_z_s32 (&(a
[i
]), p
);
46 vstrwq_p_s32 (b
+ i
, va
, p
);
54 ** vldrw.32 (q[0-9]+), \[r3\], #-16
55 ** vstrw.32 \1, \[r0\], #-16
60 /* Iteration counter counting up to num_iter. */
61 void test3 (uint8_t *a
, uint8_t *b
, uint8_t *c
, unsigned n
)
63 int num_iter
= (n
+ 15)/16;
64 for (int i
= 0; i
< num_iter
; i
++)
66 mve_pred16_t p
= vctp8q (n
);
67 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
68 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
69 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
70 vstrbq_p_u8 (c
, vc
, p
);
83 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
84 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
86 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
87 ** vstrb.8 \3, \[(r[0-9]+|ip)\]
93 /* Iteration counter counting down from num_iter. */
94 void test4 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
96 int num_iter
= (n
+ 15)/16;
97 for (int i
= num_iter
; i
> 0; i
--)
99 mve_pred16_t p
= vctp8q (n
);
100 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
101 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
102 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
103 vstrbq_p_u8 (c
, vc
, p
);
115 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
116 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
118 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
119 ** vstrb.8 \3, \[(r[0-9]+|ip)\]
125 /* Using an unpredicated arithmetic instruction within the loop. */
126 void test5 (uint8_t *a
, uint8_t *b
, uint8_t *c
, uint8_t *d
, int n
)
130 mve_pred16_t p
= vctp8q (n
);
131 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
132 uint8x16_t vb
= vldrbq_u8 (b
);
133 /* Is affected by implicit predication, because vb also
134 came from an unpredicated load, but there is no functional
135 problem, because the result is used in a predicated store. */
136 uint8x16_t vc
= vaddq_u8 (va
, vb
);
137 uint8x16_t vd
= vaddq_x_u8 (va
, vb
, p
);
138 vstrbq_p_u8 (c
, vc
, p
);
139 vstrbq_p_u8 (d
, vd
, p
);
150 ** dlstp.8 lr, r[0-9]+
152 ** vldrb.8 q[0-9]+, \[r1\]
153 ** vldrb.8 q[0-9]+, \[r2\]
155 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
157 ** vstrb.8 \1, \[r2\]
158 ** vstrb.8 \1, \[r3\]
163 /* Using a different VPR value for one instruction in the loop. */
164 void test6 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
168 mve_pred16_t p
= vctp32q (n
);
169 int32x4_t va
= vldrwq_z_s32 (a
, p
);
170 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
171 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
172 vstrwq_p_s32 (c
, vc
, p
);
184 ** vldrw.32 q[0-9]+, \[r0\], #16
186 ** vldrwt.32 q[0-9]+, \[r1\], #16
187 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
188 ** vstrw.32 \1, \[r2\], #16
193 /* Generating and using another VPR value in the loop, with a vctp.
194 The doloop logic will always try to do the transform on the first
195 vctp it encounters, so this is still expected to work. */
196 void test7 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, int g
)
200 mve_pred16_t p
= vctp32q (n
);
201 int32x4_t va
= vldrwq_z_s32 (a
, p
);
202 mve_pred16_t p1
= vctp32q (g
);
203 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
204 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
205 vstrwq_p_s32 (c
, vc
, p
);
216 ** vldrw.32 q[0-9]+, \[r0\], #16
218 ** vldrwt.32 q[0-9]+, \[r1\], #16
219 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
220 ** vstrw.32 \1, \[r2\], #16
225 /* Generating and using a different VPR value in the loop, with a vctp,
226 but this time the p1 will also change in every loop (still fine) */
227 void test8 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, int g
)
231 mve_pred16_t p
= vctp32q (n
);
232 int32x4_t va
= vldrwq_z_s32 (a
, p
);
233 mve_pred16_t p1
= vctp32q (g
);
234 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
235 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
236 vstrwq_p_s32 (c
, vc
, p
);
249 ** vldrw.32 q[0-9]+, \[r0\], #16
252 ** vldrwt.32 q[0-9]+, \[r1\], #16
254 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
255 ** vstrw.32 \1, \[r2\], #16
260 /* Generating and using a different VPR value in the loop, with a vctp_m
261 that is independent of the loop vctp VPR. */
262 void test9 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
266 mve_pred16_t p
= vctp32q (n
);
267 int32x4_t va
= vldrwq_z_s32 (a
, p
);
268 mve_pred16_t p2
= vctp32q_m (n
, p1
);
269 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
270 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
271 vstrwq_p_s32 (c
, vc
, p
);
283 ** vldrw.32 q[0-9]+, \[r0\], #16
284 ** vmsr p0, (r[0-9]+) @ movhi
287 ** vmrs (r[0-9]+), p0 @ movhi
288 ** vmsr p0, \1 @ movhi
290 ** vldrwt.32 q[0-9]+, \[r1\], #16
291 ** vmsr p0, \2 @ movhi
293 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
295 ** vstrw.32 \3, \[r2\], #16
300 /* Generating and using a different VPR value in the loop,
301 with a vctp_m that is tied to the base vctp VPR. This
302 is still fine, because the vctp_m will be transformed
303 into a vctp and be implicitly predicated. */
304 void test10 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
308 mve_pred16_t p
= vctp32q (n
);
309 int32x4_t va
= vldrwq_z_s32 (a
, p
);
310 mve_pred16_t p1
= vctp32q_m (n
, p
);
311 int32x4_t vb
= vldrwq_z_s32 (b
, p1
);
312 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p1
);
313 vstrwq_p_s32 (c
, vc
, p
);
321 We don't need that extra vctp in the loop, but we currently do not optimize
322 it away, however, it is not wrong to use it...
329 ** vldrw.32 q[0-9]+, \[r0\], #16
332 ** vldrwt.32 q[0-9]+, \[r1\], #16
334 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
335 ** vstrw.32 \1, \[r2\], #16
340 /* Generating and using a different VPR value in the loop, with a vcmp. */
341 void test11 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
345 mve_pred16_t p
= vctp32q (n
);
346 int32x4_t va
= vldrwq_z_s32 (a
, p
);
347 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
348 mve_pred16_t p1
= vcmpeqq_s32 (va
, vb
);
349 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p1
);
350 vstrwq_p_s32 (c
, vc
, p
);
362 ** vldrw.32 q[0-9]+, \[r0\], #16
363 ** vldrw.32 q[0-9]+, \[r1\], #16
364 ** vcmp.i32 eq, q[0-9]+, q[0-9]+
366 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
367 ** vstrw.32 \1, \[r2\], #16
372 /* Generating and using a different VPR value in the loop, with a vcmp_m. */
373 void test12 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
377 mve_pred16_t p
= vctp32q (n
);
378 int32x4_t va
= vldrwq_z_s32 (a
, p
);
379 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
380 mve_pred16_t p2
= vcmpeqq_m_s32 (va
, vb
, p1
);
381 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
382 vstrwq_p_s32 (c
, vc
, p
);
394 ** vldrw.32 q[0-9]+, \[r0\], #16
395 ** vldrw.32 q[0-9]+, \[r1\], #16
396 ** vmsr p0, (r[0-9]+|ip) @ movhi
398 ** vcmpt.i32 eq, q[0-9]+, q[0-9]+
400 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
401 ** vstrw.32 \2, \[r2\], #16
406 /* Generating and using a different VPR value in the loop, with a vcmp_m
407 that is tied to the base vctp VPR (same as above, this will be turned
408 into a vcmp and be implicitly predicated). */
409 void test13 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p1
)
413 mve_pred16_t p
= vctp32q (n
);
414 int32x4_t va
= vldrwq_z_s32 (a
, p
);
415 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
416 mve_pred16_t p2
= vcmpeqq_m_s32 (va
, vb
, p
);
417 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p2
);
418 vstrwq_p_s32 (c
, vc
, p
);
430 ** vldrw.32 q[0-9]+, \[r0\], #16
431 ** vldrw.32 q[0-9]+, \[r1\], #16
432 ** vcmp.i32 eq, q[0-9]+, q[0-9]+
434 ** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
435 ** vstrw.32 \1, \[r2\], #16
440 /* Similar to test27 in dsltp-invalid-asm.c, but use a predicated load to make
441 it safe to implicitly predicate the vaddv. */
442 void test14 (int32_t *a
, int32_t *c
, int n
)
447 mve_pred16_t p
= vctp32q (n
);
448 int32x4_t va
= vldrwq_z_s32 (a
, p
);
449 res
+= vaddvq_s32 (va
);
450 int32x4_t vc
= vdupq_n_s32 (res
);
451 vstrwq_p_s32 (c
, vc
, p
);
461 ** vldrw.32 (q[0-9]+), \[r0\], #16
462 ** vaddv.s32 (r[0-9]+|ip), \1
463 ** add (r[0-9]+|ip), \3, \2
464 ** vdup.32 (q[0-9]+), \3
465 ** vstrw.32 \4, \[r1\]
470 uint8_t test15 (uint8_t *a
, uint8_t *b
, int n
)
473 uint8x16_t vc
= vdupq_n_u8 (0);
476 mve_pred16_t p
= vctp8q (n
);
477 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
478 uint8x16_t vb
= vldrbq_u8 (b
);
479 vc
= vaddq_m (vc
, va
, vc
, p
);
480 res
= vgetq_lane (vc
, 5);
494 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
496 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
499 ** vmov.u8 r[0-9]+, \2\[5\]
503 uint8_t test16 (uint8_t *a
, uint8_t *b
, int n
)
506 uint8x16_t vc
= vdupq_n_u8 (0);
509 mve_pred16_t p
= vctp8q (n
);
510 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
511 uint8x16_t vb
= vldrbq_u8 (b
);
513 vc
= vaddq_m (vc
, va
, vc
, p
);
514 res
= vgetq_lane (vc
, 5);
528 ** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
530 ** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
531 ** vadd.i8 \2, q[0-9]+, q[0-9]+
533 ** vmov.u8 r[0-9]+, \2\[5\]
539 /* Using an across-vector unpredicated instruction in a valid way.
540 This tests that "vc" has correctly masked the risky "vb". */
541 uint16_t test18 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
543 uint16x8_t vb
= vldrhq_u16 (b
);
547 mve_pred16_t p
= vctp16q (n
);
548 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
549 uint16x8_t vc
= vaddq_m_u16 (va
, va
, vb
, p
);
550 res
+= vaddvq_u16 (vc
);
563 ** vldrh.16 (q[0-9]+), \[r2\], #16
564 ** vadd.i16 \1, q[0-9]+, q[0-9]+
565 ** vaddv.u16 (r[0-9]+|ip), \1
566 ** add (r[0-9]+|ip), \3, \2
572 /* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. */
573 uint16_t test19 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
575 uint16x8_t vb
= vldrhq_u16 (b
);
579 mve_pred16_t p
= vctp16q (n
);
580 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
581 uint16x8_t vc
= vaddq_m_u16 (va
, va
, vb
, p
);
582 res
= vaddvaq_u16 (res
, vc
);
595 ** vldrh.16 (q[0-9]+), \[r2\], #16
596 ** vadd.i16 \1, q[0-9]+, q[0-9]+
597 ** vaddva.u16 (r[0-9]+|ip), \1
604 /* Using an across-vector predicated instruction in a valid way. */
605 uint16_t test20 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
610 mve_pred16_t p
= vctp16q (n
);
611 uint16x8_t va
= vldrhq_u16 (a
);
612 res
= vaddvaq_p_u16 (res
, va
, p
);
621 /* The uxth could be moved outside the loop. */
626 ** vldrh.16 (q[0-9]+), \[r2\], #16
627 ** vaddva.u16 (r[0-9]+|ip), \1
633 /* Using an across-vector predicated instruction in a valid way. */
634 uint16_t test21 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
639 mve_pred16_t p
= vctp16q (n
);
640 uint16x8_t va
= vldrhq_u16 (a
);
642 res
= vaddvaq_p_u16 (res
, va
, p
);
651 /* Also think it'd be safe to move uxth outside of the loop here. */
656 ** vldrh.16 (q[0-9]+), \[r2\], #16
657 ** adds (r[0-9]+|ip), \2, #1
665 int test22 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
670 mve_pred16_t p
= vctp8q (n
);
671 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
672 res
= vmaxvq (res
, va
);
684 ** vldrb.8 (q[0-9]+), \[r[0-9]+\]
686 ** vmaxv.u8 (r[0-9]+|ip), \1
692 int test23 (int8_t *a
, int8_t *b
, int8_t *c
, int n
)
697 mve_pred16_t p
= vctp8q (n
);
698 int8x16_t va
= vldrbq_z_s8 (a
, p
);
699 res
= vmaxavq (res
, va
);
711 ** vldrb.8 (q[0-9]+), \[r3\]
713 ** vmaxav.s8 (r[0-9]+|ip), \1
719 /* Like test1, but update n before vctp, meaning we should only iterate for n-4
721 void test24 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
726 mve_pred16_t p
= vctp32q (n
);
727 int32x4_t va
= vldrwq_z_s32 (a
, p
);
728 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
729 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
730 vstrwq_p_s32 (c
, vc
, p
);
742 ** vldrw.32 q[0-9]+, \[r0\], #16
743 ** vldrw.32 q[0-9]+, \[r1\], #16
744 ** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
745 ** vstrw.32 \1, \[r2\], #16