1 /* { dg-do compile { target { arm*-*-* } } } */
2 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
3 /* { dg-options "-O3 -save-temps" } */
4 /* { dg-add-options arm_v8_1m_mve } */
9 /* Terminating on a non-zero number of elements. */
10 void test0 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
14 mve_pred16_t p
= vctp8q (n
);
15 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
16 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
17 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
18 vstrbq_p_u8 (c
, vc
, p
);
23 /* Terminating on n >= 0. */
24 void test1 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
28 mve_pred16_t p
= vctp8q (n
);
29 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
30 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
31 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
32 vstrbq_p_u8 (c
, vc
, p
);
37 /* Similar, terminating on a non-zero number of elements, but in a for loop. */
39 int32_t a
[] = {0, 1, 2, 3, 4, 5, 6, 7};
40 void test2 (int32_t *b
, int num_elems
)
42 for (int i
= num_elems
; i
>= 2; i
-= 4)
44 mve_pred16_t p
= vctp32q (i
);
45 int32x4_t va
= vldrwq_z_s32 (&(a
[i
]), p
);
46 vstrwq_p_s32 (b
+ i
, va
, p
);
50 /* Iteration counter counting up to num_iter, with a non-zero starting num. */
51 void test3 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
53 int num_iter
= (n
+ 15)/16;
54 for (int i
= 1; i
< num_iter
; i
++)
56 mve_pred16_t p
= vctp8q (n
);
57 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
58 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
59 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
60 vstrbq_p_u8 (c
, vc
, p
);
65 /* Iteration counter counting up to num_iter, with a larger increment. */
66 void test4 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
68 int num_iter
= (n
+ 15)/16;
69 for (int i
= 0; i
< num_iter
; i
+=2)
71 mve_pred16_t p
= vctp8q (n
);
72 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
73 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
74 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
75 vstrbq_p_u8 (c
, vc
, p
);
80 /* Using an unpredicated store instruction within the loop. */
81 void test5 (uint8_t *a
, uint8_t *b
, uint8_t *c
, uint8_t *d
, int n
)
85 mve_pred16_t p
= vctp8q (n
);
86 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
87 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
88 uint8x16_t vc
= vaddq_u8 (va
, vb
);
89 uint8x16_t vd
= vaddq_x_u8 (va
, vb
, p
);
95 /* Using an unpredicated store outside the loop. */
96 void test6 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
, uint8x16_t vx
)
100 mve_pred16_t p
= vctp8q (n
);
101 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
102 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
103 uint8x16_t vc
= vaddq_m_u8 (vx
, va
, vb
, p
);
104 vx
= vaddq_u8 (vx
, vc
);
112 /* Using a VPR that gets modified within the loop. */
113 void test9 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
117 mve_pred16_t p
= vctp32q (n
);
118 int32x4_t va
= vldrwq_z_s32 (a
, p
);
120 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
121 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
122 vstrwq_p_s32 (c
, vc
, p
);
130 /* Using a VPR that gets re-generated within the loop. */
131 void test10 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
133 mve_pred16_t p
= vctp32q (n
);
136 int32x4_t va
= vldrwq_z_s32 (a
, p
);
138 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
139 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
140 vstrwq_p_s32 (c
, vc
, p
);
148 /* Using vctp32q_m instead of vctp32q. */
149 void test11 (int32_t *a
, int32_t *b
, int32_t *c
, int n
, mve_pred16_t p0
)
153 mve_pred16_t p
= vctp32q_m (n
, p0
);
154 int32x4_t va
= vldrwq_z_s32 (a
, p
);
155 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
156 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
157 vstrwq_p_s32 (c
, vc
, p
);
165 /* Using an unpredicated op with a scalar output, where the result is valid
166 outside the bb. This is invalid, because one of the inputs to the
167 unpredicated op is also unpredicated. */
168 uint8_t test12 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
, uint8x16_t vx
)
173 mve_pred16_t p
= vctp8q (n
);
174 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
175 uint8x16_t vb
= vldrbq_u8 (b
);
176 uint8x16_t vc
= vaddq_u8 (va
, vb
);
177 sum
+= vaddvq_u8 (vc
);
185 /* Using an unpredicated vcmp to generate a new predicate value in the
186 loop and then using that VPR to predicate a store insn. */
187 void test13 (int32_t *a
, int32_t *b
, int32x4_t vc
, int32_t *c
, int n
)
191 mve_pred16_t p
= vctp32q (n
);
192 int32x4_t va
= vldrwq_s32 (a
);
193 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
194 int32x4_t vc
= vaddq_s32 (va
, vb
);
195 mve_pred16_t p1
= vcmpeqq_s32 (va
, vc
);
196 vstrwq_p_s32 (c
, vc
, p1
);
204 /* Using an across-vector unpredicated instruction. "vb" is the risk. */
205 uint16_t test14 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
207 uint16x8_t vb
= vldrhq_u16 (b
);
211 mve_pred16_t p
= vctp16q (n
);
212 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
213 vb
= vaddq_u16 (va
, vb
);
214 res
= vaddvq_u16 (vb
);
223 /* Using an across-vector unpredicated instruction. "vc" is the risk. */
224 uint16_t test15 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
226 uint16x8_t vb
= vldrhq_u16 (b
);
230 mve_pred16_t p
= vctp16q (n
);
231 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
232 uint16x8_t vc
= vaddq_u16 (va
, vb
);
233 res
= vaddvaq_u16 (res
, vc
);
242 uint16_t test16 (uint16_t *a
, uint16_t *b
, uint16_t *c
, int n
)
247 mve_pred16_t p
= vctp16q (n
);
248 uint16x8_t vb
= vldrhq_u16 (b
);
249 uint16x8_t va
= vldrhq_z_u16 (a
, p
);
250 res
= vaddvaq_u16 (res
, vb
);
251 res
= vaddvaq_p_u16 (res
, va
, p
);
260 int test17 (int8_t *a
, int8_t *b
, int8_t *c
, int n
)
265 mve_pred16_t p
= vctp8q (n
);
266 int8x16_t va
= vldrbq_z_s8 (a
, p
);
267 res
= vmaxvq (res
, va
);
276 int test18 (int8_t *a
, int8_t *b
, int8_t *c
, int n
)
281 mve_pred16_t p
= vctp8q (n
);
282 int8x16_t va
= vldrbq_z_s8 (a
, p
);
283 res
= vminvq (res
, va
);
290 int test19 (int8_t *a
, int8_t *b
, int8_t *c
, int n
)
295 mve_pred16_t p
= vctp8q (n
);
296 int8x16_t va
= vldrbq_z_s8 (a
, p
);
297 res
= vminavq (res
, va
);
304 int test20 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
)
309 mve_pred16_t p
= vctp8q (n
);
310 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
311 res
= vminvq (res
, va
);
318 uint8x16_t
test21 (uint8_t *a
, uint32_t *b
, int n
, uint8x16_t res
)
322 mve_pred16_t p
= vctp8q (n
);
323 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
324 res
= vshlcq_u8 (va
, b
, 1);
331 int8x16_t
test22 (int8_t *a
, int32_t *b
, int n
, int8x16_t res
)
335 mve_pred16_t p
= vctp8q (n
);
336 int8x16_t va
= vldrbq_z_s8 (a
, p
);
337 res
= vshlcq_s8 (va
, b
, 1);
344 /* Using an unsigned number of elements to count down from, with a "> 0" condition. */
345 void test23 (int32_t *a
, int32_t *b
, int32_t *c
, unsigned int n
)
349 mve_pred16_t p
= vctp32q (n
);
350 int32x4_t va
= vldrwq_z_s32 (a
, p
);
351 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
352 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
353 vstrwq_p_s32 (c
, vc
, p
);
361 /* Using an unsigned number of elements to count up to, with an "i < n" condition. */
362 void test24 (uint8_t *a
, uint8_t *b
, uint8_t *c
, unsigned int n
)
364 for (int i
= 0; i
< n
; i
+=16)
366 mve_pred16_t p
= vctp8q (n
-i
);
367 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
368 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
369 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
370 vstrbq_p_u8 (c
, vc
, p
);
376 /* Using an unsigned number of elements to count up to, with an "i <= n" condition. */
377 void test25 (uint8_t *a
, uint8_t *b
, uint8_t *c
, unsigned int n
)
379 for (int i
= 1; i
<= n
; i
+=16)
381 mve_pred16_t p
= vctp8q (n
-i
+1);
382 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
383 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
384 uint8x16_t vc
= vaddq_x_u8 (va
, vb
, p
);
385 vstrbq_p_u8 (c
, vc
, p
);
389 /* Update n twice in the loop. */
390 void test26 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
395 mve_pred16_t p
= vctp32q (n
);
396 int32x4_t va
= vldrwq_z_s32 (a
, p
);
397 int32x4_t vb
= vldrwq_z_s32 (b
, p
);
398 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
399 vstrwq_p_s32 (c
, vc
, p
);
407 void test27 (int32_t *a
, int32_t *c
, int n
)
412 mve_pred16_t p
= vctp32q (n
);
413 int32x4_t va
= vldrwq_s32 (a
);
414 res
+= vaddvq_s32 (va
);
415 int32x4_t vc
= vdupq_n_s32 (res
);
416 vstrwq_p_s32 (c
, vc
, p
);
422 /* Using an unpredicated vcmp to generate a new predicate value in the
423 loop and then using it in a predicated store insn. */
424 void test28 (int32_t *a
, int32_t *b
, int32_t *c
, int n
)
428 mve_pred16_t p
= vctp32q (n
);
429 int32x4_t va
= vldrwq_z_s32 (a
, p
);
430 int32x4_t vb
= vldrwq_s32 (b
);
431 int32x4_t vc
= vaddq_x_s32 (va
, vb
, p
);
432 mve_pred16_t p1
= vcmpeqq_s32 (va
, vc
);
433 vstrwq_p_s32 (c
, vc
, p1
);
441 /* Using an unpredicated op with a scalar output, where the result is valid
442 outside the bb. The unpredicated lanes are not guaranteed zero, so would
443 affect the vaddv in the non-tail predicated case. */
444 uint8_t test29 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
, uint8x16_t vx
)
449 mve_pred16_t p
= vctp8q (n
);
450 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
451 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
452 uint8x16_t vc
= vaddq_m_u8 (vx
, va
, vb
, p
);
453 sum
+= vaddvq_u8 (vc
);
461 /* Same as above, but with another scalar op between the unpredicated op and
462 the scalar op outside the loop. */
463 uint8_t test30 (uint8_t *a
, uint8_t *b
, uint8_t *c
, int n
, uint8x16_t vx
, int g
)
468 mve_pred16_t p
= vctp8q (n
);
469 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
470 uint8x16_t vb
= vldrbq_z_u8 (b
, p
);
471 uint8x16_t vc
= vaddq_m_u8 (vx
, va
, vb
, p
);
472 sum
+= vaddvq_u8 (vc
);
481 uint8_t test31 (uint8_t *a
, uint8_t *b
, int n
)
484 uint8x16_t vc
= vdupq_n_u8 (0);
487 mve_pred16_t p
= vctp8q (n
);
488 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
489 uint8x16_t vb
= vldrbq_u8 (b
);
491 res
= vgetq_lane (vc
, 5);
500 uint8_t test32 (uint8_t *a
, uint8_t *b
, int n
)
503 uint8x16_t vc
= vdupq_n_u8 (0);
506 mve_pred16_t p
= vctp8q (n
);
507 uint8x16_t va
= vldrbq_z_u8 (a
, p
);
508 uint8x16_t vb
= vldrbq_u8 (b
);
509 vc
= vaddq_m (vc
, va
, vc
, p
);
511 res
= vgetq_lane (vc
, 5);
520 /* { dg-final { scan-assembler-not "\tdlstp" } } */
521 /* { dg-final { scan-assembler-not "\tletp" } } */