/* gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c  */
/* { dg-do compile { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_1m_mve_ok } */
/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
/* { dg-add-options arm_v8_1m_mve } */
/* { dg-additional-options "-mtune=cortex-m55" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

#include <arm_mve.h>
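
/* Each "**" block in the comments below is a check-function-bodies template:
   every "**" line is a regular expression matched against one line of the
   assembly generated for the named function, a "**..." line skips over any
   number of assembly lines, and \1, \2, ... refer back to earlier capture
   groups within the same template.  */
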
/* Using a >=1 condition.  */
void test1 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n >= 1)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test1:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vldrw.32	q[0-9]+, \[r1\], #16
**	vadd.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

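/* A note on the expected code: "dlstp.32 lr, r3" starts a tail-predicated
   low-overhead loop whose element count comes from r3, and "letp" branches
   back while predicating the final, partial iteration in hardware.  That is
   why the explicit vctp/vpst from the source disappear: the loads, the add
   and the store are implicitly predicated by the loop itself.  */
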
/* Test a for-loop format that decrements to zero.  */
int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
void test2 (int32_t *b, int num_elems)
{
  for (int i = num_elems; i > 0; i -= 4)
    {
      mve_pred16_t p = vctp32q (i);
      int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
      vstrwq_p_s32 (b + i, va, p);
    }
}

/*
** test2:
**...
**	dlstp.32	lr, r1
**...
**	vldrw.32	(q[0-9]+), \[r3\], #-16
**	vstrw.32	\1, \[r0\], #-16
**	letp	lr, .*
**...
*/

/* Iteration counter counting up to num_iter.  */
void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n)
{
  int num_iter = (n + 15) / 16;
  for (int i = 0; i < num_iter; i++)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      uint8x16_t vb = vldrbq_z_u8 (b, p);
      uint8x16_t vc = vaddq_x_u8 (va, vb, p);
      vstrbq_p_u8 (c, vc, p);
      n -= 16;
      a += 16;
      b += 16;
      c += 16;
    }
}

/*
** test3:
**...
**	dlstp.8	lr, r3
**...
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**...
**	vadd.i8	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrb.8	\3, \[(r[0-9]+|ip)\]
**...
**	letp	lr, .*
**...
*/

/* Iteration counter counting down from num_iter.  */
void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
{
  int num_iter = (n + 15) / 16;
  for (int i = num_iter; i > 0; i--)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      uint8x16_t vb = vldrbq_z_u8 (b, p);
      uint8x16_t vc = vaddq_x_u8 (va, vb, p);
      vstrbq_p_u8 (c, vc, p);
      n -= 16;
      a += 16;
      b += 16;
      c += 16;
    }
}

/*
** test4:
**...
**	dlstp.8	lr, r3
**...
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**...
**	vadd.i8	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrb.8	\3, \[(r[0-9]+|ip)\]
**...
**	letp	lr, .*
**...
*/

/* Using an unpredicated arithmetic instruction within the loop.  */
void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      uint8x16_t vb = vldrbq_u8 (b);
      /* vc is affected by implicit predication, because vb also came from
	 an unpredicated load, but there is no functional problem, because
	 the result is used in a predicated store.  */
      uint8x16_t vc = vaddq_u8 (va, vb);
      uint8x16_t vd = vaddq_x_u8 (va, vb, p);
      vstrbq_p_u8 (c, vc, p);
      vstrbq_p_u8 (d, vd, p);
      n -= 16;
      a += 16;
      b += 16;
      c += 16;
    }
}

/*
** test5:
**...
**	dlstp.8	lr, r[0-9]+
**...
**	vldrb.8	q[0-9]+, \[r1\]
**	vldrb.8	q[0-9]+, \[r2\]
**...
**	vadd.i8	(q[0-9]+), q[0-9]+, q[0-9]+
**...
**	vstrb.8	\1, \[r2\]
**	vstrb.8	\1, \[r3\]
**	letp	lr, .*
**...
*/

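/* Note that under tail predication both the unpredicated vaddq and the
   vaddq_x predicated on the loop VPR can end up as the same plain vadd.i8,
   which is why both vstrb patterns above reuse capture \1.  */
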
/* Using a different VPR value for one instruction in the loop.  */
void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test6:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vpst
**	vldrwt.32	q[0-9]+, \[r1\], #16
**	vadd.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

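/* Note the asymmetry above: the load predicated on the loop VPR (p) becomes
   an implicitly predicated vldrw, while the load using the unrelated p1
   keeps its explicit vpst/vldrwt pair.  */
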
/* Generating and using another VPR value in the loop, with a vctp.
   The doloop logic will always try to do the transform on the first
   vctp it encounters, so this is still expected to work.  */
void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      mve_pred16_t p1 = vctp32q (g);
      int32x4_t vb = vldrwq_z_s32 (b, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test7:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vpst
**	vldrwt.32	q[0-9]+, \[r1\], #16
**	vadd.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

/* Generating and using a different VPR value in the loop, with a vctp,
   but this time p1 will also change on every iteration (still fine).  */
void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      mve_pred16_t p1 = vctp32q (g);
      int32x4_t vb = vldrwq_z_s32 (b, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
      g++;
    }
}

/*
** test8:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vctp.32	r4
**	vpst
**	vldrwt.32	q[0-9]+, \[r1\], #16
**...
**	vadd.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

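/* Unlike test7, the vctp.32 for p1 stays inside the loop here, because g
   (and therefore p1) changes on every iteration.  */
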
/* Generating and using a different VPR value in the loop, with a vctp_m
   that is independent of the loop vctp VPR.  */
void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      mve_pred16_t p2 = vctp32q_m (n, p1);
      int32x4_t vb = vldrwq_z_s32 (b, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p2);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test9:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vmsr	p0, (r[0-9]+) @ movhi
**	vpst
**	vctpt.32	r3
**	vmrs	(r[0-9]+), p0 @ movhi
**	vmsr	p0, \1 @ movhi
**	vpst
**	vldrwt.32	q[0-9]+, \[r1\], #16
**	vmsr	p0, \2 @ movhi
**	vpst
**	vaddt.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**...
**	vstrw.32	\3, \[r2\], #16
**	letp	lr, .*
**...
*/

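/* The vmsr/vmrs traffic above comes from MVE keeping the vector predicate in
   P0: each explicitly predicated instruction (vctpt, vldrwt, vaddt) needs its
   predicate moved into P0 first, and the vctpt result is read back out of P0
   so that it can be reused for the add.  */
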
/* Generating and using a different VPR value in the loop,
   with a vctp_m that is tied to the base vctp VPR.  This
   is still fine, because the vctp_m will be transformed
   into a vctp and be implicitly predicated.  */
void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      mve_pred16_t p1 = vctp32q_m (n, p);
      int32x4_t vb = vldrwq_z_s32 (b, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p1);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/* We don't need the extra vctp in the loop, and we currently do not optimize
   it away; it is not wrong to use it, though.  */
/*
** test10:
**...
**	dlstp.32	lr, r3
**	vctp.32	r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**...
**	vpst
**	vldrwt.32	q[0-9]+, \[r1\], #16
**	vpst
**	vaddt.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

/* Generating and using a different VPR value in the loop, with a vcmp.  */
void test11 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      mve_pred16_t p1 = vcmpeqq_s32 (va, vb);
      int32x4_t vc = vaddq_x_s32 (va, vb, p1);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test11:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vldrw.32	q[0-9]+, \[r1\], #16
**	vcmp.i32	eq, q[0-9]+, q[0-9]+
**	vpst
**	vaddt.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

/* Generating and using a different VPR value in the loop, with a vcmp_m.  */
void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
      int32x4_t vc = vaddq_x_s32 (va, vb, p2);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test12:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vldrw.32	q[0-9]+, \[r1\], #16
**	vmsr	p0, (r[0-9]+|ip) @ movhi
**	vpst
**	vcmpt.i32	eq, q[0-9]+, q[0-9]+
**	vpst
**	vaddt.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\2, \[r2\], #16
**	letp	lr, .*
**...
*/

/* Generating and using a different VPR value in the loop, with a vcmp_m
   that is tied to the base vctp VPR (same as above, this will be turned
   into a vcmp and be implicitly predicated).  */
void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p);
      int32x4_t vc = vaddq_x_s32 (va, vb, p2);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
      n -= 4;
    }
}

/*
** test13:
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vldrw.32	q[0-9]+, \[r1\], #16
**	vcmp.i32	eq, q[0-9]+, q[0-9]+
**	vpst
**	vaddt.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make
   it safe to implicitly predicate the vaddv.  */
void test14 (int32_t *a, int32_t *c, int n)
{
  int32_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      res += vaddvq_s32 (va);
      int32x4_t vc = vdupq_n_s32 (res);
      vstrwq_p_s32 (c, vc, p);
      a += 4;
      n -= 4;
    }
}

/*
** test14:
**...
**	dlstp.32	lr, r2
**	vldrw.32	(q[0-9]+), \[r0\], #16
**	vaddv.s32	(r[0-9]+|ip), \1
**	add	(r[0-9]+|ip), \3, \2
**	vdup.32	(q[0-9]+), \3
**	vstrw.32	\4, \[r1\]
**	letp	lr, .*
**...
*/

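/* The vaddv above is safe to predicate implicitly because the predicated
   load zeroes the inactive lanes, so on the final, partial iteration they
   contribute nothing to the across-vector sum.  */
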
uint8_t test15 (uint8_t *a, uint8_t *b, int n)
{
  uint8_t res = 0;
  uint8x16_t vc = vdupq_n_u8 (0);
  while (n > 0)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      uint8x16_t vb = vldrbq_u8 (b);
      vc = vaddq_m (vc, va, vc, p);
      res = vgetq_lane (vc, 5);

      a += 16;
      b += 16;
      n -= 16;
    }
  return res;
}

/*
** test15:
**...
**	dlstp.8	lr, r2
**...
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**...
**	vadd.i8	(q[0-9]+), q[0-9]+, q[0-9]+
**...
**	letp	lr, .*
**	vmov.u8	r[0-9]+, \2\[5\]
**...
*/

uint8_t test16 (uint8_t *a, uint8_t *b, int n)
{
  uint8_t res = 0;
  uint8x16_t vc = vdupq_n_u8 (0);
  while (n > 0)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      uint8x16_t vb = vldrbq_u8 (b);
      vc = vaddq (va, vc);
      vc = vaddq_m (vc, va, vc, p);
      res = vgetq_lane (vc, 5);

      a += 16;
      b += 16;
      n -= 16;
    }
  return res;
}

/*
** test16:
**...
**	dlstp.8	lr, r2
**...
**	vldrb.8	q[0-9]+, \[(r[0-9]+|ip)\]
**...
**	vadd.i8	(q[0-9]+), q[0-9]+, q[0-9]+
**	vadd.i8	\2, q[0-9]+, q[0-9]+
**	letp	lr, .*
**	vmov.u8	r[0-9]+, \2\[5\]
**...
*/

/* Using an across-vector unpredicated instruction in a valid way.
   This tests that "vc" has correctly masked the risky "vb".  */
uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
{
  uint16x8_t vb = vldrhq_u16 (b);
  uint16_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_z_u16 (a, p);
      uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
      res += vaddvq_u16 (vc);
      c += 8;
      a += 8;
      b += 8;
      n -= 8;
    }
  return res;
}

/*
** test18:
**...
**	dlstp.16	lr, r3
**	vldrh.16	(q[0-9]+), \[r2\], #16
**	vadd.i16	\1, q[0-9]+, q[0-9]+
**	vaddv.u16	(r[0-9]+|ip), \1
**	add	(r[0-9]+|ip), \3, \2
**	uxth	\3, \3
**	letp	lr, .*
**...
*/

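/* The uxth appears because res is a uint16_t while the vaddv/add results
   live in a 32-bit core register, so the accumulator has to be truncated
   back to 16 bits on every iteration.  */
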
/* Using an across-vector unpredicated instruction with an implicit scalar
   add carried from outside the loop.  */
uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
{
  uint16x8_t vb = vldrhq_u16 (b);
  uint16_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_z_u16 (a, p);
      uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
      res = vaddvaq_u16 (res, vc);
      c += 8;
      a += 8;
      b += 8;
      n -= 8;
    }
  return res;
}

/*
** test19:
**...
**	dlstp.16	lr, r3
**	vldrh.16	(q[0-9]+), \[r2\], #16
**	vadd.i16	\1, q[0-9]+, q[0-9]+
**	vaddva.u16	(r[0-9]+|ip), \1
**	uxth	\2, \2
**	letp	lr, .*
**...
*/

/* Using an across-vector predicated instruction in a valid way.  */
uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
{
  uint16_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_u16 (a);
      res = vaddvaq_p_u16 (res, va, p);
      c += 8;
      a += 8;
      b += 8;
      n -= 8;
    }
  return res;
}

/* The uxth could be moved outside the loop.  */
/*
** test20:
**...
**	dlstp.16	lr, r3
**	vldrh.16	(q[0-9]+), \[r2\], #16
**	vaddva.u16	(r[0-9]+|ip), \1
**	uxth	\2, \2
**	letp	lr, .*
**...
*/

/* Using an across-vector predicated instruction in a valid way.  */
uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
{
  uint16_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_u16 (a);
      res++;
      res = vaddvaq_p_u16 (res, va, p);
      c += 8;
      a += 8;
      b += 8;
      n -= 8;
    }
  return res;
}

/* It would also be safe to move the uxth outside of the loop here.  */
/*
** test21:
**...
**	dlstp.16	lr, r3
**	vldrh.16	(q[0-9]+), \[r2\], #16
**	adds	(r[0-9]+|ip), \2, #1
**	uxth	\2, \2
**	vaddva.u16	\2, \1
**	uxth	\2, \2
**	letp	lr, .*
**...
*/

int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
{
  int res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp8q (n);
      uint8x16_t va = vldrbq_z_u8 (a, p);
      res = vmaxvq (res, va);
      n -= 16;
      a += 16;
    }
  return res;
}

/*
** test22:
**...
**	dlstp.8	lr, r3
**...
**	vldrb.8	(q[0-9]+), \[r[0-9]+\]
**...
**	vmaxv.u8	(r[0-9]+|ip), \1
**	uxtb	\2, \2
**	letp	lr, .*
**...
*/

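/* The uxtb is presumably there because vmaxvq over a uint8x16_t yields a
   uint8_t result, which is then widened back into the int accumulator.  */
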
int test23 (int8_t *a, int8_t *b, int8_t *c, int n)
{
  int res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp8q (n);
      int8x16_t va = vldrbq_z_s8 (a, p);
      res = vmaxavq (res, va);
      n -= 16;
      a += 16;
    }
  return res;
}

/*
** test23:
**...
**	dlstp.8	lr, r3
**...
**	vldrb.8	(q[0-9]+), \[r3\]
**...
**	vmaxav.s8	(r[0-9]+|ip), \1
**	uxtb	\2, \2
**	letp	lr, .*
**...
*/

/* Like test1, but update n before the vctp, meaning we should only iterate
   for n-4 elements.  */
void test24 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n >= 1)
    {
      n -= 4;
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);
      int32x4_t vb = vldrwq_z_s32 (b, p);
      int32x4_t vc = vaddq_x_s32 (va, vb, p);
      vstrwq_p_s32 (c, vc, p);
      c += 4;
      a += 4;
      b += 4;
    }
}

/*
** test24:
**...
**	subs	r3, r3, #4
**...
**	dlstp.32	lr, r3
**	vldrw.32	q[0-9]+, \[r0\], #16
**	vldrw.32	q[0-9]+, \[r1\], #16
**	vadd.i32	(q[0-9]+), q[0-9]+, q[0-9]+
**	vstrw.32	\1, \[r2\], #16
**	letp	lr, .*
**...
*/

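/* Because n is decremented before the vctp, the loop element count is n - 4,
   which is why a subs on r3 is expected ahead of the dlstp above.  */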