testsuite/52641 - Fix more sloppy tests.
[official-gcc.git] / gcc / testsuite / gcc.target / arm / mve / dlstp-invalid-asm.c
blob26df2d30523cef58036668622f8cbe39462a5cfa
1 /* { dg-do compile { target { arm*-*-* } } } */
2 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
3 /* { dg-options "-O3 -save-temps" } */
4 /* { dg-add-options arm_v8_1m_mve } */
6 #include <limits.h>
7 #include <arm_mve.h>
9 /* Terminating on a non-zero number of elements: the loop condition is n > 1,
   so the loop can exit while one element is still outstanding. */
10 void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
12 while (n > 1)
14 mve_pred16_t p = vctp8q (n);
15 uint8x16_t va = vldrbq_z_u8 (a, p);
16 uint8x16_t vb = vldrbq_z_u8 (b, p);
17 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
18 vstrbq_p_u8 (c, vc, p);
/* Only the element count moves; a, b and c are not advanced. */
19 n -= 16;
23 /* Terminating on n >= 0: the body also runs when n has reached exactly 0,
   i.e. vctp8q is called with a zero element count on that iteration. */
24 void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
26 while (n >= 0)
28 mve_pred16_t p = vctp8q (n);
29 uint8x16_t va = vldrbq_z_u8 (a, p);
30 uint8x16_t vb = vldrbq_z_u8 (b, p);
31 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
32 vstrbq_p_u8 (c, vc, p);
/* Only the element count moves; a, b and c are not advanced. */
33 n -= 16;
37 /* Similar, terminating on a non-zero number of elements, but in a for loop
38 format. The counter i starts at num_elems, drops by 4 per iteration and
   stops once i < 2; i is both the vctp32q element count and the index used
   for the global array a and for b. */
39 int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
40 void test2 (int32_t *b, int num_elems)
42 for (int i = num_elems; i >= 2; i-= 4)
44 mve_pred16_t p = vctp32q (i);
45 int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
46 vstrwq_p_s32 (b + i, va, p);
50 /* Iteration counter counting up to num_iter, with a non-zero starting num:
   i starts at 1 rather than 0, while num_iter is derived from n as
   (n + 15)/16. */
51 void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
53 int num_iter = (n + 15)/16;
54 for (int i = 1; i < num_iter; i++)
56 mve_pred16_t p = vctp8q (n);
57 uint8x16_t va = vldrbq_z_u8 (a, p);
58 uint8x16_t vb = vldrbq_z_u8 (b, p);
59 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
60 vstrbq_p_u8 (c, vc, p);
/* n is decremented independently of the loop counter i. */
61 n -= 16;
65 /* Iteration counter counting up to num_iter, with a larger increment: i
   advances by 2 per iteration while n still drops by only 16. */
66 void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
68 int num_iter = (n + 15)/16;
69 for (int i = 0; i < num_iter; i+=2)
71 mve_pred16_t p = vctp8q (n);
72 uint8x16_t va = vldrbq_z_u8 (a, p);
73 uint8x16_t vb = vldrbq_z_u8 (b, p);
74 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
75 vstrbq_p_u8 (c, vc, p);
/* n is decremented independently of the loop counter i. */
76 n -= 16;
80 /* Using an unpredicated store instruction within the loop: the result vd
   is written with vstrbq_u8, which is given no predicate argument. */
81 void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
83 while (n > 0)
85 mve_pred16_t p = vctp8q (n);
86 uint8x16_t va = vldrbq_z_u8 (a, p);
87 uint8x16_t vb = vldrbq_z_u8 (b, p);
/* vc (and the parameter c) are not used any further in this function. */
88 uint8x16_t vc = vaddq_u8 (va, vb);
89 uint8x16_t vd = vaddq_x_u8 (va, vb, p);
90 vstrbq_u8 (d, vd);
91 n -= 16;
95 /* Using an unpredicated store outside the loop: vx is carried across
   iterations and written with vstrbq_u8 only after the loop has finished. */
96 void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
98 while (n > 0)
100 mve_pred16_t p = vctp8q (n);
101 uint8x16_t va = vldrbq_z_u8 (a, p);
102 uint8x16_t vb = vldrbq_z_u8 (b, p);
103 uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
/* The accumulation into vx uses vaddq_u8, which takes no predicate. */
104 vx = vaddq_u8 (vx, vc);
105 a += 16;
106 b += 16;
107 n -= 16;
109 vstrbq_u8 (c, vx);
112 /* Using a VPR that gets modified within the loop: p is incremented between
   the two loads, so va is loaded under the value returned by vctp32q while
   vb, the add and the store use the modified value. */
113 void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
115 while (n > 0)
117 mve_pred16_t p = vctp32q (n);
118 int32x4_t va = vldrwq_z_s32 (a, p);
/* The predicate no longer holds the vctp32q result after this point. */
119 p++;
120 int32x4_t vb = vldrwq_z_s32 (b, p);
121 int32x4_t vc = vaddq_x_s32 (va, vb, p);
122 vstrwq_p_s32 (c, vc, p);
123 c += 4;
124 a += 4;
125 b += 4;
126 n -= 4;
130 /* Using a VPR that gets re-generated within the loop: p is first computed
   before the loop and then recomputed after the load of va, so each load of
   va uses the p value left over from before the current recomputation. */
131 void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
133 mve_pred16_t p = vctp32q (n);
134 while (n > 0)
136 int32x4_t va = vldrwq_z_s32 (a, p);
/* From here on the iteration uses a predicate rebuilt from the current n. */
137 p = vctp32q (n);
138 int32x4_t vb = vldrwq_z_s32 (b, p);
139 int32x4_t vc = vaddq_x_s32 (va, vb, p);
140 vstrwq_p_s32 (c, vc, p);
141 c += 4;
142 a += 4;
143 b += 4;
144 n -= 4;
148 /* Using vctp32q_m instead of vctp32q: the loop predicate p is built from
   both the element count n and the caller-supplied predicate p0. */
149 void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
151 while (n > 0)
153 mve_pred16_t p = vctp32q_m (n, p0);
154 int32x4_t va = vldrwq_z_s32 (a, p);
155 int32x4_t vb = vldrwq_z_s32 (b, p);
156 int32x4_t vc = vaddq_x_s32 (va, vb, p);
157 vstrwq_p_s32 (c, vc, p);
158 c += 4;
159 a += 4;
160 b += 4;
161 n -= 4;
165 /* Using an unpredicated op with a scalar output, where the result is valid
166 outside the bb. This is invalid, because one of the inputs to the
167 unpredicated op is also unpredicated: vb is loaded with vldrbq_u8, which
   is given no predicate, and feeds vaddq_u8 and then vaddvq_u8. The
   parameters c and vx are not used. */
168 uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
170 uint8_t sum = 0;
171 while (n > 0)
173 mve_pred16_t p = vctp8q (n);
174 uint8x16_t va = vldrbq_z_u8 (a, p);
175 uint8x16_t vb = vldrbq_u8 (b);
176 uint8x16_t vc = vaddq_u8 (va, vb);
/* The scalar reduction result is accumulated and returned after the loop. */
177 sum += vaddvq_u8 (vc);
178 a += 16;
179 b += 16;
180 n -= 16;
182 return sum;
185 /* Using an unpredicated vcmp to generate a new predicate value in the
186 loop and then using that VPR to predicate a store insn. Here va is loaded
   with the unpredicated vldrwq_s32 and the add is the unpredicated vaddq_s32;
   the store is governed by p1 (from vcmpeqq_s32), not by p. */
187 void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n)
189 while (n > 0)
191 mve_pred16_t p = vctp32q (n);
192 int32x4_t va = vldrwq_s32 (a);
193 int32x4_t vb = vldrwq_z_s32 (b, p);
/* This loop-local vc shadows the function parameter of the same name. */
194 int32x4_t vc = vaddq_s32 (va, vb);
195 mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
196 vstrwq_p_s32 (c, vc, p1);
197 c += 4;
198 a += 4;
199 b += 4;
200 n -= 4;
204 /* Using an across-vector unpredicated instruction. "vb" is the risk: it is
   loaded once before the loop with the unpredicated vldrhq_u16, updated on
   every iteration with the unpredicated vaddq_u16, and reduced by vaddvq_u16. */
205 uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
207 uint16x8_t vb = vldrhq_u16 (b);
208 uint16_t res = 0;
209 while (n > 0)
211 mve_pred16_t p = vctp16q (n);
212 uint16x8_t va = vldrhq_z_u16 (a, p);
213 vb = vaddq_u16 (va, vb);
/* res is overwritten (not accumulated) on each iteration. */
214 res = vaddvq_u16 (vb);
215 c += 8;
216 a += 8;
217 b += 8;
218 n -= 8;
220 return res;
223 /* Using an across-vector unpredicated instruction. "vc" is the risk: it is
   produced by the unpredicated vaddq_u16 from va and the loop-invariant vb
   (loaded once, unpredicated, before the loop) and then folded into res by
   vaddvaq_u16. */
224 uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
226 uint16x8_t vb = vldrhq_u16 (b);
227 uint16_t res = 0;
228 while (n > 0)
230 mve_pred16_t p = vctp16q (n);
231 uint16x8_t va = vldrhq_z_u16 (a, p);
232 uint16x8_t vc = vaddq_u16 (va, vb);
/* res is both an input and the output of the accumulating reduction. */
233 res = vaddvaq_u16 (res, vc);
234 c += 8;
235 a += 8;
236 b += 8;
237 n -= 8;
239 return res;
/* Sums two loads into the same scalar res: vb, loaded without a predicate
   and added with vaddvaq_u16, and va, loaded under p and added with
   vaddvaq_p_u16.
   NOTE(review): presumably the unpredicated accumulation of vb is what must
   keep this loop from being tail-predicated - confirm. */
242 uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
244 uint16_t res =0;
245 while (n > 0)
247 mve_pred16_t p = vctp16q (n);
248 uint16x8_t vb = vldrhq_u16 (b);
249 uint16x8_t va = vldrhq_z_u16 (a, p);
250 res = vaddvaq_u16 (res, vb);
251 res = vaddvaq_p_u16 (res, va, p);
252 c += 8;
253 a += 8;
254 b += 8;
255 n -= 8;
257 return res;
/* Folds each predicated signed load va into the scalar res with vmaxvq,
   which is called without a predicate; res starts at 0 and is returned after
   the loop. The parameters b and c are not used.
   NOTE(review): presumably the unpredicated across-vector reduction is what
   makes this loop ineligible for tail-predication - confirm. */
260 int test17 (int8_t *a, int8_t *b, int8_t *c, int n)
262 int res = 0;
263 while (n > 0)
265 mve_pred16_t p = vctp8q (n);
266 int8x16_t va = vldrbq_z_s8 (a, p);
267 res = vmaxvq (res, va);
268 n-=16;
269 a+=16;
271 return res;
/* Folds each predicated signed load va into the scalar res with vminvq,
   which is called without a predicate; res starts at 0 and is returned after
   the loop. The parameters b and c are not used.
   NOTE(review): presumably the unpredicated across-vector reduction is what
   makes this loop ineligible for tail-predication - confirm. */
276 int test18 (int8_t *a, int8_t *b, int8_t *c, int n)
278 int res = 0;
279 while (n > 0)
281 mve_pred16_t p = vctp8q (n);
282 int8x16_t va = vldrbq_z_s8 (a, p);
283 res = vminvq (res, va);
284 n-=16;
285 a+=16;
287 return res;
/* Folds each predicated signed load va into the scalar res with vminavq,
   which is called without a predicate; res starts at 0 and is returned after
   the loop. The parameters b and c are not used.
   NOTE(review): presumably the unpredicated across-vector reduction is what
   makes this loop ineligible for tail-predication - confirm. */
290 int test19 (int8_t *a, int8_t *b, int8_t *c, int n)
292 int res = 0;
293 while (n > 0)
295 mve_pred16_t p = vctp8q (n);
296 int8x16_t va = vldrbq_z_s8 (a, p);
297 res = vminavq (res, va);
298 n-=16;
299 a+=16;
301 return res;
/* Unsigned-element variant: folds each predicated uint8 load va into the
   scalar res with vminvq, which is called without a predicate; res starts at
   0 and is returned after the loop. The parameters b and c are not used.
   NOTE(review): presumably the unpredicated across-vector reduction is what
   makes this loop ineligible for tail-predication - confirm. */
304 int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
306 int res = 0;
307 while (n > 0)
309 mve_pred16_t p = vctp8q (n);
310 uint8x16_t va = vldrbq_z_u8 (a, p);
311 res = vminvq (res, va);
312 n-=16;
313 a+=16;
315 return res;
/* Each iteration overwrites res with vshlcq_u8 (va, b, 1), where va is a
   predicated load and b is the uint32_t pointer parameter; the incoming value
   of res is only returned if the loop body never runs.
   NOTE(review): vshlcq_u8 receives b by pointer and takes no predicate, so
   it presumably also writes through b - confirm against the ACLE. */
318 uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res)
320 while (n > 0)
322 mve_pred16_t p = vctp8q (n);
323 uint8x16_t va = vldrbq_z_u8 (a, p);
324 res = vshlcq_u8 (va, b, 1);
325 n-=16;
326 a+=16;
328 return res;
/* Signed variant: each iteration overwrites res with vshlcq_s8 (va, b, 1),
   where va is a predicated load and b is the int32_t pointer parameter; the
   incoming value of res is only returned if the loop body never runs.
   NOTE(review): vshlcq_s8 receives b by pointer and takes no predicate, so
   it presumably also writes through b - confirm against the ACLE. */
331 int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res)
333 while (n > 0)
335 mve_pred16_t p = vctp8q (n);
336 int8x16_t va = vldrbq_z_s8 (a, p);
337 res = vshlcq_s8 (va, b, 1);
338 n-=16;
339 a+=16;
341 return res;
344 /* Using an unsigned number of elements to count down from, with a > 0
   loop condition. Because n is unsigned, n -= 4 wraps around instead of going
   negative when fewer than 4 elements remain, so n > 0 only becomes false if
   n reaches exactly 0. */
345 void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n)
347 while (n > 0)
349 mve_pred16_t p = vctp32q (n);
350 int32x4_t va = vldrwq_z_s32 (a, p);
351 int32x4_t vb = vldrwq_z_s32 (b, p);
352 int32x4_t vc = vaddq_x_s32 (va, vb, p);
353 vstrwq_p_s32 (c, vc, p);
354 c+=4;
355 a+=4;
356 b+=4;
357 n-=4;
361 /* Using an unsigned number of elements to count up to, with a < n loop
   condition. Both sides of the comparison change on every iteration: the
   int counter i goes up by 16 and the unsigned bound n goes down by 16. */
362 void test24 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
364 for (int i = 0; i < n; i+=16)
/* The element count passed to vctp8q is the distance between n and i. */
366 mve_pred16_t p = vctp8q (n-i);
367 uint8x16_t va = vldrbq_z_u8 (a, p);
368 uint8x16_t vb = vldrbq_z_u8 (b, p);
369 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
370 vstrbq_p_u8 (c, vc, p);
371 n-=16;
376 /* Using an unsigned number of elements to count up to, with a <= n loop
   condition. The int counter i starts at 1 and goes up by 16 while the
   unsigned bound n goes down by 16 in the body. */
377 void test25 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
379 for (int i = 1; i <= n; i+=16)
/* The + 1 compensates for i starting at 1 instead of 0. */
381 mve_pred16_t p = vctp8q (n-i+1);
382 uint8x16_t va = vldrbq_z_u8 (a, p);
383 uint8x16_t vb = vldrbq_z_u8 (b, p);
384 uint8x16_t vc = vaddq_x_u8 (va, vb, p);
385 vstrbq_p_u8 (c, vc, p);
386 n-=16;
389 /* Update n twice in the loop: once before the predicate is generated and
   once at the end of the body, so n drops by 8 per iteration while the
   pointers advance by 4. */
390 void test26 (int32_t *a, int32_t *b, int32_t *c, int n)
392 while (n >= 1)
/* First update: vctp32q below sees the already-decremented n. */
394 n-=4;
395 mve_pred16_t p = vctp32q (n);
396 int32x4_t va = vldrwq_z_s32 (a, p);
397 int32x4_t vb = vldrwq_z_s32 (b, p);
398 int32x4_t vc = vaddq_x_s32 (va, vb, p);
399 vstrwq_p_s32 (c, vc, p);
400 c+=4;
401 a+=4;
402 b+=4;
/* Second update. */
403 n-=4;
/* Loads va with the unpredicated vldrwq_s32, adds its vaddvq_s32 reduction
   to the running scalar res, broadcasts res with vdupq_n_s32 and stores that
   vector to c under the predicate p. Only a and n move; c is not advanced.
   NOTE(review): presumably the unpredicated load feeding a reduction whose
   result is stored is what must block tail-predication - confirm. */
407 void test27 (int32_t *a, int32_t *c, int n)
409 int32_t res = 0;
410 while (n > 0)
412 mve_pred16_t p = vctp32q (n);
413 int32x4_t va = vldrwq_s32 (a);
414 res += vaddvq_s32 (va);
415 int32x4_t vc = vdupq_n_s32 (res);
416 vstrwq_p_s32 (c, vc, p);
417 a += 4;
418 n -= 4;
422 /* Using an unpredicated vcmp to generate a new predicate value in the
423 loop and then using it in a predicated store insn. Here vb is loaded with
   the unpredicated vldrwq_s32, the add is predicated by p, and the store is
   governed by p1 (from vcmpeqq_s32) rather than by p. */
424 void test28 (int32_t *a, int32_t *b, int32_t *c, int n)
426 while (n > 0)
428 mve_pred16_t p = vctp32q (n);
429 int32x4_t va = vldrwq_z_s32 (a, p);
430 int32x4_t vb = vldrwq_s32 (b);
431 int32x4_t vc = vaddq_x_s32 (va, vb, p);
432 mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
433 vstrwq_p_s32 (c, vc, p1);
434 c += 4;
435 a += 4;
436 b += 4;
437 n -= 4;
441 /* Using an unpredicated op with a scalar output, where the result is valid
442 outside the bb. The unpredicated lanes are not guaranteed zero, so would
443 affect the vaddv in the non-tail predicated case. Here vc comes from
   vaddq_m_u8 with the caller-supplied vx as its first argument, and is then
   reduced by vaddvq_u8, which takes no predicate. The parameter c is not
   used. */
444 uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
446 uint8_t sum = 0;
447 while (n > 0)
449 mve_pred16_t p = vctp8q (n);
450 uint8x16_t va = vldrbq_z_u8 (a, p);
451 uint8x16_t vb = vldrbq_z_u8 (b, p);
452 uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
453 sum += vaddvq_u8 (vc);
454 a += 16;
455 b += 16;
456 n -= 16;
458 return sum;
461 /* Same as above, but with another scalar op between the unpredicated op and
462 the scalar op outside the loop. The extra scalar op is sum += g, applied
   after the vaddvq_u8 reduction of vc (itself produced by vaddq_m_u8 with vx
   as its first argument). The parameter c is not used. */
463 uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g)
465 uint8_t sum = 0;
466 while (n > 0)
468 mve_pred16_t p = vctp8q (n);
469 uint8x16_t va = vldrbq_z_u8 (a, p);
470 uint8x16_t vb = vldrbq_z_u8 (b, p);
471 uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
472 sum += vaddvq_u8 (vc);
473 sum += g;
474 a += 16;
475 b += 16;
476 n -= 16;
478 return sum;
/* Accumulates the unpredicated load vb into the loop-carried vector vc with
   the unpredicated vaddq, and reads lane 5 of vc into res on every iteration;
   the last value read is returned. va is loaded under p but never used.
   NOTE(review): presumably the lane extract from a vector built only from
   unpredicated operations is what must block tail-predication - confirm. */
481 uint8_t test31 (uint8_t *a, uint8_t *b, int n)
483 uint8_t res = 0;
484 uint8x16_t vc = vdupq_n_u8 (0);
485 while (n > 0)
487 mve_pred16_t p = vctp8q (n);
488 uint8x16_t va = vldrbq_z_u8 (a, p);
489 uint8x16_t vb = vldrbq_u8 (b);
490 vc = vaddq (vb, vc);
491 res = vgetq_lane (vc, 5);
493 a += 16;
494 b += 16;
495 n -= 16;
497 return res;
/* Updates the loop-carried vector vc twice per iteration: first with the
   predicated vaddq_m (vc, va, vc, p), then with the unpredicated vaddq of the
   unpredicated load vb. Lane 5 of vc is read into res on every iteration and
   the last value read is returned.
   NOTE(review): presumably the unpredicated vb/vaddq contribution reaching
   the lane extract is what must block tail-predication - confirm. */
500 uint8_t test32 (uint8_t *a, uint8_t *b, int n)
502 uint8_t res = 0;
503 uint8x16_t vc = vdupq_n_u8 (0);
504 while (n > 0)
506 mve_pred16_t p = vctp8q (n);
507 uint8x16_t va = vldrbq_z_u8 (a, p);
508 uint8x16_t vb = vldrbq_u8 (b);
509 vc = vaddq_m (vc, va, vc, p);
510 vc = vaddq (vb, vc);
511 res = vgetq_lane (vc, 5);
513 a += 16;
514 b += 16;
515 n -= 16;
517 return res;
520 /* { dg-final { scan-assembler-not "\tdlstp" } } */
521 /* { dg-final { scan-assembler-not "\tletp" } } */