Add support for conditional reductions using SVE CLASTB
[official-gcc.git] / gcc / testsuite / gcc.target / aarch64 / sve / slp_7.c
blob9e6aa8ccbf81c204edc5f9faa37292f0180f0c4d
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
4 #include <stdint.h>
6 #define VEC_PERM(TYPE) \
7 void __attribute__ ((noinline, noclone)) \
8 vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
9 { \
10 TYPE x0 = b[0]; \
11 TYPE x1 = b[1]; \
12 TYPE x2 = b[2]; \
13 TYPE x3 = b[3]; \
14 for (int i = 0; i < n; ++i) \
15 { \
16 x0 += a[i * 4]; \
17 x1 += a[i * 4 + 1]; \
18 x2 += a[i * 4 + 2]; \
19 x3 += a[i * 4 + 3]; \
20 } \
21 b[0] = x0; \
22 b[1] = x1; \
23 b[2] = x2; \
24 b[3] = x3; \
27 #define TEST_ALL(T) \
28 T (int8_t) \
29 T (uint8_t) \
30 T (int16_t) \
31 T (uint16_t) \
32 T (int32_t) \
33 T (uint32_t) \
34 T (int64_t) \
35 T (uint64_t) \
36 T (_Float16) \
37 T (float) \
38 T (double)
40 TEST_ALL (VEC_PERM)
/* We can't use SLP for the 64-bit loops, since the number of reduction
   results might be greater than the number of elements in the vector.
   Otherwise we have two loads per loop, one for the initial vector
   and one for the loop body.  */
/* ??? At present we don't treat the int8_t and int16_t loops as
   reductions.  */
/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
/* { dg-final { scan-assembler-not {\tld4b\t} } } */
/* { dg-final { scan-assembler-not {\tld4h\t} } } */
/* { dg-final { scan-assembler-not {\tld4w\t} } } */
/* { dg-final { scan-assembler-not {\tld1d\t} } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
/* Should be 4 and 6 respectively, if we used reductions for int8_t and
   int16_t.  */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
/* { dg-final { scan-assembler-not {\tuqdec} } } */