1 /* { dg-do compile } */
2 /* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
6 #define VEC_PERM(TYPE) \
7 void __attribute__ ((noinline, noclone)) \
8 vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
14 for (int i = 0; i < n; ++i) \
42 /* We can't use SLP for the 64-bit loops, since the number of reduction
43 results might be greater than the number of elements in the vector.
44 Otherwise we have two loads per loop, one for the initial vector
45 and one for the loop body. */
46 /* ??? At present we don't treat the int8_t and int16_t loops as
48 /* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
49 /* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
50 /* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
51 /* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
52 /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
53 /* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
54 /* { dg-final { scan-assembler-not {\tld4b\t} } } */
55 /* { dg-final { scan-assembler-not {\tld4h\t} } } */
56 /* { dg-final { scan-assembler-not {\tld4w\t} } } */
57 /* { dg-final { scan-assembler-not {\tld1d\t} } } */
58 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */
59 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */
60 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */
61 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
62 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
63 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
64 /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
65 /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
66 /* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
68 /* Should be 4 and 6 respectively, if we used reductions for int8_t and
70 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */
71 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */
72 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
73 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
75 /* { dg-final { scan-assembler-not {\tuqdec} } } */