2 /* { dg-options "-O3 -fno-inline -save-temps -fno-vect-cost-model -fno-ipa-icf" } */
4 #pragma GCC target "+nosve"
6 typedef signed char S8_t
;
7 typedef signed short S16_t
;
8 typedef signed int S32_t
;
9 typedef signed long long S64_t
;
10 typedef signed char *__restrict__ pS8_t
;
11 typedef signed short *__restrict__ pS16_t
;
12 typedef signed int *__restrict__ pS32_t
;
13 typedef signed long long *__restrict__ pS64_t
;
14 typedef unsigned char U8_t
;
15 typedef unsigned short U16_t
;
16 typedef unsigned int U32_t
;
17 typedef unsigned long long U64_t
;
18 typedef unsigned char *__restrict__ pU8_t
;
19 typedef unsigned short *__restrict__ pU16_t
;
20 typedef unsigned int *__restrict__ pU32_t
;
21 typedef unsigned long long *__restrict__ pU64_t
;
26 test_addS64_tS32_t4 (pS64_t a
, pS32_t b
, pS32_t c
)
29 for (i
= 0; i
< 4; i
++)
30 a
[i
] += (S64_t
) b
[i
] * (S64_t
) c
[i
];
33 /* { dg-final { scan-assembler "smlal\tv\[0-9\]+\.2d" } } */
34 /* { dg-final { scan-assembler "smlal2\tv\[0-9\]+\.2d" } } */
37 test_addS32_tS16_t8 (pS32_t a
, pS16_t b
, pS16_t c
)
40 for (i
= 0; i
< 8; i
++)
41 a
[i
] += (S32_t
) b
[i
] * (S32_t
) c
[i
];
44 /* { dg-final { scan-assembler "smlal\tv\[0-9\]+\.4s" } } */
45 /* { dg-final { scan-assembler "smlal2\tv\[0-9\]+\.4s" } } */
48 test_addS16_tS8_t16 (pS16_t a
, pS8_t b
, pS8_t c
)
51 for (i
= 0; i
< 16; i
++)
52 a
[i
] += (S16_t
) b
[i
] * (S16_t
) c
[i
];
56 test_addS16_tS8_t16_neg0 (pS16_t a
, pS8_t b
, pS8_t c
)
59 for (i
= 0; i
< 16; i
++)
60 a
[i
] += (S16_t
) -b
[i
] * (S16_t
) -c
[i
];
64 test_addS16_tS8_t16_neg1 (pS16_t a
, pS8_t b
, pS8_t c
)
67 for (i
= 0; i
< 16; i
++)
68 a
[i
] -= (S16_t
) b
[i
] * (S16_t
) -c
[i
];
72 test_addS16_tS8_t16_neg2 (pS16_t a
, pS8_t b
, pS8_t c
)
75 for (i
= 0; i
< 16; i
++)
76 a
[i
] -= (S16_t
) -b
[i
] * (S16_t
) c
[i
];
79 /* { dg-final { scan-assembler-times "smlal\tv\[0-9\]+\.8h" 4 } } */
80 /* { dg-final { scan-assembler-times "smlal2\tv\[0-9\]+\.8h" 4 } } */
83 test_subS64_tS32_t4 (pS64_t a
, pS32_t b
, pS32_t c
)
86 for (i
= 0; i
< 4; i
++)
87 a
[i
] -= (S64_t
) b
[i
] * (S64_t
) c
[i
];
90 /* { dg-final { scan-assembler "smlsl\tv\[0-9\]+\.2d" } } */
91 /* { dg-final { scan-assembler "smlsl2\tv\[0-9\]+\.2d" } } */
94 test_subS32_tS16_t8 (pS32_t a
, pS16_t b
, pS16_t c
)
97 for (i
= 0; i
< 8; i
++)
98 a
[i
] -= (S32_t
) b
[i
] * (S32_t
) c
[i
];
101 /* { dg-final { scan-assembler "smlsl\tv\[0-9\]+\.4s" } } */
102 /* { dg-final { scan-assembler "smlsl2\tv\[0-9\]+\.4s" } } */
105 test_subS16_tS8_t16 (pS16_t a
, pS8_t b
, pS8_t c
)
108 for (i
= 0; i
< 16; i
++)
109 a
[i
] -= (S16_t
) b
[i
] * (S16_t
) c
[i
];
113 test_subS16_tS8_t16_neg0 (pS16_t a
, pS8_t b
, pS8_t c
)
116 for (i
= 0; i
< 16; i
++)
117 a
[i
] += (S16_t
) -b
[i
] * (S16_t
) c
[i
];
121 test_subS16_tS8_t16_neg1 (pS16_t a
, pS8_t b
, pS8_t c
)
124 for (i
= 0; i
< 16; i
++)
125 a
[i
] += (S16_t
) b
[i
] * (S16_t
) -c
[i
];
129 test_subS16_tS8_t16_neg2 (pS16_t a
, pS8_t b
, pS8_t c
)
132 for (i
= 0; i
< 16; i
++)
133 a
[i
] += -((S16_t
) b
[i
] * (S16_t
) c
[i
]);
137 test_subS16_tS8_t16_neg3 (pS16_t a
, pS8_t b
, pS8_t c
)
140 for (i
= 0; i
< 16; i
++)
141 a
[i
] -= (S16_t
) -b
[i
] * (S16_t
) -c
[i
];
144 /* { dg-final { scan-assembler-times "smlsl\tv\[0-9\]+\.8h" 5 } } */
145 /* { dg-final { scan-assembler-times "smlsl2\tv\[0-9\]+\.8h" 5 } } */
148 test_addU64_tU32_t4 (pU64_t a
, pU32_t b
, pU32_t c
)
151 for (i
= 0; i
< 4; i
++)
152 a
[i
] += (U64_t
) b
[i
] * (U64_t
) c
[i
];
155 /* { dg-final { scan-assembler "umlal\tv\[0-9\]+\.2d" } } */
156 /* { dg-final { scan-assembler "umlal2\tv\[0-9\]+\.2d" } } */
159 test_addU32_tU16_t8 (pU32_t a
, pU16_t b
, pU16_t c
)
162 for (i
= 0; i
< 8; i
++)
163 a
[i
] += (U32_t
) b
[i
] * (U32_t
) c
[i
];
166 /* { dg-final { scan-assembler "umlal\tv\[0-9\]+\.4s" } } */
167 /* { dg-final { scan-assembler "umlal2\tv\[0-9\]+\.4s" } } */
170 test_addU16_tU8_t16 (pU16_t a
, pU8_t b
, pU8_t c
)
173 for (i
= 0; i
< 16; i
++)
174 a
[i
] += (U16_t
) b
[i
] * (U16_t
) c
[i
];
177 /* { dg-final { scan-assembler "umlal\tv\[0-9\]+\.8h" } } */
178 /* { dg-final { scan-assembler "umlal2\tv\[0-9\]+\.8h" } } */
181 test_subU64_tU32_t4 (pU64_t a
, pU32_t b
, pU32_t c
)
184 for (i
= 0; i
< 4; i
++)
185 a
[i
] -= (U64_t
) b
[i
] * (U64_t
) c
[i
];
188 /* { dg-final { scan-assembler "umlsl\tv\[0-9\]+\.2d" } } */
189 /* { dg-final { scan-assembler "umlsl2\tv\[0-9\]+\.2d" } } */
192 test_subU32_tU16_t8 (pU32_t a
, pU16_t b
, pU16_t c
)
195 for (i
= 0; i
< 8; i
++)
196 a
[i
] -= (U32_t
) b
[i
] * (U32_t
) c
[i
];
199 /* { dg-final { scan-assembler "umlsl\tv\[0-9\]+\.4s" } } */
200 /* { dg-final { scan-assembler "umlsl2\tv\[0-9\]+\.4s" } } */
203 test_subU16_tU8_t16 (pU16_t a
, pU8_t b
, pU8_t c
)
206 for (i
= 0; i
< 16; i
++)
207 a
[i
] -= (U16_t
) b
[i
] * (U16_t
) c
[i
];
210 /* { dg-final { scan-assembler "umlsl\tv\[0-9\]+\.8h" } } */
211 /* { dg-final { scan-assembler "umlsl2\tv\[0-9\]+\.8h" } } */
214 S64_t add_rS64
[4] = { 6, 7, -4, -3 };
215 S32_t add_rS32
[8] = { 6, 7, -4, -3, 10, 11, 0, 1 };
217 { 6, 7, -4, -3, 10, 11, 0, 1, 14, 15, 4, 5, 18, 19, 8, 9 };
219 S64_t sub_rS64
[4] = { 0, 1, 2, 3 };
220 S32_t sub_rS32
[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
221 S16_t sub_rS16
[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
223 U64_t add_rU64
[4] = { 0x6, 0x7, 0x2fffffffc, 0x2fffffffd };
227 0x6, 0x7, 0x2fffc, 0x2fffd,
228 0xa, 0xb, 0x30000, 0x30001
233 0x6, 0x7, 0x2fc, 0x2fd, 0xa, 0xb, 0x300, 0x301,
234 0xe, 0xf, 0x304, 0x305, 0x12, 0x13, 0x308, 0x309
237 U64_t sub_rU64
[4] = { 0, 1, 2, 3 };
238 U32_t sub_rU32
[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
239 U16_t sub_rU16
[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
241 S8_t neg_r
[16] = { -6, -5, 8, 9, -2, -1, 12, 13, 2, 3, 16, 17, 6, 7, 20, 21 };
243 S64_t S64_ta
[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
244 S32_t S32_tb
[16] = { 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2 };
245 S32_t S32_tc
[16] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 };
247 S32_t S32_ta
[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
248 S16_t S16_tb
[16] = { 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2 };
249 S16_t S16_tc
[16] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 };
251 S16_t S16_ta
[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
252 S8_t S8_tb
[16] = { 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2, 2, 2, -2, -2 };
253 S8_t S8_tc
[16] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 };
256 #define CHECK(T,N,AS,US) \
259 for (i = 0; i < N; i++) \
260 if (S##T##_ta[i] != AS##_r##US##T[i]) \
265 #define SCHECK(T,N,AS) CHECK(T,N,AS,S)
266 #define UCHECK(T,N,AS) CHECK(T,N,AS,U)
268 #define NCHECK(RES) \
271 for (i = 0; i < 16; i++) \
272 if (S16_ta[i] != RES[i]) \
283 test_addS64_tS32_t4 (S64_ta
, S32_tb
, S32_tc
);
285 test_addS32_tS16_t8 (S32_ta
, S16_tb
, S16_tc
);
287 test_addS16_tS8_t16 (S16_ta
, S8_tb
, S8_tc
);
288 SCHECK (16, 16, add
);
289 test_subS64_tS32_t4 (S64_ta
, S32_tb
, S32_tc
);
291 test_subS32_tS16_t8 (S32_ta
, S16_tb
, S16_tc
);
293 test_subS16_tS8_t16 (S16_ta
, S8_tb
, S8_tc
);
294 SCHECK (16, 16, sub
);
296 test_addU64_tU32_t4 (S64_ta
, S32_tb
, S32_tc
);
298 test_addU32_tU16_t8 (S32_ta
, S16_tb
, S16_tc
);
300 test_addU16_tU8_t16 (S16_ta
, S8_tb
, S8_tc
);
301 UCHECK (16, 16, add
);
302 test_subU64_tU32_t4 (S64_ta
, S32_tb
, S32_tc
);
304 test_subU32_tU16_t8 (S32_ta
, S16_tb
, S16_tc
);
306 test_subU16_tU8_t16 (S16_ta
, S8_tb
, S8_tc
);
307 UCHECK (16, 16, sub
);
309 test_addS16_tS8_t16_neg0 (S16_ta
, S8_tb
, S8_tc
);
311 test_subS16_tS8_t16_neg0 (S16_ta
, S8_tb
, S8_tc
);
313 test_addS16_tS8_t16_neg1 (S16_ta
, S8_tb
, S8_tc
);
315 test_subS16_tS8_t16_neg1 (S16_ta
, S8_tb
, S8_tc
);
317 test_addS16_tS8_t16_neg2 (S16_ta
, S8_tb
, S8_tc
);
319 test_subS16_tS8_t16_neg2 (S16_ta
, S8_tb
, S8_tc
);
321 test_subS16_tS8_t16_neg3 (S16_ta
, S8_tb
, S8_tc
);