1 /* { dg-do compile { target { powerpc*-*-* } } } */
2 /* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
3 /* { dg-require-effective-target powerpc_p9vector_ok } */
4 /* { dg-skip-if "" { powerpc*-*-aix* } } */
5 /* { dg-options "-O3 -mcpu=power9" } */
/* Verify that we vectorize this sum-of-absolute-differences (SAD) loop
   using the vabsdub instruction.  */
/* Local prototype for abs, so the test does not need a header.
   The attributes tell the compiler that abs never throws (nothrow),
   returns to this translation unit only by a normal return (leaf), and
   yields a result that depends only on its argument (const).  */
extern int abs (int __x)
  __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
/* Return the sum of absolute differences between two 16x16 blocks of
   unsigned bytes.

   W and X point at the first row of each block.  I and J are the
   distances, in bytes, from one row of W (respectively X) to the next.

   NOTE(review): the damaged text of this function kept only the
   parameter list, the two loop headers and the accumulation statement.
   The static linkage, the declaration and return of TOT, and the
   per-row pointer advance are restored here; they are what the
   16-per-instruction counts in the dg-final directives require, but
   should be confirmed against the upstream test.  */
static int
foo (unsigned char *w, int i, unsigned char *x, int j)
{
  int tot = 0;

  for (int a = 0; a < 16; a++)
    {
      /* One row: 16 byte-wise absolute differences.  The operands are
         promoted to int before the subtraction, so it cannot wrap.  */
      for (int b = 0; b < 16; b++)
        tot += abs (w[b] - x[b]);

      /* Step both blocks to their next row.  */
      w += i;
      x += j;
    }

  return tot;
}
/* Externally visible entry point for the test: store in *RESULT the
   value foo returns for W and X.  W is given a fixed row stride of 16,
   while X uses the caller-supplied stride I.  */
void
bar (unsigned char *w, unsigned char *x, int i, int *result)
{
  *result = foo (w, 16, x, i);
}
31 /* { dg-final { scan-assembler-times "vabsdub" 16 } } */
32 /* { dg-final { scan-assembler-times "vsum4ubs" 16 } } */
33 /* { dg-final { scan-assembler-times "vadduwm" 17 } } */
/* Note: One of the 16 adds is optimized out (add with zero),
   leaving 15.  The extra two adds are for the final reduction,
   which gives the 17 vadduwm instructions expected above.  */