gcc/testsuite/gcc.target/powerpc/sad-vectorize-1.c

   1 /* { dg-do compile { target { powerpc*-*-* } } } */
   2 /* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
   3 /* { dg-require-effective-target powerpc_p9vector_ok } */
   4 /* { dg-skip-if "" { powerpc*-*-aix* } } */
   5 /* { dg-options "-O3 -mcpu=power9" } */
   6
   7 /* Verify that we vectorize this SAD loop using vabsdub. */
   8
   9 extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
  10
  11 static int
  12 foo (unsigned char *w, int i, unsigned char *x, int j)
  13 {
       /* Sum of absolute differences over 16 rows of 16 bytes each.  W and X
          point at the first row of each input; after every row W advances
          by I bytes and X by J bytes.  Returns the accumulated total.  */
  14   int tot = 0;
  15   for (int a = 0; a < 16; a++)
  16     {
           /* The unsigned char operands promote to int before the
              subtraction, so the difference may be negative and abs is
              required.  The dg-final directives in this file expect this
              loop to be vectorized using vabsdub.  */
  17       for (int b = 0; b < 16; b++)
  18         tot += abs (w[b] - x[b]);
           /* Step both pointers to the next row, each by its own stride.  */
  19       w += i;
  20       x += j;
  21     }
  22   return tot;
  23 }
  24
  25 void
  26 bar (unsigned char *w, unsigned char *x, int i, int *result)
  27 {
       /* Non-static entry point: calls foo with a constant row stride of 16
          for W and the caller-supplied stride I for X, and stores the
          returned sum through RESULT.  */
  28   *result = foo (w, 16, x, i);
  29 }
  30
  31 /* { dg-final { scan-assembler-times "vabsdub" 16 } } */
  32 /* { dg-final { scan-assembler-times "vsum4ubs" 16 } } */
  33 /* { dg-final { scan-assembler-times "vadduwm" 17 } } */
  34
  35 /* Note: One of the 16 accumulating adds is optimized out (add with zero),
  36    leaving 15.  Two more adds perform the final reduction, giving 17.  */