Add support for in-order addition reduction using SVE FADDA
[official-gcc.git] / gcc / testsuite / gcc.target / aarch64 / sve / reduc_strict_3.c
bloba28145febce65b443f3b84d12e61139b1aa23ac7
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -fno-inline -msve-vector-bits=256 -fdump-tree-vect-details" } */
/* Input matrices for the SLP reduction-chain tests below; the inner
   dimension (4, 8, 12, 3) selects how many chained additions each loop
   body performs.  */
double mat[100][4];
double mat2[100][8];
double mat3[100][12];
double mat4[100][3];
9 double
10 slp_reduc_plus (int n)
12 double tmp = 0.0;
13 for (int i = 0; i < n; i++)
15 tmp = tmp + mat[i][0];
16 tmp = tmp + mat[i][1];
17 tmp = tmp + mat[i][2];
18 tmp = tmp + mat[i][3];
20 return tmp;
23 double
24 slp_reduc_plus2 (int n)
26 double tmp = 0.0;
27 for (int i = 0; i < n; i++)
29 tmp = tmp + mat2[i][0];
30 tmp = tmp + mat2[i][1];
31 tmp = tmp + mat2[i][2];
32 tmp = tmp + mat2[i][3];
33 tmp = tmp + mat2[i][4];
34 tmp = tmp + mat2[i][5];
35 tmp = tmp + mat2[i][6];
36 tmp = tmp + mat2[i][7];
38 return tmp;
41 double
42 slp_reduc_plus3 (int n)
44 double tmp = 0.0;
45 for (int i = 0; i < n; i++)
47 tmp = tmp + mat3[i][0];
48 tmp = tmp + mat3[i][1];
49 tmp = tmp + mat3[i][2];
50 tmp = tmp + mat3[i][3];
51 tmp = tmp + mat3[i][4];
52 tmp = tmp + mat3[i][5];
53 tmp = tmp + mat3[i][6];
54 tmp = tmp + mat3[i][7];
55 tmp = tmp + mat3[i][8];
56 tmp = tmp + mat3[i][9];
57 tmp = tmp + mat3[i][10];
58 tmp = tmp + mat3[i][11];
60 return tmp;
63 void
64 slp_non_chained_reduc (int n, double * restrict out)
66 for (int i = 0; i < 3; i++)
67 out[i] = 0;
69 for (int i = 0; i < n; i++)
71 out[0] = out[0] + mat4[i][0];
72 out[1] = out[1] + mat4[i][1];
73 out[2] = out[2] + mat4[i][2];
/* Strict FP reductions shouldn't be used for the outer loops, only the
   inner loops.  */
/* Double reduction: inner loop sums a column, outer loop accumulates
   across columns.  Only the inner loop may use a strict reduction.  */
float
double_reduc1 (float (*restrict i)[16])
{
  float l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 8; b++)
      l += i[b][a];
  return l;
}
/* Double reduction whose inner loop is an SLP reduction chain of 4.  */
float
double_reduc2 (float *restrict i)
{
  float l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 16; b++)
      {
	l += i[b * 4];
	l += i[b * 4 + 1];
	l += i[b * 4 + 2];
	l += i[b * 4 + 3];
      }
  return l;
}
/* Two independent double reductions in the same loop nest.  */
float
double_reduc3 (float *restrict i, float *restrict j)
{
  float k = 0, l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 8; b++)
      {
	k += i[b];
	l += j[b];
      }
  return l * k;
}
/* We can't yet handle double_reduc1.  */
/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
/* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
   is reported three times, once for SVE, once for 128-bit AdvSIMD and once
   for 64-bit AdvSIMD.  */
/* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
   before failing.  */
/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */