Add support for in-order addition reduction using SVE FADDA
[official-gcc.git] / gcc / testsuite / gcc.target / aarch64 / sve / reduc_strict_3.c
bloba28145febce65b443f3b84d12e61139b1aa23ac7
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -fno-inline -msve-vector-bits=256 -fdump-tree-vect-details" } */
/* Input matrices for the SLP reduction-chain tests below; the inner
   dimension (4, 8, 12, 3) selects how many chained additions each loop
   body performs.  */
double mat[100][4];
double mat2[100][8];
double mat3[100][12];
double mat4[100][3];
9 double
10 slp_reduc_plus (int n)
12 double tmp = 0.0;
13 for (int i = 0; i < n; i++)
15 tmp = tmp + mat[i][0];
16 tmp = tmp + mat[i][1];
17 tmp = tmp + mat[i][2];
18 tmp = tmp + mat[i][3];
20 return tmp;
23 double
24 slp_reduc_plus2 (int n)
26 double tmp = 0.0;
27 for (int i = 0; i < n; i++)
29 tmp = tmp + mat2[i][0];
30 tmp = tmp + mat2[i][1];
31 tmp = tmp + mat2[i][2];
32 tmp = tmp + mat2[i][3];
33 tmp = tmp + mat2[i][4];
34 tmp = tmp + mat2[i][5];
35 tmp = tmp + mat2[i][6];
36 tmp = tmp + mat2[i][7];
38 return tmp;
41 double
42 slp_reduc_plus3 (int n)
44 double tmp = 0.0;
45 for (int i = 0; i < n; i++)
47 tmp = tmp + mat3[i][0];
48 tmp = tmp + mat3[i][1];
49 tmp = tmp + mat3[i][2];
50 tmp = tmp + mat3[i][3];
51 tmp = tmp + mat3[i][4];
52 tmp = tmp + mat3[i][5];
53 tmp = tmp + mat3[i][6];
54 tmp = tmp + mat3[i][7];
55 tmp = tmp + mat3[i][8];
56 tmp = tmp + mat3[i][9];
57 tmp = tmp + mat3[i][10];
58 tmp = tmp + mat3[i][11];
60 return tmp;
63 void
64 slp_non_chained_reduc (int n, double * restrict out)
66 for (int i = 0; i < 3; i++)
67 out[i] = 0;
69 for (int i = 0; i < n; i++)
71 out[0] = out[0] + mat4[i][0];
72 out[1] = out[1] + mat4[i][1];
73 out[2] = out[2] + mat4[i][2];
/* Strict FP reductions shouldn't be used for the outer loops, only the
   inner loops.  */
/* Double reduction: inner loop sums a column, outer loop accumulates
   across columns.  Only the inner loop may use a strict reduction.  */
float
double_reduc1 (float (*restrict i)[16])
{
  float l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 8; b++)
      l += i[b][a];
  return l;
}
/* Double reduction whose inner loop is an SLP reduction chain of 4.  */
float
double_reduc2 (float *restrict i)
{
  float l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 16; b++)
      {
	l += i[b * 4];
	l += i[b * 4 + 1];
	l += i[b * 4 + 2];
	l += i[b * 4 + 3];
      }
  return l;
}
/* Two independent double reductions in the same loop nest.  */
float
double_reduc3 (float *restrict i, float *restrict j)
{
  float k = 0, l = 0;

  for (int a = 0; a < 8; a++)
    for (int b = 0; b < 8; b++)
      {
	k += i[b];
	l += j[b];
      }
  return l * k;
}
/* We can't yet handle double_reduc1.  */
/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
/* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
   is reported three times, once for SVE, once for 128-bit AdvSIMD and once
   for 64-bit AdvSIMD.  */
/* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
   before failing.  */
/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */