IVOPT performance tuning patch. The main problem is a variant of maximal weight [...]
[official-gcc.git] / gcc / testsuite / gcc.dg / vmx / fft.c
blob: 2b8a537c6693ed0e14679744d4473f75bd961638
1 /* { dg-do compile } */
2 #include <altivec.h>
4 inline void
5 transpose4x4(vector float *matrix)
7 vector float v0, v1, v2, v3;
9 v0 = vec_mergeh(matrix[0], matrix[2]);
10 v1 = vec_mergel(matrix[0], matrix[2]);
11 v2 = vec_mergeh(matrix[1], matrix[3]);
12 v3 = vec_mergel(matrix[1], matrix[3]);
14 matrix[0] = vec_mergeh(v0, v2);
15 matrix[1] = vec_mergel(v0, v2);
16 matrix[2] = vec_mergeh(v1, v3);
17 matrix[3] = vec_mergel(v1, v3);
20 void
21 vec_ifft64(vector float *x0, vector float *x1)
23 int i;
24 vector float real[4], imag[4];
25 vector float c0r, c1r, c2r, c3r, c0i, c1i, c2i, c3i;
26 vector float d0r, d1r, d2r, d3r, d0i, d1i, d2i, d3i;
29 * N=64
31 * Stage 1: t=1 => k = 0, j = 0..15
32 * ================================
33 * for j = 0:15
34 * c0 = x0(j+0*16);
35 * c1 = x0(j+1*16);
36 * c2 = x0(j+2*16);
37 * c3 = x0(j+3*16);
39 * d0 = c0 + c2;
40 * d1 = c0 - c2;
41 * d2 = c1 + c3;
42 * d3 = i*(c1 - c3);
44 * x1(4j+0) = d0 + d2;
45 * x1(4j+1) = d1 + d3;
46 * x1(4j+2) = d0 - d2;
47 * x1(4j+3) = d1 - d3;
48 * end
49 ******************************************************/
51 for (i=0; i < 4; i++)
53 c0r = x0[i];
54 c1r = x0[i+4];
55 c2r = x0[i+8];
56 c3r = x0[i+12];
58 c0i = x0[i+16];
59 c1i = x0[i+20];
60 c2i = x0[i+24];
61 c3i = x0[i+28];
63 d0r = vec_add(c0r, c2r);
64 d1r = vec_sub(c0r, c2r);
65 d2r = vec_add(c1r, c3r);
66 d3r = vec_sub(c3i, c1i);
68 d0i = vec_add(c0i, c2i);
69 d1i = vec_sub(c0i, c2i);
70 d2i = vec_add(c1i, c3i);
71 d3i = vec_sub(c1r, c3r);
73 /* Calculate real{x1} */
74 real[0] = vec_add(d0r, d2r);
75 real[1] = vec_add(d1r, d3r);
76 real[2] = vec_sub(d0r, d2r);
77 real[3] = vec_sub(d1r, d3r);
79 transpose4x4(real);
81 /* Calculate imag{x1} */
82 imag[0] = vec_add(d0i, d2i);
83 imag[1] = vec_add(d1i, d3i);
84 imag[2] = vec_sub(d0i, d2i);
85 imag[3] = vec_sub(d1i, d3i);
87 transpose4x4(imag);
89 x1[4*i] = real[0];
90 x1[4*i+1] = real[1];
91 x1[4*i+2] = real[2];
92 x1[4*i+3] = real[3];
94 x1[4*i+16] = imag[0];
95 x1[4*i+17] = imag[1];
96 x1[4*i+18] = imag[2];
97 x1[4*i+19] = imag[3];