Sansa clip+: do not set GPIO B7 in the display driver, it's already used for FM radio I2C
[kugel-rb.git] / apps / codecs / lib / fft-ffmpeg_arm.h
blob073ad8ee46f405a6cdae8ef563577ef1c88d2d29
1 /***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
10 * Copyright (C) 2010 Dave Hooper
12 * ARM optimisations for ffmpeg's fft (used in fft-ffmpeg.c)
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
20 * KIND, either express or implied.
22 ****************************************************************************/
24 #ifdef CPU_ARM
/* Start off with optimised variants of the butterflies that work
   nicely on arm */
/* 1. where y and a share the same variable/register.
 *
 *    y = a + b;  x = a - b   (x is derived as y - 2*b, so the old value
 *    of a is no longer needed once y has been written).
 *
 *    Every argument is parenthesised so that expression arguments such as
 *    "p|q" expand with the intended precedence.  The body stays a plain
 *    brace block (not do/while(0)) so that existing call sites keep
 *    compiling whether or not they add a trailing ';'.
 */
#define BF_OPT(x,y,a,b) {\
    (y) = (a) + (b);\
    (x) = (y) - ((b)<<1);\
}
/* 2. where y and b share the same variable/register.
 *
 *    x = a - b;  y = a + b   (y is derived as x + 2*b, reading b before
 *    it is overwritten through y).
 *
 *    Arguments are parenthesised so expression arguments expand safely;
 *    the plain brace block is kept for compatibility with call sites.
 */
#define BF_OPT2(x,y,a,b) {\
    (x) = (a) - (b);\
    (y) = (x) + ((b)<<1);\
}
/* 3. where y and b share the same variable/register (but y=(-b)).
 *
 *    x = a + b;  y = a - b   (y is derived as x - 2*b, reading b before
 *    it is overwritten through y).
 *
 *    Arguments are parenthesised so expression arguments expand safely;
 *    the plain brace block is kept for compatibility with call sites.
 */
#define BF_OPT2_REV(x,y,a,b) {\
    (x) = (a) + (b);\
    (y) = (x) - ((b)<<1);\
}
/* standard BUTTERFLIES package. Note, we actually manually inline this
   in all the TRANSFORM functions below anyway.

   The macro reads and writes t1, t2, t5 and t6, which must already exist
   in the scope where it is expanded.  a0..a3 are complex values with .re
   and .im members; they are parenthesised before member access so that
   expression arguments (e.g. *ptr or z[i]) expand correctly.

   NOTE(review): FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES is presumably tested
   by the generic FFT source to skip its own BUTTERFLIES definition -
   confirm against the including file. */
#define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
#define BUTTERFLIES(a0,a1,a2,a3) {\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    BF_OPT((a2).re, (a0).re, (a0).re, t5);\
    BF_OPT((a2).im, (a0).im, (a0).im, t2);\
    BF_OPT((a3).re, (a1).re, (a1).re, t6);\
    BF_OPT((a3).im, (a1).im, (a1).im, t1);\
}
61 #define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM
63 static inline FFTComplex* TRANSFORM( FFTComplex* z, int n, FFTSample wre, FFTSample wim )
65 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
66 z += n*2; /* z[o2] */
67 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
68 XPROD31_R(r_re, r_im, wre, wim, t1,t2);
70 z += n; /* z[o3] */
71 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
72 XNPROD31_R(r_re, r_im, wre, wim, t5,t6);
74 BF_OPT(t1, t5, t5, t1);
75 BF_OPT(t6, t2, t2, t6);
78 register FFTSample rt0temp asm("r4");
79 /*{*/
80 /* BF_OPT(t1, t5, t5, t1);*/
81 /* BF_OPT(t6, t2, t2, t6);*/
82 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
83 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
84 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
85 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
86 /*}*/
87 z -= n*3;
88 /* r_re = my_z[0]; r_im = my_z[1]; */
89 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
90 BF_OPT(rt0temp, r_re, r_re, t5);
91 BF_OPT(t2, r_im, r_im, t2);
92 /* my_z[0] = r_re; my_z[1] = r_im; */
93 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory" );
94 z += n;
95 /* r_re = my_z[0]; r_im = my_z[1]; */
96 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
97 BF_OPT(t5, r_re, r_re, t6);
98 BF_OPT(t6, r_im, r_im, t1);
99 /* my_z[0] = r_re; my_z[1] = r_im; */
100 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
101 z += n;
102 /* my_z[0] = rt0temp; my_z[1] = t2; */
103 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
105 z += n;
107 /* my_z[0] = t5; my_z[1] = t6; */
108 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
109 z -= n*3;
110 return(z);
113 static inline FFTComplex* TRANSFORM_W01( FFTComplex* z, int n, const FFTSample* w )
115 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
117 /* load wre,wim into t5,t6 */
118 asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (t5), [wim] "=r" (t6):[w] "r" (w));
119 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
120 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
121 XPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t1,t2);
123 z += n; /* z[o3] */
124 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
125 XNPROD31_R(r_re, r_im, t5 /*wre*/, t6 /*wim*/, t5,t6);
127 BF_OPT(t1, t5, t5, t1);
128 BF_OPT(t6, t2, t2, t6);
130 register FFTSample rt0temp asm("r4");
131 /*{*/
132 /* BF_OPT(t1, t5, t5, t1);*/
133 /* BF_OPT(t6, t2, t2, t6);*/
134 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
135 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
136 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
137 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
138 /*}*/
139 z -= n*3;
140 /* r_re = my_z[0]; r_im = my_z[1]; */
141 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
142 BF_OPT(rt0temp, r_re, r_re, t5);
143 BF_OPT(t2, r_im, r_im, t2);
144 /* my_z[0] = r_re; my_z[1] = r_im; */
145 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
146 z += n;
147 /* r_re = my_z[0]; r_im = my_z[1]; */
148 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
149 BF_OPT(t5, r_re, r_re, t6);
150 BF_OPT(t6, r_im, r_im, t1);
151 /* my_z[0] = r_re; my_z[1] = r_im; */
152 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
153 z += n;
154 /* my_z[0] = rt0temp; my_z[1] = t2; */
155 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
157 z += n;
159 /* my_z[0] = t5; my_z[1] = t6; */
160 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
161 z -= n*3;
162 return(z);
165 static inline FFTComplex* TRANSFORM_W10( FFTComplex* z, int n, const FFTSample* w )
167 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
169 /* load wim,wre into t5,t6 */
170 asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (t5), [wre] "=r" (t6):[w] "r" (w));
171 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
172 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
173 XPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t1,t2);
175 z += n; /* z[o3] */
176 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
177 XNPROD31_R(r_re, r_im, t6 /*wim*/, t5 /*wre*/, t5,t6);
179 BF_OPT(t1, t5, t5, t1);
180 BF_OPT(t6, t2, t2, t6);
182 register FFTSample rt0temp asm("r4");
183 /*{*/
184 /* BF_OPT(t1, t5, t5, t1);*/
185 /* BF_OPT(t6, t2, t2, t6);*/
186 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
187 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
188 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
189 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
190 /*}*/
191 z -= n*3;
192 /* r_re = my_z[0]; r_im = my_z[1]; */
193 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
194 BF_OPT(rt0temp, r_re, r_re, t5);
195 BF_OPT(t2, r_im, r_im, t2);
196 /* my_z[0] = r_re; my_z[1] = r_im; */
197 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
198 z += n;
199 /* r_re = my_z[0]; r_im = my_z[1]; */
200 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
201 BF_OPT(t5, r_re, r_re, t6);
202 BF_OPT(t6, r_im, r_im, t1);
203 /* my_z[0] = r_re; my_z[1] = r_im; */
204 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
205 z += n;
206 /* my_z[0] = rt0temp; my_z[1] = t2; */
207 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
209 z += n;
211 /* my_z[0] = t5; my_z[1] = t6; */
212 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
213 z -= n*3;
214 return(z);
217 static inline FFTComplex* TRANSFORM_EQUAL( FFTComplex* z, int n )
219 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"),r_re asm("r8"),r_im asm("r9");
221 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
222 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
223 z += n; /* z[o3] */
224 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
226 /**/
227 /*t2 = MULT32(cPI2_8, t5);*/
228 /*t1 = MULT31(cPI2_8, t6);*/
229 /*t6 = MULT31(cPI2_8, r_re);*/
230 /*t5 = MULT32(cPI2_8, r_im);*/
232 /*t1 = ( t1 + (t2<<1) );*/
233 /*t2 = ( t1 - (t2<<2) );*/
234 /*t6 = ( t6 + (t5<<1) );*/
235 /*t5 = ( t6 - (t5<<2) );*/
236 /**/
237 t2 = MULT31(cPI2_8, t5);
238 t6 = MULT31(cPI2_8, t6);
239 r_re = MULT31(cPI2_8, r_re);
240 t5 = MULT31(cPI2_8, r_im);
242 t1 = ( t6 + t2 );
243 t2 = ( t6 - t2 );
244 t6 = ( r_re + t5 );
245 t5 = ( r_re - t5 );
247 BF_OPT(t1, t5, t5, t1);
248 BF_OPT(t6, t2, t2, t6);
250 register FFTSample rt0temp asm("r4");
251 /*{*/
252 /* BF_OPT(t1, t5, t5, t1);*/
253 /* BF_OPT(t6, t2, t2, t6);*/
254 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
255 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
256 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
257 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
258 /*}*/
259 z -= n*3;
260 /* r_re = my_z[0]; r_im = my_z[1]; */
261 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
262 BF_OPT(rt0temp, r_re, r_re, t5);
263 BF_OPT(t2, r_im, r_im, t2);
264 /* my_z[0] = r_re; my_z[1] = r_im; */
265 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
266 z += n;
267 /* r_re = my_z[0]; r_im = my_z[1]; */
268 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
269 BF_OPT(t5, r_re, r_re, t6);
270 BF_OPT(t6, r_im, r_im, t1);
271 /* my_z[0] = r_re; my_z[1] = r_im; */
272 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
273 z += n;
274 /* my_z[0] = rt0temp; my_z[1] = t2; */
275 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
277 z += n;
279 /* my_z[0] = t5; my_z[1] = t6; */
280 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
281 z -= n*3;
282 return(z);
285 static inline FFTComplex* TRANSFORM_ZERO( FFTComplex* z, int n )
287 register FFTSample t1,t2 asm("r5"),t5 asm("r6"),t6 asm("r7"), r_re asm("r8"), r_im asm("r9");
289 z += n*2; /* z[o2] -- 2n * 2 since complex numbers */
290 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
291 z += n; /* z[o3] */
292 asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));
294 BF_OPT(t1, t5, t5, r_re);
295 BF_OPT(t6, t2, r_im, t6);
297 register FFTSample rt0temp asm("r4");
298 /*{*/
299 /* BF_OPT(t1, t5, t5, t1);*/
300 /* BF_OPT(t6, t2, t2, t6);*/
301 /* BF_OPT(a2.re, a0.re, a0.re, t5);*/
302 /* BF_OPT(a2.im, a0.im, a0.im, t2);*/
303 /* BF_OPT(a3.re, a1.re, a1.re, t6);*/
304 /* BF_OPT(a3.im, a1.im, a1.im, t1);*/
305 /*}*/
306 z -= n*3;
307 /* r_re = my_z[0]; r_im = my_z[1]; */
308 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
309 BF_OPT(rt0temp, r_re, r_re, t5);
310 BF_OPT(t2, r_im, r_im, t2);
311 /* my_z[0] = r_re; my_z[1] = r_im; */
312 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
313 z += n;
314 /* r_re = my_z[0]; r_im = my_z[1]; */
315 asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));
316 BF_OPT(t5, r_re, r_re, t6);
317 BF_OPT(t6, r_im, r_im, t1);
318 /* my_z[0] = r_re; my_z[1] = r_im; */
319 asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im):"memory");
320 z += n;
321 /* my_z[0] = rt0temp; my_z[1] = t2; */
322 asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2):"memory");
324 z += n;
326 /* my_z[0] = t5; my_z[1] = t6; */
327 asm volatile( "stmia %[my_z]!, {%[t5],%[t6]}\n\t":[my_z] "+r" (z) : [t5] "r" (t5), [t6] "r" (t6):"memory");
328 z -= n*3;
329 return(z);
332 #define FFT_FFMPEG_INCL_OPTIMISED_FFT4
333 static inline FFTComplex* fft4(FFTComplex * z)
335 FFTSample temp;
337 /* input[0..7] -> output[0..7] */
338 /* load r1=z[0],r2=z[1],...,r8=z[7] */
339 asm volatile(
340 "ldmia %[z], {r1-r8}\n\t"
341 "add r1,r1,r3\n\t" /* r1 :=t1 */
342 "sub r3,r1,r3, lsl #1\n\t" /* r3 :=t3 */
343 "sub r7,r7,r5\n\t" /* r10:=t8 */
344 "add r5,r7,r5, lsl #1\n\t" /* r5 :=t6 */
346 "add r1,r1,r5\n\t" /* r1 = o[0] */
347 "sub r5,r1,r5, lsl #1\n\t" /* r5 = o[4] */
349 "add r2,r2,r4\n\t" /* r2 :=t2 */
350 "sub r4,r2,r4, lsl #1\n\t" /* r9 :=t4 */
352 "add %[temp],r6,r8\n\t" /* r10:=t5 */
353 "sub r6,r6,r8\n\t" /* r6 :=t7 */
355 "sub r8,r4,r7\n\t" /* r8 = o[7]*/
356 "add r4,r4,r7\n\t" /* r4 = o[3]*/
357 "sub r7,r3,r6\n\t" /* r7 = o[6]*/
358 "add r3,r3,r6\n\t" /* r3 = o[2]*/
359 "sub r6,r2,%[temp]\n\t" /* r6 = o[5]*/
360 "add r2,r2,%[temp]\n\t" /* r2 = o[1]*/
362 "stmia %[z]!, {r1-r8}\n\t"
363 : /* outputs */ [z] "+r" (z), [temp] "=r" (temp)
364 : /* inputs */
365 : /* clobbers */
366 "r1","r2","r3","r4","r5","r6","r7","r8","memory"
368 return z;
371 #define FFT_FFMPEG_INCL_OPTIMISED_FFT8
372 /* The chunk of asm below is equivalent to the following:
374 // first load in z[4].re thru z[7].im into local registers
375 // ...
376 BF_OPT2_REV(z[4].re, z[5].re, z[4].re, z[5].re); // x=a+b; y=x-(b<<1)
377 BF_OPT2_REV(z[4].im, z[5].im, z[4].im, z[5].im);
378 BF_REV (temp, z[7].re, z[6].re, z[7].re); // x=a+b; y=a-b;
379 BF_REV (z[6].re, z[7].im, z[6].im, z[7].im);
380 // save z[7].re and z[7].im as those are complete now
381 // z[5].re and z[5].im are also complete now but save these later on
383 BF(z[6].im, z[4].re, temp, z[4].re); // x=a-b; y=a+b
384 BF_OPT(z[6].re, z[4].im, z[4].im, z[6].re); // y=a+b; x=y-(b<<1)
385 // now load z[2].re and z[2].im
386 // ...
387 BF_OPT(z[6].re, z[2].re, z[2].re, z[6].re); // y=a+b; x=y-(b<<1)
388 BF_OPT(z[6].im, z[2].im, z[2].im, z[6].im); // y=a+b; x=y-(b<<1)
389 // Now save z[6].re and z[6].im, along with z[5].re and z[5].im
390 // for efficiency. Also save z[2].re and z[2].im.
391 // Now load z[0].re and z[0].im
392 // ...
394 BF_OPT(z[4].re, z[0].re, z[0].re, z[4].re); // y=a+b; x=y-(b<<1)
395 BF_OPT(z[4].im, z[0].im, z[0].im, z[4].im); // y=a+b; x=y-(b<<1)
396 // Finally save out z[4].re, z[4].im, z[0].re and z[0].im
397 // ...
399 static inline void fft8(FFTComplex * z)
401 FFTComplex* m4 = fft4(z);
403 /* note that we increment z_ptr on the final stmia, which
404 leaves z_ptr pointing to z[1].re ready for the Transform step */
406 register FFTSample temp;
408 asm volatile(
409 /* read in z[4].re thru z[7].im */
410 "ldmia %[z4_ptr]!, {r1-r8}\n\t"
411 /* (now points one word past &z[7].im) */
412 "add r1,r1,r3\n\t"
413 "sub r3,r1,r3,lsl #1\n\t"
414 "add r2,r2,r4\n\t"
415 "sub r4,r2,r4,lsl #1\n\t"
416 "add %[temp],r5,r7\n\t"
417 "sub r7,r5,r7\n\t"
418 "add r5,r6,r8\n\t"
419 "sub r8,r6,r8\n\t"
421 "stmdb %[z4_ptr]!, {r7,r8}\n\t" /* write z[7].re,z[7].im straight away */
422 /* Note, registers r7 & r8 now free */
424 "sub r6,%[temp],r1\n\t"
425 "add r1,%[temp],r1\n\t"
426 "add r2,r2,r5\n\t"
427 "sub r5,r2,r5,lsl #1\n\t"
428 "add %[temp], %[z_ptr], #16\n\t" /* point to &z[2].re */
429 "ldmia %[temp],{r7,r8}\n\t" /* load z[2].re and z[2].im */
430 "add r7,r7,r5\n\t"
431 "sub r5,r7,r5,lsl #1\n\t"
432 "add r8,r8,r6\n\t"
433 "sub r6,r8,r6,lsl #1\n\t"
435 /* write out z[5].re, z[5].im, z[6].re, z[6].im in one go*/
436 "stmdb %[z4_ptr]!, {r3-r6}\n\t"
437 "stmia %[temp],{r7,r8}\n\t" /* write out z[2].re, z[2].im */
438 "ldmia %[z_ptr],{r7,r8}\n\t" /* load r[0].re, r[0].im */
440 "add r7,r7,r1\n\t"
441 "sub r1,r7,r1,lsl #1\n\t"
442 "add r8,r8,r2\n\t"
443 "sub r2,r8,r2,lsl #1\n\t"
445 "stmia %[z_ptr]!,{r7,r8}\n\t" /* write out z[0].re, z[0].im */
446 "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */
447 : [z4_ptr] "+r" (m4), [temp] "=r" (temp), [z_ptr] "+r" (z)
449 : "r1","r2","r3","r4","r5","r6","r7","r8","memory"
453 TRANSFORM_EQUAL(z,2);
456 #endif // CPU_ARM