Fix inline asm argument. Shouldn't have much impact in practice but might save a...
[kugel-rb.git] / apps / codecs / lib / fft-ffmpeg_arm.h
blob 9d396a3fc01f1a108efce4dc74f97ec06c9336c3
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                      \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2010 Dave Hooper
 *
 * ARM optimisations for ffmpeg's fft (used in fft-ffmpeg.c)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
#ifdef CPU_ARM
/* Start off with optimised variants of the butterflies that work
   nicely on arm */
/* 1. where y and a share the same variable/register */
#define BF_OPT(x,y,a,b) {\
    y = a + b;\
    x = y - (b<<1);\
}

/* 2. where y and b share the same variable/register */
#define BF_OPT2(x,y,a,b) {\
    x = a - b;\
    y = x + (b<<1);\
}

/* 3. where y and b share the same variable/register (but y=(-b)) */
#define BF_OPT2_REV(x,y,a,b) {\
    x = a + b;\
    y = x - (b<<1);\
}
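/* Added note (not in the original file): these are register-reuse variants of
   the plain butterfly, assumed here to be the generic
   BF(x,y,a,b) { x = a - b; y = a + b; } from fft-ffmpeg.c.  For example,
   BF_OPT computes the same pair when y and a live in the same register:
       y = a + b;        // overwrites a
       x = y - (b<<1);   // (a + b) - 2*b == a - b
   BF_OPT2 gives the same result when y aliases b, and BF_OPT2_REV gives the
   reversed butterfly (x = a + b, y = a - b) used as BF_REV in fft8 below. */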
/* standard BUTTERFLIES package. Note, we actually manually inline this
   in all the TRANSFORM macros below anyway */
#define FFT_FFMPEG_INCL_OPTIMISED_BUTTERFLIES
#define BUTTERFLIES(a0,a1,a2,a3) {\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    BF_OPT(a2.re, a0.re, a0.re, t5);\
    BF_OPT(a2.im, a0.im, a0.im, t2);\
    BF_OPT(a3.re, a1.re, a1.re, t6);\
    BF_OPT(a3.im, a1.im, a1.im, t1);\
}
#define FFT_FFMPEG_INCL_OPTIMISED_TRANSFORM

/* On ARM, all the TRANSFORM_etc inlines use the following registers:
   r5,r6,r7,r8,r9,r10,r4,r12

   inputs are: z, n, STEP

   NOTE THAT THESE MACROS ACTUALLY CHANGE THE z INPUT IN PLACE -
   so the sequential pointer updates (z += n*3, z -= n*2, etc.) matter */
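/* Added note (not in the original file): the pointer updates are relative, so
   only the net movement matters.  For example, TRANSFORM below does
   z += n*2; z += n; (ending at z[o3]), and TRANSFORM_POST_STORE then does
   z -= n*3; z += n; z += n; z += n; z -= n*3; -- a net offset of zero, so z
   is back where the macro found it when control returns to the caller. */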
#define TRANSFORM_POST_STORE( z, n ) {\
    /*{*/\
    /*    BF_OPT(t1, t5, t5, t1);*/\
    /*    BF_OPT(t6, t2, t2, t6);*/\
    /*    BF_OPT(a2.re, a0.re, a0.re, t5);*/\
    /*    BF_OPT(a2.im, a0.im, a0.im, t2);*/\
    /*    BF_OPT(a3.re, a1.re, a1.re, t6);*/\
    /*    BF_OPT(a3.im, a1.im, a1.im, t1);*/\
    /*}*/\
    z -= n*3;\
    /* r_re = my_z[0]; r_im = my_z[1]; */\
    {\
        register FFTSample rt0temp asm("r4");\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        BF_OPT(rt0temp, r_re, r_re, t5);\
        BF_OPT(t2, r_im, r_im, t2);\
        /* my_z[0] = r_re; my_z[1] = r_im; */\
        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
        z += n;\
        /* r_re = my_z[0]; r_im = my_z[1]; */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        BF_OPT(t5, r_re, r_re, t6);\
        BF_OPT(t6, r_im, r_im, t1);\
        /* my_z[0] = r_re; my_z[1] = r_im; */\
        asm volatile( "stmia %[my_z], {%[r_re],%[r_im]}\n\t"::[my_z] "r" (z), [r_re] "r" (r_re), [r_im] "r" (r_im));\
        z += n;\
        /* my_z[0] = rt0temp; my_z[1] = t2; */\
        asm volatile( "stmia %[my_z], {%[rt0temp],%[t2]}\n\t"::[my_z] "r" (z), [rt0temp] "r" (rt0temp), [t2] "r" (t2));\
        z += n;\
    }\
    /* my_z[0] = t5; my_z[1] = t6; */\
    asm volatile( "stmia %[my_z], {%[t5],%[t6]}\n\t"::[my_z] "r" (z), [t5] "r" (t5), [t6] "r" (t6));\
    z -= n*3;\
}
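/* Reference sketch (added; not in the original file), assuming z is an
   FFTComplex* as in fft-ffmpeg.c and that the caller left z at z[o3]:
       z -= n*3;                               // rewind to z[0]
       BF_OPT(rt0temp, z->re, z->re, t5);      // z[0] <- a0.re, rt0temp = a2.re
       BF_OPT(t2,      z->im, z->im, t2);      // z[0] <- a0.im, t2      = a2.im
       z += n;
       BF_OPT(t5, z->re, z->re, t6);           // z[o1] <- a1.re, t5 = a3.re
       BF_OPT(t6, z->im, z->im, t1);           // z[o1] <- a1.im, t6 = a3.im
       z += n;  z->re = rt0temp;  z->im = t2;  // store a2 at z[o2]
       z += n;  z->re = t5;       z->im = t6;  // store a3 at z[o3]
       z -= n*3;                               // restore z for the caller
   i.e. the second half of the BUTTERFLIES block above, with the loads and
   stores done via ldmia/stmia pairs. */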
#define TRANSFORM( z, n, wre_arg, wim_arg )\
{\
    FFTSample wre = wre_arg, wim = wim_arg;\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
    z += n*2; /* z[o2] */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
    XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
\
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
    XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
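/* Added note (assumption, not from the original file): XPROD31_R and
   XNPROD31_R are taken here to have the usual Tremor/codeclib semantics in
   Q31 fixed point, i.e.
       XPROD31_R (a, b, t, v, x, y):  x = a*t + b*v;  y = b*t - a*v;
       XNPROD31_R(a, b, t, v, x, y):  x = a*t - b*v;  y = b*t + a*v;
   so t1/t2 hold the twiddled z[o2] and t5/t6 the conjugate-twiddled z[o3]
   before the butterflies run. */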
#define TRANSFORM_W01( z, n, w )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
\
    {\
        register FFTSample wre asm("r4"),wim asm("r12");\
        asm volatile( "ldmia %[w], {%[wre], %[wim]}\n\t":[wre] "=r" (wre), [wim] "=r" (wim):[w] "r" (w));\
        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
\
        z += n; /* z[o3] */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
    }\
\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
//static inline void TRANSFORM_W10(int32_t * z, unsigned int n, const int32_t * w)
#define TRANSFORM_W10( z, n, w )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
\
    {\
        register FFTSample wim asm("r4"),wre asm("r12");\
        asm volatile( "ldmia %[w], {%[wim], %[wre]}\n\t":[wim] "=r" (wim), [wre] "=r" (wre):[w] "r" (w));\
        z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XPROD31_R(r_re, r_im, wre, wim, t1,t2);\
\
        z += n; /* z[o3] */\
        asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
        XNPROD31_R(r_re, r_im, wre, wim, t5,t6);\
    }\
\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
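/* Added note (not in the original file): W01 and W10 differ only in which
   half of the twiddle pair ends up as wre vs. wim.  ldmia always loads the
   lowest-numbered register from the lowest address, so binding wre to r4 in
   TRANSFORM_W01 makes w[0] the real part, while binding wim to r4 in
   TRANSFORM_W10 makes w[0] the imaginary part. */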
#define TRANSFORM_EQUAL( z, n )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
\
    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[r_re],%[r_im]}\n\t":[r_re] "=r" (r_re), [r_im] "=r" (r_im):[my_z] "r" (z));\
\
    /**/\
    /*t2 = MULT32(cPI2_8, t5);*/\
    /*t1 = MULT31(cPI2_8, t6);*/\
    /*t6 = MULT31(cPI2_8, r_re);*/\
    /*t5 = MULT32(cPI2_8, r_im);*/\
\
    /*t1 = ( t1 + (t2<<1) );*/\
    /*t2 = ( t1 - (t2<<2) );*/\
    /*t6 = ( t6 + (t5<<1) );*/\
    /*t5 = ( t6 - (t5<<2) );*/\
    /**/\
    t2 = MULT31(cPI2_8, t5);\
    t6 = MULT31(cPI2_8, t6);\
    r_re = MULT31(cPI2_8, r_re);\
    t5 = MULT31(cPI2_8, r_im);\
\
    t1 = ( t6 + t2 );\
    t2 = ( t6 - t2 );\
    t6 = ( r_re + t5 );\
    t5 = ( r_re - t5 );\
\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
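/* Added note (assumption: cPI2_8 is cos(pi/4) in Q31, as in the Tremor
   headers): with wre == wim == cPI2_8 the XPROD31_R/XNPROD31_R calls of the
   generic TRANSFORM collapse to four MULT31s plus adds/subs, which is what
   the block above computes:
       t1 = C*(re2 + im2);   t2 = C*(im2 - re2);
       t6 = C*(re3 + im3);   t5 = C*(re3 - im3);
   where (re2,im2) = z[o2], (re3,im3) = z[o3] and C = cPI2_8. */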
#define TRANSFORM_ZERO( z,n )\
{\
    register FFTSample t1 asm("r5"),t2 asm("r6"),t5 asm("r7"),t6 asm("r8"),r_re asm("r9"),r_im asm("r10");\
\
    z += n*2; /* z[o2] -- 2n * 2 since complex numbers */\
    asm volatile( "ldmia %[my_z], {%[t1],%[t2]}\n\t":[t1] "=r" (t1), [t2] "=r" (t2):[my_z] "r" (z));\
    z += n; /* z[o3] */\
    asm volatile( "ldmia %[my_z], {%[t5],%[t6]}\n\t":[t5] "=r" (t5), [t6] "=r" (t6):[my_z] "r" (z));\
\
    BF_OPT(t1, t5, t5, t1);\
    BF_OPT(t6, t2, t2, t6);\
    TRANSFORM_POST_STORE( z, n );\
}
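/* Added note (not in the original file): TRANSFORM_ZERO is the wre = 1,
   wim = 0 special case, where the twiddle products reduce to plain copies
   (t1 = re2, t2 = im2, t5 = re3, t6 = im3), so the two ldmia loads drop the
   values straight into t1/t2 and t5/t6 and only the butterflies remain. */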
#define FFT_FFMPEG_INCL_OPTIMISED_FFT4
#define fft4(z_arg)\
{\
    /* input[0..7] -> output[0..7] */\
    fixed32 * m = (fixed32 *) ( ( z_arg ) );\
    /* load r1=z[0],r2=z[1],...,r8=z[7] */\
    asm volatile(\
        "ldmia %[z], {r1-r8}\n\t"\
        "add r1,r1,r3\n\t"          /* r1 :=t1 */\
        "sub r3,r1,r3, lsl #1\n\t"  /* r3 :=t3 */\
        "sub r7,r7,r5\n\t"          /* r7 :=t8 */\
        "add r5,r7,r5, lsl #1\n\t"  /* r5 :=t6 */\
\
        "add r1,r1,r5\n\t"          /* r1 = o[0] */\
        "sub r5,r1,r5, lsl #1\n\t"  /* r5 = o[4] */\
\
        "add r2,r2,r4\n\t"          /* r2 :=t2 */\
        "sub r4,r2,r4, lsl #1\n\t"  /* r4 :=t4 */\
\
        "add r12,r6,r8\n\t"         /* r12:=t5 */\
        "sub r6,r6,r8\n\t"          /* r6 :=t7 */\
\
        "sub r8,r4,r7\n\t"          /* r8 = o[7] */\
        "add r4,r4,r7\n\t"          /* r4 = o[3] */\
        "sub r7,r3,r6\n\t"          /* r7 = o[6] */\
        "add r3,r3,r6\n\t"          /* r3 = o[2] */\
        "sub r6,r2,r12\n\t"         /* r6 = o[5] */\
        "add r2,r2,r12\n\t"         /* r2 = o[1] */\
\
        "stmia %[z], {r1-r8}\n\t"\
        : /* outputs */\
        : /* inputs */ [z] "r" (m)\
        : /* clobbers */\
          "r1","r2","r3","r4","r5","r6","r7","r8","r12","memory"\
    );\
}
#define FFT_FFMPEG_INCL_OPTIMISED_FFT8
/* The chunk of asm below is equivalent to the following:

    // first load in z[4].re thru z[7].im into local registers
    // ...
    BF_OPT2_REV(z[4].re, z[5].re, z[4].re, z[5].re); // x=a+b; y=x-(b<<1)
    BF_OPT2_REV(z[4].im, z[5].im, z[4].im, z[5].im);
    BF_REV     (temp,    z[7].re, z[6].re, z[7].re); // x=a+b; y=a-b
    BF_REV     (z[6].re, z[7].im, z[6].im, z[7].im);
    // save z[7].re and z[7].im as those are complete now
    // z[5].re and z[5].im are also complete now but save these later on

    BF    (z[6].im, z[4].re, temp,    z[4].re); // x=a-b; y=a+b
    BF_OPT(z[6].re, z[4].im, z[4].im, z[6].re); // y=a+b; x=y-(b<<1)
    // now load z[2].re and z[2].im
    // ...
    BF_OPT(z[6].re, z[2].re, z[2].re, z[6].re); // y=a+b; x=y-(b<<1)
    BF_OPT(z[6].im, z[2].im, z[2].im, z[6].im); // y=a+b; x=y-(b<<1)
    // Now save z[6].re and z[6].im, along with z[5].re and z[5].im
    // for efficiency. Also save z[2].re and z[2].im.
    // Now load z[0].re and z[0].im
    // ...
    BF_OPT(z[4].re, z[0].re, z[0].re, z[4].re); // y=a+b; x=y-(b<<1)
    BF_OPT(z[4].im, z[0].im, z[0].im, z[4].im); // y=a+b; x=y-(b<<1)
    // Finally save out z[4].re, z[4].im, z[0].re and z[0].im
    // ...
*/
static inline void fft8( FFTComplex * z )
{
    fft4(z);

    FFTSample temp;
    fixed32 * m4 = (fixed32 *)(&(z[4].re));

    asm volatile(
        /* read in z[4].re thru z[7].im */
        "ldmia %[z4_ptr]!, {r1-r8}\n\t"
        /* (now points one word past &z[7].im) */
        "add r1,r1,r3\n\t"
        "sub r3,r1,r3,lsl #1\n\t"
        "add r2,r2,r4\n\t"
        "sub r4,r2,r4,lsl #1\n\t"
        "add %[temp],r5,r7\n\t"
        "sub r7,r5,r7\n\t"
        "add r5,r6,r8\n\t"
        "sub r8,r6,r8\n\t"

        "stmdb %[z4_ptr]!, {r7,r8}\n\t" /* write z[7].re,z[7].im straight away */
        /* Note, registers r7 & r8 now free */

        "sub r6,%[temp],r1\n\t"
        "add r1,%[temp],r1\n\t"
        "add r2,r2,r5\n\t"
        "sub r5,r2,r5,lsl #1\n\t"
        "add %[temp], %[z_ptr], #16\n\t" /* point to &z[2].re */
        "ldmia %[temp],{r7,r8}\n\t"      /* load z[2].re and z[2].im */
        "add r7,r7,r5\n\t"
        "sub r5,r7,r5,lsl #1\n\t"
        "add r8,r8,r6\n\t"
        "sub r6,r8,r6,lsl #1\n\t"

        /* write out z[5].re, z[5].im, z[6].re, z[6].im in one go */
        "stmdb %[z4_ptr]!, {r3-r6}\n\t"
        "stmia %[temp],{r7,r8}\n\t"  /* write out z[2].re, z[2].im */
        "ldmia %[z_ptr],{r7,r8}\n\t" /* load z[0].re, z[0].im */

        "add r7,r7,r1\n\t"
        "sub r1,r7,r1,lsl #1\n\t"
        "add r8,r8,r2\n\t"
        "sub r2,r8,r2,lsl #1\n\t"

        "stmia %[z_ptr],{r7,r8}\n\t"   /* write out z[0].re, z[0].im */
        "stmdb %[z4_ptr], {r1,r2}\n\t" /* write out z[4].re, z[4].im */
        : [z4_ptr] "+r" (m4), [temp] "=r" (temp)
        : [z_ptr] "r" (z)
        : "r1","r2","r3","r4","r5","r6","r7","r8","memory"
    );

    z++;
    TRANSFORM_EQUAL(z,2);
}
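/* Added note (not in the original file): fft8 thus runs fft4 on z[0..3],
   folds z[4..7] into the result with the butterfly sequence shown in the
   comment above the asm, and finishes with TRANSFORM_EQUAL on z[1], z[3],
   z[5], z[7] (step 2), matching the structure of the generic fft8 in
   fft-ffmpeg.c. */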
#endif // CPU_ARM