Some asm for the MDCT on Coldfire; speeds up Vorbis decoding by about 0.3MHz
[kugel-rb.git] / apps / codecs / lib / mdct.c
/*
 * Fixed Point IMDCT
 * Copyright (c) 2002 The FFmpeg Project.
 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "codeclib.h"
#include "mdct.h"
#include "asm_arm.h"
#include "asm_mcf5249.h"
#include "codeclib_misc.h"
#include "mdct_lookup.h"
#ifndef ICODE_ATTR_TREMOR_MDCT
#define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
#endif
/**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
 * thus excluding the parts that can be derived by symmetry.
 * @param output N/2 samples
 * @param input N/2 samples
 *
 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
 * not overlap or intersect at all)
 */
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    int n8, n4, n2, n, j;
    const fixed32 *in1, *in2;

    n = 1 << nbits;

    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    /* revtab comes from the fft; the revtab table is sized for an N=4096
       fft (2^12 points).  Our fft is N/4 points, i.e. 2^(nbits-2), so the
       shift we need is 12-(nbits-2) = 14-nbits. */
    const int revtab_shift = (14 - nbits);
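    /* Worked example (illustrative): for a 2048-point MDCT, nbits = 11, the
       fft is 2^9 = 512 points, and each revtab entry is shifted right by
       14 - 11 = 3. */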
    /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
    /* (note that when using the current split radix, the bitreverse ordering is
       complex, meaning that this reordering cannot easily be done in-place) */
    /* Using the following pdf, you can see that it is possible to replace
       the 'classic' pre/post rotate with an alternative one that enables
       us to use fewer distinct twiddle factors:
       http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf

       For prerotation, the factors are just sin,cos(2PI*i/N)
       For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)

       Therefore, prerotation can immediately reuse the same twiddles as the fft
       (for postrotation it's still a bit complex: we reuse the fft trig tables
       where we can, or use a special table for N=2048, or interpolate between
       trig tables for N>2048)
    */
    const int32_t *T = sincos_lookup0;
    const int step = 2<<(12-nbits);
    const uint16_t * p_revtab=revtab;
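    /* XNPROD31(a, b, t, v, &x, &y) (defined in codeclib_misc.h, with asm
       versions in the asm_* headers) is a Q31 complex multiply:
       x = a*t - b*v, y = b*t + a*v.  Each step below therefore rotates one
       conjugate pair of input coefficients and scatters the result to its
       bit-reversed position z[j]. */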
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
    }
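    /* Second half of the pre-rotation: T has reached the top of the table
       (angle PI/4), so walk it back down with the roles of the sin and cos
       entries swapped (sin(PI/2 - x) = cos(x)) to cover angles PI/4..PI/2.
       That is why the T[0]/T[1] argument order below differs from the loop
       above. */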
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
    }
    /* ... and so the fft runs in the OUTPUT buffer */
    ff_fft_calc_c(nbits-2, z);
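    /* Note: the fft is N/4 complex points, hence nbits-2; e.g. 512 points
       for a 2048-sample MDCT. */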
    /* post rotation + reordering; now keeps the result within the OUTPUT buffer */
    switch( nbits )
    {
        default:
        {
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
            int magic_step = step>>2;
            int newstep;
            if(n<=1024)
            {
                T = sincos_lookup0 + magic_step;
                newstep = step>>1;
            }
            else
            {
                T = sincos_lookup1;
                newstep = 2;
            }
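            /* Sketch of the table choice: for n <= 1024 the postrotation
               angles (i+1/4)*2PI/N still land on entries of the fft trig
               table sincos_lookup0, just at an offset (magic_step = step/4,
               which is a whole number of entries only when step >= 8, i.e.
               n <= 1024).  For n = 2048 they fall halfway between its
               entries, so the half-step table sincos_lookup1 is used
               directly. */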
            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
                XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
            }

            break;
        }
        case 12: /* n=4096 */
        {
            /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
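            /* (sketch of the rationale: sincos_lookup1 is offset from
               sincos_lookup0 by half a table step, so averaging staggered
               entries yields twiddles at quarter-step angles, which is the
               granularity n=4096 needs; see mdct_lookup.h) */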
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]>>1; t1 = T[1]>>1;

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                t0 += (v0 = (V[0]>>1));
                t1 += (v1 = (V[1]>>1));
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
                T+=2;
                v0 += (t0 = (T[0]>>1));
                v1 += (t1 = (T[1]>>1));
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }
        case 13: /* n = 8192 */
        {
            /* weighted linear interpolation between sincos_lookup0 and sincos_lookup1,
               specifically: 25:75 for first twiddle and 75:25 for second twiddle */
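            /* (sketch of the trick below: q = (v-t)>>1 is half the gap
               between staggered entries of the two tables, so t+q and v-q
               are midpoints of adjacent staggered pairs; alternating between
               T and V advances the effective twiddle angle in half-step
               increments, giving the 2048 distinct twiddles n=8192 needs) */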
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1,q0,q1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]; t1 = T[1];

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                v0 = V[0]; v1 = V[1];
                t0 += (q0 = (v0-t0)>>1);
                t1 += (q1 = (v1-t1)>>1);
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
                t0 = v0-q0;
                t1 = v1-q1;
                XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                T+=2;

                t0 = T[0]; t1 = T[1];
                v0 += (q0 = (t0-v0)>>1);
                v1 += (q1 = (t1-v1)>>1);
                XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
                v0 = t0-q0;
                v1 = t1-q1;
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }
    }
}
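/* Usage sketch (illustrative only, not part of this file; buffer names are
   hypothetical): for a 2048-point inverse MDCT, nbits = 11 and both buffers
   hold N/2 = 1024 fixed32 words and must not overlap:

       fixed32 coeffs[1024], half[1024];
       ff_imdct_half(11, half, coeffs);
*/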
/**
 * Compute the inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 * "In-place" processing can be achieved provided that:
 *            [0  ..  N/2-1 | N/2  ..  N-1 ]
 *            <----input---->
 *            <-----------output----------->
 *
 * The result of ff_imdct_half is to put the 'half' imdct here
 *
 *                           N/2          N-1
 *                           <--half imdct-->
 *
 * We want it here for the full imdct:
 *            N/4       3N/4-1
 *            <-------------->
 *
 * In addition we need to apply two symmetries to get the full imdct:
 *
 *        <AAAAAA>        <DDDDDD>
 *                <BBBBBB><CCCCCC>
 *
 * D is a reflection of C
 * A is a reflection of B (but with sign flipped)
 *
 * We process the symmetries at the same time as we 'move' the half imdct
 * from [N/2,N-1] to [N/4,3N/4-1]
 *
 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
 * This would require being able to use revtab 'inplace' (since the input
 * and output of imdct_half would then overlap somewhat)
 */
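/* Reference sketch of the two symmetries (illustrative only; half[] is a
   hypothetical copy of the N/2-sample ff_imdct_half result, with
   B = half[0..N/4) and C = half[N/4..N/2) in the notation above):

       for (i = 0; i < n4; i++) {
           output[n4 + i]     = half[i];         // B lands at [N/4 .. N/2)
           output[n4 - 1 - i] = -half[i];        // A: B reflected, sign-flipped
           output[n2 + i]     = half[n4 + i];    // C lands at [N/2 .. 3N/4)
           output[n - 1 - i]  = half[n4 + i];    // D: reflection of C
       }

   The loops below implement this, but working in place within output[]. */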
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
#ifndef CPU_ARM
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    /* tell imdct_half to put its N/2 samples at [N/2..N-1], i.e. output+n2 */
    ff_imdct_half(nbits,output+n2,input);
    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    /* Copy BBBB to AAAA, reflected and sign-flipped.
       Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
    out_r = output;
    out_r2 = output+n2-8;
    in_r = output+n2+n4-8;
    while(out_r<out_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            /* load 8 words of B, store them unchanged at their destination,
               then write them out negated in reverse order to build A */
            "movem.l (%[in_r]), %%d0-%%d7\n\t"
            "movem.l %%d0-%%d7, (%[out_r2])\n\t"
            "neg.l %%d7\n\t"
            "move.l %%d7, (%[out_r])+\n\t"
            "neg.l %%d6\n\t"
            "move.l %%d6, (%[out_r])+\n\t"
            "neg.l %%d5\n\t"
            "move.l %%d5, (%[out_r])+\n\t"
            "neg.l %%d4\n\t"
            "move.l %%d4, (%[out_r])+\n\t"
            "neg.l %%d3\n\t"
            "move.l %%d3, (%[out_r])+\n\t"
            "neg.l %%d2\n\t"
            "move.l %%d2, (%[out_r])+\n\t"
            "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
            "neg.l %%d1\n\t"
            "move.l %%d1, (%[out_r])+\n\t"
            "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
            "neg.l %%d0\n\t"
            "move.l %%d0, (%[out_r])+\n\t"
            : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
#else
        out_r[0] = -(out_r2[7] = in_r[7]);
        out_r[1] = -(out_r2[6] = in_r[6]);
        out_r[2] = -(out_r2[5] = in_r[5]);
        out_r[3] = -(out_r2[4] = in_r[4]);
        out_r[4] = -(out_r2[3] = in_r[3]);
        out_r[5] = -(out_r2[2] = in_r[2]);
        out_r[6] = -(out_r2[1] = in_r[1]);
        out_r[7] = -(out_r2[0] = in_r[0]);
        in_r -= 8;
        out_r += 8;
        out_r2 -= 8;
#endif
    }

    in_r = output + n2+n4;
    in_r2 = output + n-4;
    out_r = output + n2;
    out_r2 = output + n2 + n4 - 4;
    while(in_r<in_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            /* copy 4 words from the bottom of C down to out_r and 4 words
               from the top of C to out_r2, then write both chunks back
               reversed to reflect C into D in place */
            "movem.l (%[in_r]), %%d0-%%d3\n\t"
            "movem.l %%d0-%%d3, (%[out_r])\n\t"
            "movem.l (%[in_r2]), %%d4-%%d7\n\t"
            "movem.l %%d4-%%d7, (%[out_r2])\n\t"
            "move.l %%d0, %%a3\n\t"
            "move.l %%d3, %%d0\n\t"
            "move.l %%d1, %%d3\n\t"
            "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
            "move.l %%d7, %%d1\n\t"
            "move.l %%d6, %%d2\n\t"
            "move.l %%d5, %%d3\n\t"
            "movem.l %%d1-%%d4, (%[in_r])\n\t"
            "lea.l (4*4, %[in_r]), %[in_r]\n\t"
            "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
            "lea.l (4*4, %[out_r]), %[out_r]\n\t"
            "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
            : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
              [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
#else
        register fixed32 t0,t1,t2,t3;
        register fixed32 s0,s1,s2,s3;

        /* Copy and reflect CCCC to DDDD.  Because CCCC is already where
           we actually want to put DDDD this is a bit complicated.
         * So simultaneously do the following things:
         * 1. copy range from [n2+n4 .. n-1] to range [n2 .. n2+n4-1]
         * 2. reflect range from [n2+n4 .. n-1] in place
         *
         *      [             |              ]
         *       ^a ->   <- ^b ^c ->    <- ^d
         *
         * #1: copy from ^c to ^a
         * #2: copy from ^d to ^b
         * #3: swap ^c and ^d in place
         */
        /* #1 pt1 : load 4 words from ^c. */
        t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
        /* #1 pt2 : write to ^a */
        out_r[0]=t0; out_r[1]=t1; out_r[2]=t2; out_r[3]=t3;
        /* #2 pt1 : load 4 words from ^d */
        s0=in_r2[0]; s1=in_r2[1]; s2=in_r2[2]; s3=in_r2[3];
        /* #2 pt2 : write to ^b */
        out_r2[0]=s0; out_r2[1]=s1; out_r2[2]=s2; out_r2[3]=s3;
        /* #3 pt1 : write words from #2 to ^c */
        in_r[0]=s3; in_r[1]=s2; in_r[2]=s1; in_r[3]=s0;
        /* #3 pt2 : write words from #1 to ^d */
        in_r2[0]=t3; in_r2[1]=t2; in_r2[2]=t1; in_r2[3]=t0;

        in_r += 4;
        in_r2 -= 4;
        out_r += 4;
        out_r2 -= 4;
#endif
    }
}
#else
/* Follows the same structure as the canonical version above */
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    out_r = output;
    out_r2 = output+n2;
    in_r = output+n2+n4;
    while(out_r<out_r2)
    {
        asm volatile(
            /* load 8 words of B (moving downwards), store them unchanged
               below out_r2, then store them negated and reversed to build A */
            "ldmdb %[in_r]!, {r0-r7}\n\t"
            "stmdb %[out_r2]!, {r0-r7}\n\t"
            "rsb r8,r0,#0\n\t"
            "rsb r0,r7,#0\n\t"
            "rsb r7,r1,#0\n\t"
            "rsb r1,r6,#0\n\t"
            "rsb r6,r2,#0\n\t"
            "rsb r2,r5,#0\n\t"
            "rsb r5,r3,#0\n\t"
            "rsb r3,r4,#0\n\t"
            "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
            : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }

    in_r = output + n2+n4;
    in_r2 = output + n;
    out_r = output + n2;
    out_r2 = output + n2 + n4;
    while(in_r<in_r2)
    {
        asm volatile(
            /* copy 4 words from the bottom of C to out_r and 4 from the top
               of C to out_r2, then write both chunks back reversed to
               reflect C into D in place */
            "ldmia %[in_r], {r0-r3}\n\t"
            "stmia %[out_r]!, {r0-r3}\n\t"
            "ldmdb %[in_r2], {r5-r8}\n\t"
            "stmdb %[out_r2]!, {r5-r8}\n\t"
            "mov r4,r0\n\t"
            "mov r0,r3\n\t"
            "mov r3,r1\n\t"
            "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
            "mov r4,r8\n\t"
            "mov r8,r5\n\t"
            "mov r5,r7\n\t"
            "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
            :
            [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }
}
#endif