3 * Copyright (c) 2002 The FFmpeg Project.
4 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 #include "asm_mcf5249.h"
25 #include "codeclib_misc.h"
26 #include "mdct_lookup.h"
28 #ifndef ICODE_ATTR_TREMOR_MDCT
29 #define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
33 * Compute the middle half of the inverse MDCT of size N = 2^nbits
34 * thus excluding the parts that can be derived by symmetry
35 * @param output N/2 samples
36 * @param input N/2 samples
38 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
39 * not overlap or intersect at all)
41 void ff_imdct_half(unsigned int nbits
, fixed32
*output
, const fixed32
*input
) ICODE_ATTR_TREMOR_MDCT
;
42 void ff_imdct_half(unsigned int nbits
, fixed32
*output
, const fixed32
*input
)
45 const fixed32
*in1
, *in2
;
53 FFTComplex
*z
= (FFTComplex
*)output
;
59 /* revtab comes from the fft; revtab table is sized for N=4096 size fft = 2^12.
60 The fft is size N/4 so s->nbits-2, so our shift needs to be (12-(nbits-2)) */
61 const int revtab_shift
= (14- nbits
);
63 /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
64 /* (note that when using the current split radix, the bitreverse ordering is
65 complex, meaning that this reordering cannot easily be done in-place) */
66 /* Using the following pdf, you can see that it is possible to rearrange
67 the 'classic' pre/post rotate with an alternative one that enables
68 us to use fewer distinct twiddle factors.
69 http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf
71 For prerotation, the factors are just sin,cos(2PI*i/N)
72 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
74 Therefore, prerotation can immediately reuse the same twiddles as fft
75 (for postrotation it's still a bit complex, we reuse the fft trig tables
76 where we can, or a special table for N=2048, or interpolate between
77 trig tables for N>2048)
79 const int32_t *T
= sincos_lookup0
;
80 const int step
= 2<<(12-nbits
);
81 const uint16_t * p_revtab
=revtab
;
83 const uint16_t * const p_revtab_end
= p_revtab
+ n8
;
84 while(LIKELY(p_revtab
< p_revtab_end
))
86 j
= (*p_revtab
)>>revtab_shift
;
87 XNPROD31(*in2
, *in1
, T
[1], T
[0], &z
[j
].re
, &z
[j
].im
);
92 j
= (*p_revtab
)>>revtab_shift
;
93 XNPROD31(*in2
, *in1
, T
[1], T
[0], &z
[j
].re
, &z
[j
].im
);
101 const uint16_t * const p_revtab_end
= p_revtab
+ n8
;
102 while(LIKELY(p_revtab
< p_revtab_end
))
104 j
= (*p_revtab
)>>revtab_shift
;
105 XNPROD31(*in2
, *in1
, T
[0], T
[1], &z
[j
].re
, &z
[j
].im
);
110 j
= (*p_revtab
)>>revtab_shift
;
111 XNPROD31(*in2
, *in1
, T
[0], T
[1], &z
[j
].re
, &z
[j
].im
);
120 /* ... and so fft runs in OUTPUT buffer */
121 ff_fft_calc_c(nbits
-2, z
);
123 /* post rotation + reordering. now keeps the result within the OUTPUT buffer */
128 fixed32
* z1
= (fixed32
*)(&z
[0]);
129 fixed32
* z2
= (fixed32
*)(&z
[n4
-1]);
130 int magic_step
= step
>>2;
134 T
= sincos_lookup0
+ magic_step
;
146 XNPROD31_R(z1
[1], z1
[0], T
[0], T
[1], r0
, i1
); T
+=newstep
;
147 XNPROD31_R(z2
[1], z2
[0], T
[1], T
[0], r1
, i0
); T
+=newstep
;
159 case 12: /* n=4096 */
161 /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
162 const int32_t * V
= sincos_lookup1
;
165 fixed32
* z1
= (fixed32
*)(&z
[0]);
166 fixed32
* z2
= (fixed32
*)(&z
[n4
-1]);
168 t0
= T
[0]>>1; t1
=T
[1]>>1;
173 t0
+= (v0
= (V
[0]>>1));
174 t1
+= (v1
= (V
[1]>>1));
175 XNPROD31_R(z1
[1], z1
[0], t0
, t1
, r0
, i1
);
177 v0
+= (t0
= (T
[0]>>1));
178 v1
+= (t1
= (T
[1]>>1));
179 XNPROD31_R(z2
[1], z2
[0], v1
, v0
, r1
, i0
);
192 case 13: /* n = 8192 */
194 /* weight linear interpolation between sincos_lookup0 and sincos_lookup1
195 specifically: 25:75 for first twiddle and 75:25 for second twiddle */
196 const int32_t * V
= sincos_lookup1
;
198 int32_t t0
,t1
,v0
,v1
,q0
,q1
;
199 fixed32
* z1
= (fixed32
*)(&z
[0]);
200 fixed32
* z2
= (fixed32
*)(&z
[n4
-1]);
207 v0
= V
[0]; v1
= V
[1];
208 t0
+= (q0
= (v0
-t0
)>>1);
209 t1
+= (q1
= (v1
-t1
)>>1);
210 XNPROD31_R(z1
[1], z1
[0], t0
, t1
, r0
, i1
);
213 XNPROD31_R(z2
[1], z2
[0], t1
, t0
, r1
, i0
);
222 t0
= T
[0]; t1
= T
[1];
223 v0
+= (q0
= (t0
-v0
)>>1);
224 v1
+= (q1
= (t1
-v1
)>>1);
225 XNPROD31_R(z1
[1], z1
[0], v0
, v1
, r0
, i1
);
228 XNPROD31_R(z2
[1], z2
[0], v1
, v0
, r1
, i0
);
244 * Compute inverse MDCT of size N = 2^nbits
245 * @param output N samples
246 * @param input N/2 samples
247 * "In-place" processing can be achieved provided that:
248 * [0 .. N/2-1 | N/2 .. N-1 ]
250 * <-----------output----------->
252 * The result of ff_imdct_half is to put the 'half' imdct here
257 * We want it here for the full imdct:
261 * In addition we need to apply two symmetries to get the full imdct:
266 * D is a reflection of C
267 * A is a reflection of B (but with sign flipped)
269 * We process the symmetries at the same time as we 'move' the half imdct
270 * from [N/2,N-1] to [N/4,3N/4-1]
272 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
273 * This would require being able to use revtab 'inplace' (since the input
274 * and output of imdct_half would then overlap somewhat)
276 void ff_imdct_calc(unsigned int nbits
, fixed32
*output
, const fixed32
*input
) ICODE_ATTR_TREMOR_MDCT
;
278 void ff_imdct_calc(unsigned int nbits
, fixed32
*output
, const fixed32
*input
)
280 const int n
= (1<<nbits
);
281 const int n2
= (n
>>1);
282 const int n4
= (n
>>2);
284 /* tell imdct_half to put the output in [N/2..3N/4-1] i.e. output+n2 */
285 ff_imdct_half(nbits
,output
+n2
,input
);
287 fixed32
* in_r
, * in_r2
, * out_r
, * out_r2
;
289 /* Copy BBBB to AAAA, reflected and sign-flipped.
290 Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
292 out_r2
= output
+n2
-8;
293 in_r
= output
+n2
+n4
-8;
296 #if defined CPU_COLDFIRE
298 "movem.l (%[in_r]), %%d0-%%d7\n\t"
299 "movem.l %%d0-%%d7, (%[out_r2])\n\t"
301 "move.l %%d7, (%[out_r])+\n\t"
303 "move.l %%d6, (%[out_r])+\n\t"
305 "move.l %%d5, (%[out_r])+\n\t"
307 "move.l %%d4, (%[out_r])+\n\t"
309 "move.l %%d3, (%[out_r])+\n\t"
311 "move.l %%d2, (%[out_r])+\n\t"
312 "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
314 "move.l %%d1, (%[out_r])+\n\t"
315 "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
317 "move.l %%d0, (%[out_r])+\n\t"
318 : [in_r
] "+a" (in_r
), [out_r
] "+a" (out_r
), [out_r2
] "+a" (out_r2
)
320 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
322 out_r
[0] = -(out_r2
[7] = in_r
[7]);
323 out_r
[1] = -(out_r2
[6] = in_r
[6]);
324 out_r
[2] = -(out_r2
[5] = in_r
[5]);
325 out_r
[3] = -(out_r2
[4] = in_r
[4]);
326 out_r
[4] = -(out_r2
[3] = in_r
[3]);
327 out_r
[5] = -(out_r2
[2] = in_r
[2]);
328 out_r
[6] = -(out_r2
[1] = in_r
[1]);
329 out_r
[7] = -(out_r2
[0] = in_r
[0]);
335 in_r
= output
+ n2
+n4
;
336 in_r2
= output
+ n
-4;
338 out_r2
= output
+ n2
+ n4
- 4;
341 #if defined CPU_COLDFIRE
343 "movem.l (%[in_r]), %%d0-%%d3\n\t"
344 "movem.l %%d0-%%d3, (%[out_r])\n\t"
345 "movem.l (%[in_r2]), %%d4-%%d7\n\t"
346 "movem.l %%d4-%%d7, (%[out_r2])\n\t"
347 "move.l %%d0, %%a3\n\t"
348 "move.l %%d3, %%d0\n\t"
349 "move.l %%d1, %%d3\n\t"
350 "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
351 "move.l %%d7, %%d1\n\t"
352 "move.l %%d6, %%d2\n\t"
353 "move.l %%d5, %%d3\n\t"
354 "movem.l %%d1-%%d4, (%[in_r])\n\t"
355 "lea.l (4*4, %[in_r]), %[in_r]\n\t"
356 "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
357 "lea.l (4*4, %[out_r]), %[out_r]\n\t"
358 "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
359 : [in_r
] "+a" (in_r
), [in_r2
] "+a" (in_r2
),
360 [out_r
] "+a" (out_r
), [out_r2
] "+a" (out_r2
)
362 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
364 register fixed32 t0
,t1
,t2
,t3
;
365 register fixed32 s0
,s1
,s2
,s3
;
367 /* Copy and reflect CCCC to DDDD. Because CCCC is already where
368 we actually want to put DDDD this is a bit complicated.
369 * So simultaneously do the following things:
370 * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
371 * 2. reflect range from [n2+n4 .. n-1] inplace
374 * ^a -> <- ^b ^c -> <- ^d
376 * #1: copy from ^c to ^a
377 * #2: copy from ^d to ^b
378 * #3: swap ^c and ^d in place
380 /* #1 pt1 : load 4 words from ^c. */
381 t0
=in_r
[0]; t1
=in_r
[1]; t2
=in_r
[2]; t3
=in_r
[3];
382 /* #1 pt2 : write to ^a */
383 out_r
[0]=t0
;out_r
[1]=t1
;out_r
[2]=t2
;out_r
[3]=t3
;
384 /* #2 pt1 : load 4 words from ^d */
385 s0
=in_r2
[0];s1
=in_r2
[1];s2
=in_r2
[2];s3
=in_r2
[3];
386 /* #2 pt2 : write to ^b */
387 out_r2
[0]=s0
;out_r2
[1]=s1
;out_r2
[2]=s2
;out_r2
[3]=s3
;
388 /* #3 pt1 : write words from #2 to ^c */
389 in_r
[0]=s3
;in_r
[1]=s2
;in_r
[2]=s1
;in_r
[3]=s0
;
390 /* #3 pt2 : write words from #1 to ^d */
391 in_r2
[0]=t3
;in_r2
[1]=t2
;in_r2
[2]=t1
;in_r2
[3]=t0
;
401 /* Follows the same structure as the canonical version above */
402 void ff_imdct_calc(unsigned int nbits
, fixed32
*output
, const fixed32
*input
)
404 const int n
= (1<<nbits
);
405 const int n2
= (n
>>1);
406 const int n4
= (n
>>2);
408 ff_imdct_half(nbits
,output
+n2
,input
);
410 fixed32
* in_r
, * in_r2
, * out_r
, * out_r2
;
418 "ldmdb %[in_r]!, {r0-r7}\n\t"
419 "stmdb %[out_r2]!, {r0-r7}\n\t"
428 "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
429 : [in_r
] "+r" (in_r
), [out_r
] "+r" (out_r
), [out_r2
] "+r" (out_r2
)
431 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
433 in_r
= output
+ n2
+n4
;
436 out_r2
= output
+ n2
+ n4
;
440 "ldmia %[in_r], {r0-r3}\n\t"
441 "stmia %[out_r]!, {r0-r3}\n\t"
442 "ldmdb %[in_r2], {r5-r8}\n\t"
443 "stmdb %[out_r2]!, {r5-r8}\n\t"
447 "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
451 "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
453 [in_r
] "+r" (in_r
), [in_r2
] "+r" (in_r2
), [out_r
] "+r" (out_r
), [out_r2
] "+r" (out_r2
)
455 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );