Some asm for the MDCT on Coldfire; speeds up Vorbis decoding by about 0.3MHz
[kugel-rb.git] / apps / codecs / lib / mdct.c
/*
 * Fixed Point IMDCT
 * Copyright (c) 2002 The FFmpeg Project.
 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "codeclib.h"
#include "mdct.h"
#include "asm_arm.h"
#include "asm_mcf5249.h"
#include "codeclib_misc.h"
#include "mdct_lookup.h"
#ifndef ICODE_ATTR_TREMOR_MDCT
#define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
#endif
/**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
 * thus excluding the parts that can be derived by symmetry.
 * @param output N/2 samples
 * @param input N/2 samples
 *
 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
 * not overlap or intersect at all)
 */
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    int n8, n4, n2, n, j;
    const fixed32 *in1, *in2;

    n = 1 << nbits;

    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    /* revtab comes from the fft; the revtab table is sized for an N=4096
       fft (2^12 points).  Our fft is N/4 points, i.e. 2^(nbits-2), so the
       shift we need is 12-(nbits-2) = 14-nbits. */
    const int revtab_shift = (14 - nbits);
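    /* Worked example (illustrative): for a 2048-point MDCT, nbits = 11, the
       fft is 2^9 = 512 points, and each revtab entry is shifted right by
       14 - 11 = 3. */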
    /* bitreverse reorder the input and rotate; result here is in OUTPUT ... */
    /* (note that when using the current split radix, the bitreverse ordering is
       complex, meaning that this reordering cannot easily be done in-place) */
    /* Using the following pdf, you can see that it is possible to replace
       the 'classic' pre/post rotate with an alternative one that enables
       us to use fewer distinct twiddle factors:
       http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf

       For prerotation, the factors are just sin,cos(2PI*i/N)
       For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)

       Therefore, prerotation can immediately reuse the same twiddles as the fft
       (for postrotation it's still a bit complex: we reuse the fft trig tables
       where we can, or use a special table for N=2048, or interpolate between
       trig tables for N>2048)
    */
    const int32_t *T = sincos_lookup0;
    const int step = 2<<(12-nbits);
    const uint16_t * p_revtab=revtab;
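    /* XNPROD31(a, b, t, v, &x, &y) (defined in codeclib_misc.h, with asm
       versions in the asm_* headers) is a Q31 complex multiply:
       x = a*t - b*v, y = b*t + a*v.  Each step below therefore rotates one
       conjugate pair of input coefficients and scatters the result to its
       bit-reversed position z[j]. */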
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im );
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
    }
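    /* Second half of the pre-rotation: T has reached the top of the table
       (angle PI/4), so walk it back down with the roles of the sin and cos
       entries swapped (sin(PI/2 - x) = cos(x)) to cover angles PI/4..PI/2.
       That is why the T[0]/T[1] argument order below differs from the loop
       above. */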
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
    }
    /* ... and so the fft runs in the OUTPUT buffer */
    ff_fft_calc_c(nbits-2, z);
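    /* Note: the fft is N/4 complex points, hence nbits-2; e.g. 512 points
       for a 2048-sample MDCT. */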
    /* post rotation + reordering; now keeps the result within the OUTPUT buffer */
    switch( nbits )
    {
        default:
        {
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
            int magic_step = step>>2;
            int newstep;
            if(n<=1024)
            {
                T = sincos_lookup0 + magic_step;
                newstep = step>>1;
            }
            else
            {
                T = sincos_lookup1;
                newstep = 2;
            }
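            /* Sketch of the table choice: for n <= 1024 the postrotation
               angles (i+1/4)*2PI/N still land on entries of the fft trig
               table sincos_lookup0, just at an offset (magic_step = step/4,
               which is a whole number of entries only when step >= 8, i.e.
               n <= 1024).  For n = 2048 they fall halfway between its
               entries, so the half-step table sincos_lookup1 is used
               directly. */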
            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1 ); T+=newstep;
                XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0 ); T+=newstep;
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
            }

            break;
        }
        case 12: /* n=4096 */
        {
            /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
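            /* (sketch of the rationale: sincos_lookup1 is offset from
               sincos_lookup0 by half a table step, so averaging staggered
               entries yields twiddles at quarter-step angles, which is the
               granularity n=4096 needs; see mdct_lookup.h) */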
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]>>1; t1 = T[1]>>1;

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                t0 += (v0 = (V[0]>>1));
                t1 += (v1 = (V[1]>>1));
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
                T+=2;
                v0 += (t0 = (T[0]>>1));
                v1 += (t1 = (T[1]>>1));
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }
        case 13: /* n = 8192 */
        {
            /* weighted linear interpolation between sincos_lookup0 and sincos_lookup1,
               specifically: 25:75 for first twiddle and 75:25 for second twiddle */
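            /* (sketch of the trick below: q = (v-t)>>1 is half the gap
               between staggered entries of the two tables, so t+q and v-q
               are midpoints of adjacent staggered pairs; alternating between
               T and V advances the effective twiddle angle in half-step
               increments, giving the 2048 distinct twiddles n=8192 needs) */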
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1,q0,q1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]; t1 = T[1];

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                v0 = V[0]; v1 = V[1];
                t0 += (q0 = (v0-t0)>>1);
                t1 += (q1 = (v1-t1)>>1);
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1 );
                t0 = v0-q0;
                t1 = v1-q1;
                XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                T+=2;

                t0 = T[0]; t1 = T[1];
                v0 += (q0 = (t0-v0)>>1);
                v1 += (q1 = (t1-v1)>>1);
                XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1 );
                v0 = t0-q0;
                v1 = t1-q1;
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0 );
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }
    }
}
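/* Usage sketch (illustrative only, not part of this file; buffer names are
   hypothetical): for a 2048-point inverse MDCT, nbits = 11 and both buffers
   hold N/2 = 1024 fixed32 words and must not overlap:

       fixed32 coeffs[1024], half[1024];
       ff_imdct_half(11, half, coeffs);
*/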
/**
 * Compute the inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 * "In-place" processing can be achieved provided that:
 *            [0  ..  N/2-1 | N/2  ..  N-1 ]
 *            <----input---->
 *            <-----------output----------->
 *
 * The result of ff_imdct_half is to put the 'half' imdct here
 *
 *                           N/2          N-1
 *                           <--half imdct-->
 *
 * We want it here for the full imdct:
 *            N/4       3N/4-1
 *            <-------------->
 *
 * In addition we need to apply two symmetries to get the full imdct:
 *
 *        <AAAAAA>        <DDDDDD>
 *                <BBBBBB><CCCCCC>
 *
 * D is a reflection of C
 * A is a reflection of B (but with sign flipped)
 *
 * We process the symmetries at the same time as we 'move' the half imdct
 * from [N/2,N-1] to [N/4,3N/4-1]
 *
 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
 * This would require being able to use revtab 'inplace' (since the input
 * and output of imdct_half would then overlap somewhat)
 */
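/* Reference sketch of the two symmetries (illustrative only; half[] is a
   hypothetical copy of the N/2-sample ff_imdct_half result, with
   B = half[0..N/4) and C = half[N/4..N/2) in the notation above):

       for (i = 0; i < n4; i++) {
           output[n4 + i]     = half[i];         // B lands at [N/4 .. N/2)
           output[n4 - 1 - i] = -half[i];        // A: B reflected, sign-flipped
           output[n2 + i]     = half[n4 + i];    // C lands at [N/2 .. 3N/4)
           output[n - 1 - i]  = half[n4 + i];    // D: reflection of C
       }

   The loops below implement this, but working in place within output[]. */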
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
#ifndef CPU_ARM
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    /* tell imdct_half to put its N/2 samples at [N/2..N-1], i.e. output+n2 */
    ff_imdct_half(nbits,output+n2,input);
    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    /* Copy BBBB to AAAA, reflected and sign-flipped.
       Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
    out_r = output;
    out_r2 = output+n2-8;
    in_r = output+n2+n4-8;
    while(out_r<out_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            /* load 8 words of B, store them unchanged at their destination,
               then write them out negated in reverse order to build A */
            "movem.l (%[in_r]), %%d0-%%d7\n\t"
            "movem.l %%d0-%%d7, (%[out_r2])\n\t"
            "neg.l %%d7\n\t"
            "move.l %%d7, (%[out_r])+\n\t"
            "neg.l %%d6\n\t"
            "move.l %%d6, (%[out_r])+\n\t"
            "neg.l %%d5\n\t"
            "move.l %%d5, (%[out_r])+\n\t"
            "neg.l %%d4\n\t"
            "move.l %%d4, (%[out_r])+\n\t"
            "neg.l %%d3\n\t"
            "move.l %%d3, (%[out_r])+\n\t"
            "neg.l %%d2\n\t"
            "move.l %%d2, (%[out_r])+\n\t"
            "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
            "neg.l %%d1\n\t"
            "move.l %%d1, (%[out_r])+\n\t"
            "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
            "neg.l %%d0\n\t"
            "move.l %%d0, (%[out_r])+\n\t"
            : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
#else
        out_r[0] = -(out_r2[7] = in_r[7]);
        out_r[1] = -(out_r2[6] = in_r[6]);
        out_r[2] = -(out_r2[5] = in_r[5]);
        out_r[3] = -(out_r2[4] = in_r[4]);
        out_r[4] = -(out_r2[3] = in_r[3]);
        out_r[5] = -(out_r2[2] = in_r[2]);
        out_r[6] = -(out_r2[1] = in_r[1]);
        out_r[7] = -(out_r2[0] = in_r[0]);
        in_r -= 8;
        out_r += 8;
        out_r2 -= 8;
#endif
    }

    in_r = output + n2+n4;
    in_r2 = output + n-4;
    out_r = output + n2;
    out_r2 = output + n2 + n4 - 4;
    while(in_r<in_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            /* copy 4 words from the bottom of C down to out_r and 4 words
               from the top of C to out_r2, then write both chunks back
               reversed to reflect C into D in place */
            "movem.l (%[in_r]), %%d0-%%d3\n\t"
            "movem.l %%d0-%%d3, (%[out_r])\n\t"
            "movem.l (%[in_r2]), %%d4-%%d7\n\t"
            "movem.l %%d4-%%d7, (%[out_r2])\n\t"
            "move.l %%d0, %%a3\n\t"
            "move.l %%d3, %%d0\n\t"
            "move.l %%d1, %%d3\n\t"
            "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
            "move.l %%d7, %%d1\n\t"
            "move.l %%d6, %%d2\n\t"
            "move.l %%d5, %%d3\n\t"
            "movem.l %%d1-%%d4, (%[in_r])\n\t"
            "lea.l (4*4, %[in_r]), %[in_r]\n\t"
            "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
            "lea.l (4*4, %[out_r]), %[out_r]\n\t"
            "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
            : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
              [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
#else
        register fixed32 t0,t1,t2,t3;
        register fixed32 s0,s1,s2,s3;

        /* Copy and reflect CCCC to DDDD.  Because CCCC is already where
           we actually want to put DDDD this is a bit complicated.
         * So simultaneously do the following things:
         * 1. copy range from [n2+n4 .. n-1] to range [n2 .. n2+n4-1]
         * 2. reflect range from [n2+n4 .. n-1] in place
         *
         *      [             |              ]
         *       ^a ->   <- ^b ^c ->    <- ^d
         *
         * #1: copy from ^c to ^a
         * #2: copy from ^d to ^b
         * #3: swap ^c and ^d in place
         */
        /* #1 pt1 : load 4 words from ^c. */
        t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
        /* #1 pt2 : write to ^a */
        out_r[0]=t0; out_r[1]=t1; out_r[2]=t2; out_r[3]=t3;
        /* #2 pt1 : load 4 words from ^d */
        s0=in_r2[0]; s1=in_r2[1]; s2=in_r2[2]; s3=in_r2[3];
        /* #2 pt2 : write to ^b */
        out_r2[0]=s0; out_r2[1]=s1; out_r2[2]=s2; out_r2[3]=s3;
        /* #3 pt1 : write words from #2 to ^c */
        in_r[0]=s3; in_r[1]=s2; in_r[2]=s1; in_r[3]=s0;
        /* #3 pt2 : write words from #1 to ^d */
        in_r2[0]=t3; in_r2[1]=t2; in_r2[2]=t1; in_r2[3]=t0;

        in_r += 4;
        in_r2 -= 4;
        out_r += 4;
        out_r2 -= 4;
#endif
    }
}
#else
/* Follows the same structure as the canonical version above */
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    out_r = output;
    out_r2 = output+n2;
    in_r = output+n2+n4;
    while(out_r<out_r2)
    {
        asm volatile(
            /* load 8 words of B (moving downwards), store them unchanged
               below out_r2, then store them negated and reversed to build A */
            "ldmdb %[in_r]!, {r0-r7}\n\t"
            "stmdb %[out_r2]!, {r0-r7}\n\t"
            "rsb r8,r0,#0\n\t"
            "rsb r0,r7,#0\n\t"
            "rsb r7,r1,#0\n\t"
            "rsb r1,r6,#0\n\t"
            "rsb r6,r2,#0\n\t"
            "rsb r2,r5,#0\n\t"
            "rsb r5,r3,#0\n\t"
            "rsb r3,r4,#0\n\t"
            "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
            : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }

    in_r = output + n2+n4;
    in_r2 = output + n;
    out_r = output + n2;
    out_r2 = output + n2 + n4;
    while(in_r<in_r2)
    {
        asm volatile(
            /* copy 4 words from the bottom of C to out_r and 4 from the top
               of C to out_r2, then write both chunks back reversed to
               reflect C into D in place */
            "ldmia %[in_r], {r0-r3}\n\t"
            "stmia %[out_r]!, {r0-r3}\n\t"
            "ldmdb %[in_r2], {r5-r8}\n\t"
            "stmdb %[out_r2]!, {r5-r8}\n\t"
            "mov r4,r0\n\t"
            "mov r0,r3\n\t"
            "mov r3,r1\n\t"
            "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
            "mov r4,r8\n\t"
            "mov r8,r5\n\t"
            "mov r5,r7\n\t"
            "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
            :
            [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }
}
#endif