apps/codecs/libfaad/filtbank.c

   1 /*
   2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
   3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
   4 **
   5 ** This program is free software; you can redistribute it and/or modify
   6 ** it under the terms of the GNU General Public License as published by
   7 ** the Free Software Foundation; either version 2 of the License, or
   8 ** (at your option) any later version.
   9 **
  10 ** This program is distributed in the hope that it will be useful,
  11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 ** GNU General Public License for more details.
  14 **
  15 ** You should have received a copy of the GNU General Public License
  16 ** along with this program; if not, write to the Free Software
  17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 **
  19 ** Any non-GPL usage of this software or parts of this software is strictly
  20 ** forbidden.
  21 **
  22 ** Commercial non-GPL licensing of this software is possible.
  23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
  24 **
  25 ** $Id$
  26 **/
  27
  28 #include "common.h"
  29 #include "structs.h"
  30
  31 #include <stdlib.h>
  32 #include <string.h>
  33 #ifdef _WIN32_WCE
  34 #define assert(x)
  35 #else
  36 #include <assert.h>
  37 #endif
  38
  39 #include "filtbank.h"
  40 #include "decoder.h"
  41 #include "syntax.h"
  42 #include "kbd_win.h"
  43 #include "sine_win.h"
  44
  45
   46 /* static variables */
   /* Scratch buffer that ifilter_bank() clears on every call and then fills
    * with the inverse-MDCT output before windowing. It is a single file-scope
    * buffer shared by all calls.
    * NOTE(review): IBSS_ATTR / MEM_ALIGN_ATTR are defined outside this file;
    * presumably they select memory placement and alignment - confirm. */
   47 static real_t transf_buf[2*FRAME_LEN] IBSS_ATTR MEM_ALIGN_ATTR;
   48 #ifdef LTP_DEC
   /* Holds the windowed time-domain input that filter_bank_ltp() hands to the
    * forward MDCT. Only built when LTP_DEC is defined. */
   49 static real_t windowed_buf[2*FRAME_LEN] MEM_ALIGN_ATTR = {0};
   50 #endif
  51
  52
  53 /*Windowing functions borrowed from libwmai*/
  54 #ifdef CPU_ARM
   /*
    * ARM inline-assembly windowed overlap-add:
    *     dst[i] = src2[i] + (hi32(src0[i] * src1[i]) << 1)
    * where hi32() is the upper 32 bits of the signed 64-bit smull product.
    *
    * Two elements are processed per iteration and the loop only terminates
    * when the counter reaches exactly zero, so len must be a non-zero
    * multiple of 2. Elements are accessed as 32-bit words.
    */
   55 static inline
   56 void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
   57 {
   58     /* Block sizes are always power of two */
   59     asm volatile (
   60         "0:"
   61         "ldmia %[d]!, {r0, r1};"      /* r0, r1 = next two src0 values */
   62         "ldmia %[w]!, {r4, r5};"      /* r4, r5 = next two src1 values */
   63         /* consume the first data and window value so we can use those
   64          * registers again */
   65         "smull r8, r9, r0, r4;"       /* r9:r8 = r0 * r4 (r9 = high word) */
   66         "ldmia %[src2]!, {r0, r4};"   /* r0, r4 = next two src2 values */
   67         "add   r0, r0, r9, lsl #1;"  /* r0 = src2 value + (r9 << 1) */
   68         "smull r8, r9, r1, r5;"       /* r9:r8 = r1 * r5 */
   69         "add   r1, r4, r9, lsl #1;"   /* r1 = src2 value + (r9 << 1) */
   70         "stmia %[dst]!, {r0, r1};"    /* store both results, advance dst */
   71         "subs  %[n], %[n], #2;"       /* two elements done */
   72         "bne   0b;"                   /* repeat until the counter hits zero */
   73         : [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len)
   74         :
   75         : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
   76 }
   /*
    * ARM inline-assembly multiply with a reversed second operand:
    *     dst[i] = hi32(src0[i] * src1[len-1-i]) << 1
    * where hi32() is the upper 32 bits of the signed 64-bit smull product.
    *
    * src0 is read forwards while src1 is read backwards from its end. Two
    * elements are processed per iteration and the loop only terminates when
    * the counter reaches exactly zero, so len must be a non-zero multiple
    * of 2. Elements are accessed as 32-bit words.
    */
   77 static inline
   78 void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
   79                          int len)
   80 {
   81     /* Block sizes are always power of two */
   82     asm volatile (
   83         "add   %[s1], %[s1], %[n], lsl #2;"  /* s1 = one past the last of len words */
   84         "0:"
   85         "ldmia %[s0]!, {r0, r1};"     /* r0, r1 = next two src0 values */
   86         "ldmdb %[s1]!, {r4, r5};"     /* step s1 back two words; r5 holds the later one */
   87         "smull r8, r9, r0, r5;"       /* r9:r8 = r0 * r5 */
   88         "mov   r0, r9, lsl #1;"       /* r0 = high word << 1 */
   89         "smull r8, r9, r1, r4;"       /* r9:r8 = r1 * r4 */
   90         "mov   r1, r9, lsl #1;"       /* r1 = high word << 1 */
   91         "stmia %[dst]!, {r0, r1};"    /* store both results, advance dst */
   92         "subs  %[n], %[n], #2;"       /* two elements done */
   93         "bne   0b;"                   /* repeat until the counter hits zero */
   94         : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
   95         :
   96         : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
   97 }
  98
  99 #elif defined(CPU_COLDFIRE)
  /*
   * ColdFire inline-assembly windowed overlap-add: each output is the product
   * of src0[i] and src1[i], formed in a MAC accumulator, plus src2[i].
   *
   * Four elements are processed per iteration (one per accumulator
   * acc0..acc3) and the loop only terminates when the counter reaches exactly
   * zero, so len must be a non-zero multiple of 4.
   *
   * NOTE(review): the scaling of the product read back by movclr.l depends on
   * the MAC unit's operating mode, which is configured outside this file -
   * confirm. The code also appears to rely on acc0..acc3 being zero on entry
   * (each movclr.l leaves them cleared for the next iteration) - confirm.
   */
  100 static inline
  101 void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
  102 {
  103     /* Block sizes are always power of two. Smallest block is always way bigger
  104      * than four too.*/
  105     asm volatile (
  106         "0:"
  107         "movem.l (%[src0]), %%d0-%%d3;"             /* four src0 values */
  108         "movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;"   /* four src1 values */
  109         "mac.l %%d0, %%d4, %%acc0;"
  110         "mac.l %%d1, %%d5, %%acc1;"
  111         "mac.l %%d2, %%a0, %%acc2;"
  112         "mac.l %%d3, %%a1, %%acc3;"
  113         "lea.l (16, %[src0]), %[src0];"             /* advance by four longwords */
  114         "lea.l (16, %[src1]), %[src1];"
  115         "movclr.l %%acc0, %%d0;"                    /* fetch product, clear accumulator */
  116         "movclr.l %%acc1, %%d1;"
  117         "movclr.l %%acc2, %%d2;"
  118         "movclr.l %%acc3, %%d3;"
  119         "movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;"   /* four src2 values */
  120         "lea.l (16, %[src2]), %[src2];"
  121         "add.l %%d4, %%d0;"                         /* product + src2 */
  122         "add.l %%d5, %%d1;"
  123         "add.l %%a0, %%d2;"
  124         "add.l %%a1, %%d3;"
  125         "movem.l %%d0-%%d3, (%[dst]);"              /* store four results */
  126         "lea.l (16, %[dst]), %[dst];"
  127         "subq.l #4, %[n];"                          /* four elements done */
  128         "jne 0b;"
  129         : [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len)
  130         :
  131         : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
  132 }
 133
  /*
   * ColdFire inline-assembly multiply with a reversed second operand: each
   * output is the product of src0[i] and src1[len-1-i], formed in a MAC
   * accumulator.
   *
   * src0 is walked forwards and src1 backwards, four elements at a time; the
   * operand pairing inside each group of four is crossed (d0*a1, d1*a0,
   * d2*d5, d3*d4) to achieve the reversal. The loop only terminates when the
   * counter reaches exactly zero, so len must be a non-zero multiple of 4.
   *
   * NOTE(review): the scaling of the product read back by movclr.l depends on
   * the MAC unit's operating mode, which is configured outside this file -
   * confirm. The code also appears to rely on acc0..acc3 being zero on entry
   * - confirm.
   */
  134 static inline
  135 void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
  136                          int len)
  137 {
  138     /* Block sizes are always power of two. Smallest block is always way bigger
  139      * than four too.*/
  140     asm volatile (
  141         "lea.l (-16, %[s1], %[n]*4), %[s1];"        /* s1 = last four longwords of src1 */
  142         "0:"
  143         "movem.l (%[s0]), %%d0-%%d3;"               /* four src0 values */
  144         "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"     /* four src1 values, a1 is the last */
  145         "mac.l %%d0, %%a1, %%acc0;"
  146         "mac.l %%d1, %%a0, %%acc1;"
  147         "mac.l %%d2, %%d5, %%acc2;"
  148         "mac.l %%d3, %%d4, %%acc3;"
  149         "lea.l (16, %[s0]), %[s0];"                 /* src0 forwards */
  150         "lea.l (-16, %[s1]), %[s1];"                /* src1 backwards */
  151         "movclr.l %%acc0, %%d0;"                    /* fetch product, clear accumulator */
  152         "movclr.l %%acc1, %%d1;"
  153         "movclr.l %%acc2, %%d2;"
  154         "movclr.l %%acc3, %%d3;"
  155         "movem.l %%d0-%%d3, (%[dst]);"              /* store four results */
  156         "lea.l (16, %[dst]), %[dst];"
  157         "subq.l #4, %[n];"                          /* four elements done */
  158         "jne 0b;"
  159         : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
  160         : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
  161 }
 162
 163 #else
 164 static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len){
 165     int i;
 166     for(i=0; i<len; i++)
 167         dst[i] = MUL_F(src0[i], src1[i]) + src2[i];
 168 }
 169
 170 static inline void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, int len){
 171     int i;
 172     src1 += len-1;
 173     for(i=0; i<len; i++)
 174         dst[i] = MUL_F(src0[i], src1[-i]);
 175 }
 176 #endif
 177
 178 #ifdef LTP_DEC
 179 static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
 180 {
 181     mdct_info *mdct = NULL;
 182
 183     switch (len)
 184     {
 185     case 2048:
 186     case 1920:
 187         mdct = fb->mdct2048;
 188         break;
 189     case 256:
 190     case 240:
 191         mdct = fb->mdct256;
 192         break;
 193 #ifdef LD_DEC
 194     case 1024:
 195     case 960:
 196         mdct = fb->mdct1024;
 197         break;
 198 #endif
 199     }
 200
 201     faad_mdct(mdct, in_data, out_data);
 202 }
 203 #endif
 204
 205 void ifilter_bank(uint8_t window_sequence, uint8_t window_shape,
 206                   uint8_t window_shape_prev, real_t *freq_in,
 207                   real_t *time_out, real_t *overlap,
 208                   uint8_t object_type, uint16_t frame_len)
 209 {
 210     int32_t i, idx0, idx1;
 211     real_t win0, win1, win2;
 212
 213     const real_t *window_long       = NULL;
 214     const real_t *window_long_prev  = NULL;
 215     const real_t *window_short      = NULL;
 216     const real_t *window_short_prev = NULL;
 217
 218     int32_t nlong    = frame_len;
 219     int32_t nshort   = frame_len/8;
 220     int32_t nflat_ls = (nlong-nshort)/2;
 221
 222 #ifdef PROFILE
 223     int64_t count = faad_get_ts();
 224 #endif
 225
 226     memset(transf_buf,0,sizeof(transf_buf));
 227     /* select windows of current frame and previous frame (Sine or KBD) */
 228 #ifdef LD_DEC
 229     if (object_type == LD)
 230     {
 231         window_long       = fb->ld_window[window_shape];
 232         window_long_prev  = fb->ld_window[window_shape_prev];
 233     } else {
 234 #else
 235         (void) object_type;
 236 #endif
 237
 238     /* AAC uses two different window shapes depending on spectal features */
 239     if (window_shape == 0) {
 240         window_long  = sine_long_1024;
 241         window_short = sine_short_128;
 242     } else {
 243         window_long  = kbd_long_1024;
 244         window_short = kbd_short_128;
 245     }
 246
 247     if (window_shape_prev == 0) {
 248         window_long_prev  = sine_long_1024;
 249         window_short_prev = sine_short_128;
 250     } else {
 251         window_long_prev  = kbd_long_1024;
 252         window_short_prev = kbd_short_128;
 253     }
 254
 255 #ifdef LD_DEC
 256     }
 257 #endif
 258
 259 #if 0
 260     for (i = 0; i < 1024; i++)
 261     {
 262         printf("%d\n", freq_in[i]);
 263     }
 264 #endif
 265
 266 #if 0
 267     printf("%d %d\n", window_sequence, window_shape);
 268 #endif
 269     switch (window_sequence)
 270     {
 271     case ONLY_LONG_SEQUENCE:
 272         /* perform iMDCT */
 273         ff_imdct_calc(11, transf_buf, freq_in);
 274
 275         /* add second half output of previous frame to windowed output of current frame */
 276         vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap,  nlong);
 277
 278         /* window the second half and save as overlap for next frame */
 279         vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
 280
 281         break;
 282
 283     case LONG_START_SEQUENCE:
 284         /* perform iMDCT */
 285         ff_imdct_calc(11, transf_buf, freq_in);
 286
 287         /* add second half output of previous frame to windowed output of current frame */
 288         vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap,  nlong);
 289
 290         /* window the second half and save as overlap for next frame */
 291         /* construct second half window using padding with 1's and 0's */
 292
 293         memcpy(overlap, transf_buf+nlong, nflat_ls*sizeof(real_t));
 294
 295         vector_fmul_reverse(overlap+nflat_ls, transf_buf+nlong+nflat_ls, window_short, nshort);
 296
 297         memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
 298         break;
 299
 300     case EIGHT_SHORT_SEQUENCE:
 301         /* this could be assemblerized too, but this case is extremely uncommon */
 302
 303         /* perform iMDCT for each short block */
 304         idx0 = 0;       ff_imdct_calc(8, transf_buf            , freq_in       );
 305         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 306         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 307         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 308         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 309         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 310         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 311         idx0 += nshort; ff_imdct_calc(8, transf_buf + (idx0<<1), freq_in + idx0);
 312
 313         /* Add second half output of previous frame to windowed output of current
 314          * frame */
 315         /* Step 1: copy */
 316         memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
 317         /* Step 2: First window half, first half of nshort */
 318         for (i = 0; i < nshort/2; i++) {
 319             win0 = window_short[nshort-1-i];
 320             win1 = window_short[i];
 321             win2 = window_short_prev[i];
 322             idx0 = nflat_ls + i;
 323             idx1 = i;
 324             time_out[idx0] = overlap[idx0] +                                        MUL_F(transf_buf[idx1], win2); idx0 += nshort; idx1 += (nshort<<1);
 325             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 326             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 327             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 328             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
 329         }
 330         /* Step 3: First window half, second half of nshort */
 331         for (; i < nshort; i++) {
 332             win0 = window_short[nshort-1-i];
 333             win1 = window_short[i];
 334             idx0 = nflat_ls + i;
 335             idx1 = i;
 336             time_out[idx0] = overlap[idx0] +                                        MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 337             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 338             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 339             time_out[idx0] = overlap[idx0] + MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1);
 340         }
 341
 342         /* Window the second half and save as overlap for next frame */
 343         /* Step 1: Second window half, first half of nshort */
 344         for (i = 0; i < nshort/2; i++) {
 345             win0 = window_short[nshort-1-i];
 346             win1 = window_short[i];
 347             idx0 = nflat_ls + 5*nshort + i - nlong;
 348             idx1 = nshort*10 + i;
 349             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 350             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 351             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 352             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
 353         }
 354         /* Step 2: Second window half, second half of nshort */
 355         for (; i < nshort; i++) {
 356             win0 = window_short[nshort-1-i];
 357             win1 = window_short[i];
 358             idx0 = nflat_ls + 4*nshort + i - nlong;
 359             idx1 = nshort*8 + i;
 360             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 361             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 362             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 363             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0) + MUL_F(transf_buf[idx1], win1); idx0 += nshort; idx1 += (nshort<<1);
 364             overlap[idx0] = MUL_F(transf_buf[idx1-nshort], win0);
 365         }
 366         /* Step 3: Set to zero */
 367         memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
 368
 369         break;
 370
 371     case LONG_STOP_SEQUENCE:
 372         /* perform iMDCT */
 373         ff_imdct_calc(11, transf_buf, freq_in);
 374
 375         /* add second half output of previous frame to windowed output of current frame */
 376         /* construct first half window using padding with 1's and 0's */
 377         memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
 378
 379         vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls,  nshort);
 380
 381         /* nflat_ls can be divided by 2. */
 382         idx0 = nflat_ls + nshort;
 383         for (i = 0; i < nflat_ls; i+=2) {
 384             time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
 385             time_out[idx0] = overlap[idx0] + transf_buf[idx0]; idx0++;
 386         }
 387
 388         /* window the second half and save as overlap for next frame */
 389         vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
 390         break;
 391     }
 392
 393 #if 0
 394     for (i = 0; i < 1024; i++)
 395     {
 396         printf("%d\n", time_out[i]);
 397         //printf("0x%.8X\n", time_out[i]);
 398     }
 399 #endif
 400
 401
 402 #ifdef PROFILE
 403     count = faad_get_ts() - count;
 404     fb->cycles += count;
 405 #endif
 406 }
 407
 408
 409 #ifdef LTP_DEC
 410 /* only works for LTP -> no overlapping, no short blocks */
 411 void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
 412                      uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
 413                      uint8_t object_type, uint16_t frame_len)
 414 {
 415     int16_t i;
 416
 417     const real_t *window_long = NULL;
 418     const real_t *window_long_prev = NULL;
 419     const real_t *window_short = NULL;
 420     const real_t *window_short_prev = NULL;
 421
 422     uint16_t nlong = frame_len;
 423     uint16_t nshort = frame_len/8;
 424     uint16_t nflat_ls = (nlong-nshort)/2;
 425
 426     //assert(window_sequence != EIGHT_SHORT_SEQUENCE);
 427
 428     memset(windowed_buf,0,sizeof(windowed_buf));
 429 #ifdef LD_DEC
 430     if (object_type == LD)
 431     {
 432         window_long       = fb->ld_window[window_shape];
 433         window_long_prev  = fb->ld_window[window_shape_prev];
 434     } else {
 435 #else
 436         (void) object_type;
 437 #endif
 438         window_long       = fb->long_window[window_shape];
 439         window_long_prev  = fb->long_window[window_shape_prev];
 440         window_short      = fb->short_window[window_shape];
 441         window_short_prev = fb->short_window[window_shape_prev];
 442 #ifdef LD_DEC
 443     }
 444 #endif
 445
 446     switch(window_sequence)
 447     {
 448     case ONLY_LONG_SEQUENCE:
 449         for (i = nlong-1; i >= 0; i--)
 450         {
 451             windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
 452             windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
 453         }
 454         mdct(fb, windowed_buf, out_mdct, 2*nlong);
 455         break;
 456
 457     case LONG_START_SEQUENCE:
 458         for (i = 0; i < nlong; i++)
 459             windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
 460         for (i = 0; i < nflat_ls; i++)
 461             windowed_buf[i+nlong] = in_data[i+nlong];
 462         for (i = 0; i < nshort; i++)
 463             windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
 464         for (i = 0; i < nflat_ls; i++)
 465             windowed_buf[i+nlong+nflat_ls+nshort] = 0;
 466         mdct(fb, windowed_buf, out_mdct, 2*nlong);
 467         break;
 468
 469     case LONG_STOP_SEQUENCE:
 470         for (i = 0; i < nflat_ls; i++)
 471             windowed_buf[i] = 0;
 472         for (i = 0; i < nshort; i++)
 473             windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
 474         for (i = 0; i < nflat_ls; i++)
 475             windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
 476         for (i = 0; i < nlong; i++)
 477             windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
 478         mdct(fb, windowed_buf, out_mdct, 2*nlong);
 479         break;
 480     }
 481 }
 482 #endif