From bf4baee6e04472f749a927ff9aea07461e3e2fbc Mon Sep 17 00:00:00 2001 From: Buschel Date: Fri, 9 Jul 2010 18:32:37 +0000 Subject: [PATCH] Submit FS#11461. Major speedup for aac he profile (PP5002 +20%, PP5020 +15%, PP5022 +19%, MCF5249 +35%, MCF5250 +80%), still not realtime on most targets though. This change does a lot of refactoring in the sbr filters and the dct, switching to our optimized codeclib fft and tweaking IRAM usage. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27358 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libfaad/common.c | 2 +- apps/codecs/libfaad/sbr_dct.c | 345 ++++++------------------------------- apps/codecs/libfaad/sbr_dct.h | 2 +- apps/codecs/libfaad/sbr_dec.c | 6 +- apps/codecs/libfaad/sbr_qmf.c | 371 +++++++++++++++------------------------- apps/codecs/libfaad/sbr_qmf_c.h | 2 +- apps/codecs/libfaad/specrec.c | 18 +- 7 files changed, 202 insertions(+), 544 deletions(-) diff --git a/apps/codecs/libfaad/common.c b/apps/codecs/libfaad/common.c index 025c8f8c5..c838c88d3 100644 --- a/apps/codecs/libfaad/common.c +++ b/apps/codecs/libfaad/common.c @@ -319,7 +319,7 @@ static const uint32_t pow2_tab[] ICONST_ATTR = { UFIX_CONST(2.000000000000000,POWTBL_PRECIS) }; -static const real_t log2_tab[] ICONST_ATTR = { +static const real_t log2_tab[] ICONST_ATTR_FAAD_LARGE_IRAM = { REAL_CONST(0.000000000000000), REAL_CONST(0.022367813028455), REAL_CONST(0.044394119358453), REAL_CONST(0.066089190457772), REAL_CONST(0.087462841250339), REAL_CONST(0.108524456778169), REAL_CONST(0.129283016944966), REAL_CONST(0.149747119504682), REAL_CONST(0.169925001442312), diff --git a/apps/codecs/libfaad/sbr_dct.c b/apps/codecs/libfaad/sbr_dct.c index c916a82a6..123514f22 100644 --- a/apps/codecs/libfaad/sbr_dct.c +++ b/apps/codecs/libfaad/sbr_dct.c @@ -26,6 +26,9 @@ **/ #include "common.h" +#include "../lib/fft.h" +#include "../lib/mdct_lookup.h" + #ifdef SBR_DEC @@ -1447,267 +1450,9 @@ void DCT2_32_unscaled(real_t *y, real_t *x) y[17] = f286 - f285; } 
-#else - - -#define n 32 -#define log2n 5 - -// w_array_real[i] = cos(2*M_PI*i/32) -static const real_t w_array_real[] = { - FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272), - FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765), - FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169), - FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576), - FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552), - FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553), - FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257), - FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607) -}; - -// w_array_imag[i] = sin(-2*M_PI*i/32) -static const real_t w_array_imag[] = { - FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064), - FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862), - FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512), - FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940), - FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601), - FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016), - FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476), - FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088) -}; - -// FFT decimation in frequency -// 4*16*2+16=128+16=144 multiplications -// 6*16*2+10*8+4*16*2=192+80+128=400 additions -static void fft_dif(real_t * Real, real_t * Imag) -{ - real_t w_real, w_imag; // For faster access - real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access - uint32_t j, i, i2, w_index; // Counters - - // First 2 stages of 32 point FFT decimation in frequency - // 4*16*2=64*2=128 multiplications - // 6*16*2=96*2=192 additions - // Stage 1 of 32 point FFT decimation in frequency - for (i = 0; i < 16; i++) - { - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+16; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - w_real = w_array_real[i]; - w_imag = w_array_imag[i]; - - // 
temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - } - // Stage 2 of 32 point FFT decimation in frequency - for (j = 0, w_index = 0; j < 8; j++, w_index += 2) - { - w_real = w_array_real[w_index]; - w_imag = w_array_imag[w_index]; - - i = j; - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+8; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; +#else /* #ifdef SBR_LOW_POWER */ - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - - i = j+16; - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+8; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - } - - // Stage 3 of 32 point FFT decimation in frequency - // 2*4*2=16 multiplications - // 4*4*2+6*4*2=10*8=80 additions - for (i = 0; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // out[i1] = point1 + point2 - Real[i] += point2_real; - Imag[i] += point2_imag; - - // out[i2] = point1 - point2 - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - w_real = w_array_real[4]; // = 
sqrt(2)/2 - // w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2 - for (i = 1; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = MUL_F(point1_real+point1_imag, w_real); - Imag[i2] = MUL_F(point1_imag-point1_real, w_real); - } - for (i = 2; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * (-i) - Real[i2] = point1_imag - point2_imag; - Imag[i2] = point2_real - point1_real; - } - w_real = w_array_real[12]; // = -sqrt(2)/2 - // w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2 - for (i = 3; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = MUL_F(point1_real-point1_imag, w_real); - Imag[i2] = MUL_F(point1_real+point1_imag, w_real); - } - - - // Stage 4 of 32 point FFT decimation in frequency (no multiplications) - // 16*4=64 additions - for (i = 0; i < n; i += 4) - { - i2 = i+2; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = x[i] - x[i2] - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - for (i = 1; i < n; i += 4) - { - i2 = i+2; - point1_real = Real[i]; - point1_imag = Imag[i]; - - 
point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * (-i) - Real[i2] = point1_imag - point2_imag; - Imag[i2] = point2_real - point1_real; - } - - // Stage 5 of 32 point FFT decimation in frequency (no multiplications) - // 16*4=64 additions - for (i = 0; i < n; i += 2) - { - i2 = i+1; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // out[i1] = point1 + point2 - Real[i] += point2_real; - Imag[i] += point2_imag; - - // out[i2] = point1 - point2 - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - -#ifdef REORDER_IN_FFT - FFTReorder(Real, Imag); -#endif // #ifdef REORDER_IN_FFT -} -#undef n -#undef log2n - -static const real_t dct4_64_tab[] = { +static const real_t dct4_64_tab[] ICONST_ATTR = { COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507), COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537), COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708), @@ -1806,57 +1551,65 @@ static const real_t dct4_64_tab[] = { COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382) }; +// Table adapted from codeclib to fit into IRAM +const uint32_t dct4_revtab[32] ICONST_ATTR = { + 0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17, + 1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16}; + /* size 64 only! 
*/ -void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag) +void dct4_kernel(real_t *real, real_t *imag) { - // Tables with bit reverse values for 5 bits, bit reverse of i at i-th position - const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 }; - uint16_t i, i_rev; + uint32_t i, idx; + real_t x_re, x_im, tmp; + FFTComplex xc[32]; /* used for calling codeclib's fft implementation */ - /* Step 2: modulate */ + /* Step 2: modulate and pre-rotate for codeclib's fft implementation */ // 3*32=96 multiplications // 3*32=96 additions for (i = 0; i < 32; i++) { - real_t x_re, x_im, tmp; - x_re = in_real[i]; - x_im = in_imag[i]; - tmp = MUL_C(x_re + x_im, dct4_64_tab[i]); - in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp; - in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp; + idx = dct4_revtab[i]; + x_re = real[i]; + x_im = imag[i]; + tmp = MUL_C(x_re + x_im, dct4_64_tab[i ]); + xc[idx].re = MUL_C(x_im , dct4_64_tab[i + 64]) + tmp; + xc[idx].im = MUL_C(x_re , dct4_64_tab[i + 32]) + tmp; } - /* Step 3: FFT, but with output in bit reverse order */ - fft_dif(in_real, in_imag); + /* Step 3: FFT (codeclib's implementation) */ + ff_fft_calc_c(5, xc); - /* Step 4: modulate + bitreverse reordering */ + /* Step 4: modulate + reordering */ // 3*31+2=95 multiplications // 3*31+2=95 additions - for (i = 0; i < 16; i++) + x_re = xc[0].re; + x_im = xc[0].im; + tmp = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]); + real[0] = MUL_C(x_im , dct4_64_tab[0 + 5*32]) + tmp; + imag[0] = MUL_C(x_re , dct4_64_tab[0 + 4*32]) + tmp; + for (i = 1; i < 16; i++) { - real_t x_re, x_im, tmp; - i_rev = bit_rev_tab[i]; - x_re = in_real[i_rev]; - x_im = in_imag[i_rev]; - - tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); - out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp; - out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp; + idx = 32-i; + x_re = xc[idx].re; + x_im = xc[idx].im; + tmp = 
MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); + real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp; + imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp; } - // i = 16, i_rev = 1 = rev(16); - out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]); - out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]); + // i = 16, idx = 16 = reorder_tab[16]; + x_re = xc[16].re; + x_im = xc[16].im; + imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]); + real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]); for (i = 17; i < 32; i++) { - real_t x_re, x_im, tmp; - i_rev = bit_rev_tab[i]; - x_re = in_real[i_rev]; - x_im = in_imag[i_rev]; - tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); - out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp; - out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp; + idx = 32-i; + x_re = xc[idx].re; + x_im = xc[idx].im; + tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); + real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp; + imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp; } - } void DST4_32(real_t *y, real_t *x) @@ -2266,6 +2019,6 @@ void DST4_32(real_t *y, real_t *x) y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304); } -#endif +#endif /* #ifdef SBR_LOW_POWER */ -#endif +#endif /* #ifdef SBR_DEC */ diff --git a/apps/codecs/libfaad/sbr_dct.h b/apps/codecs/libfaad/sbr_dct.h index 124f159d5..95394df30 100644 --- a/apps/codecs/libfaad/sbr_dct.h +++ b/apps/codecs/libfaad/sbr_dct.h @@ -32,7 +32,7 @@ extern "C" { #endif -void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag); +void dct4_kernel(real_t *real, real_t *imag); void DCT3_32_unscaled(real_t *y, real_t *x); void DCT4_32(real_t *y, real_t *x); diff --git a/apps/codecs/libfaad/sbr_dec.c b/apps/codecs/libfaad/sbr_dec.c index 97f1d9b64..60bb2a6bd 100644 --- a/apps/codecs/libfaad/sbr_dec.c +++ b/apps/codecs/libfaad/sbr_dec.c @@ -454,6 +454,7 @@ uint8_t sbrDecodeCoupleFrame(sbr_info *sbr, real_t *left_chan, real_t *right_cha 
} ALIGN qmf_t X[MAX_NTSR][64]; + uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel, const uint8_t just_seeked, const uint8_t downSampledSBR) { @@ -520,9 +521,8 @@ uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel, return 0; } - -ALIGN qmf_t X_left[38][64];// = {{0}}; -ALIGN qmf_t X_right[38][64];// = {{0}}; /* must set this to 0 */ +ALIGN qmf_t X_left[MAX_NTSRHFG][64];// = {{0}}; +ALIGN qmf_t X_right[MAX_NTSRHFG][64];// = {{0}}; /* must set this to 0 */ #if (defined(PS_DEC) || defined(DRM_PS)) uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel, diff --git a/apps/codecs/libfaad/sbr_qmf.c b/apps/codecs/libfaad/sbr_qmf.c index 5486cd283..7b70cc6a5 100644 --- a/apps/codecs/libfaad/sbr_qmf.c +++ b/apps/codecs/libfaad/sbr_qmf.c @@ -38,6 +38,16 @@ #include "sbr_qmf_c.h" #include "sbr_syntax.h" +#ifdef FIXED_POINT + #define FAAD_SYNTHESIS_SCALE(X) ((X)>>1) + #define FAAD_ANALYSIS_SCALE1(X) ((X)>>4) + #define FAAD_ANALYSIS_SCALE2(X) ((X)) +#else + #define FAAD_SYNTHESIS_SCALE(X) ((X)*scale) /* float build: expands to the local 'scale' (1.f/64.f) declared in the synthesis functions */ + #define FAAD_ANALYSIS_SCALE1(X) ((X)) + #define FAAD_ANALYSIS_SCALE2(X) (2.*(X)) +#endif + qmfa_info *qmfa_init(uint8_t channels) { qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info)); @@ -68,40 +78,44 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input, { ALIGN real_t u[64]; #ifndef SBR_LOW_POWER - static ALIGN real_t in_real[32], in_imag[32], out_real[32], out_imag[32]; + ALIGN real_t real[32]; + ALIGN real_t imag[32]; #else ALIGN real_t y[32]; #endif - uint16_t in = 0; - uint8_t l; + qmf_t *pX; + uint32_t in = 0; + uint32_t l, idx0, idx1; /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) { - int16_t n; + int32_t n; /* shift input buffer x */ /* input buffer is not shifted anymore, x is implemented as double ringbuffer */ //memmove(qmfa->x + 32, qmfa->x, (320-32)*sizeof(real_t)); /* add new samples to input buffer x */ - for (n = 32 - 1; n >= 0; n--) + idx0 = qmfa->x_index
+ 31; idx1 = idx0 + 320; + for (n = 32 - 1; n >= 0; n-=4) { -#ifdef FIXED_POINT - qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = (input[in++]) >> 4; -#else - qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = input[in++]; -#endif + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); } /* window and summation to create array u */ for (n = 0; n < 64; n++) { - u[n] = MUL_F(qmfa->x[qmfa->x_index + n], qmf_c[2*n]) + - MUL_F(qmfa->x[qmfa->x_index + n + 64], qmf_c[2*(n + 64)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 128], qmf_c[2*(n + 128)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 192], qmf_c[2*(n + 192)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 256], qmf_c[2*(n + 256)]); + idx0 = qmfa->x_index + n; idx1 = n * 2; + u[n] = FAAD_ANALYSIS_SCALE1( + MUL_F(qmfa->x[idx0 ], qmf_c[idx1]) + + MUL_F(qmfa->x[idx0 + 64], qmf_c[idx1 + 2 * 64]) + + MUL_F(qmfa->x[idx0 + 128], qmf_c[idx1 + 2 * 128]) + + MUL_F(qmfa->x[idx0 + 192], qmf_c[idx1 + 2 * 192]) + + MUL_F(qmfa->x[idx0 + 256], qmf_c[idx1 + 2 * 256])); } /* update ringbuffer index */ @@ -123,64 +137,52 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input, { if (n < kx) { -#ifdef FIXED_POINT - QMF_RE(X[l + offset][n]) = u[n] /*<< 1*/; -#else - QMF_RE(X[l + offset][n]) = 2. 
* u[n]; -#endif + QMF_RE(X[l + offset][n]) = FAAD_ANALYSIS_SCALE2(u[n]); } else { QMF_RE(X[l + offset][n]) = 0; } } -#else +#else /* #ifdef SBR_LOW_POWER */ // Reordering of data moved from DCT_IV to here - in_imag[31] = u[1]; - in_real[0] = u[0]; - for (n = 1; n < 31; n++) + idx0 = 30; idx1 = 63; + imag[31] = u[ 1]; real[ 0] = u[ 0]; + for (n = 1; n < 31; n+=3) { - in_imag[31 - n] = u[n+1]; - in_real[n] = -u[64-n]; + imag[idx0--] = u[n+1]; real[n ] = -u[idx1--]; + imag[idx0--] = u[n+2]; real[n+1] = -u[idx1--]; + imag[idx0--] = u[n+3]; real[n+2] = -u[idx1--]; } - in_imag[0] = u[32]; - in_real[31] = -u[33]; + imag[ 0] = u[32]; real[31] = -u[33]; // dct4_kernel is DCT_IV without reordering which is done before and after FFT - dct4_kernel(in_real, in_imag, out_real, out_imag); + dct4_kernel(real, imag); // Reordering of data moved from DCT_IV to here - for (n = 0; n < 16; n++) { - if (2*n+1 < kx) { -#ifdef FIXED_POINT - QMF_RE(X[l + offset][2*n]) = out_real[n]; - QMF_IM(X[l + offset][2*n]) = out_imag[n]; - QMF_RE(X[l + offset][2*n+1]) = -out_imag[31-n]; - QMF_IM(X[l + offset][2*n+1]) = -out_real[31-n]; -#else - QMF_RE(X[l + offset][2*n]) = 2. * out_real[n]; - QMF_IM(X[l + offset][2*n]) = 2. * out_imag[n]; - QMF_RE(X[l + offset][2*n+1]) = -2. * out_imag[31-n]; - QMF_IM(X[l + offset][2*n+1]) = -2. * out_real[31-n]; -#endif - } else { - if (2*n < kx) { -#ifdef FIXED_POINT - QMF_RE(X[l + offset][2*n]) = out_real[n]; - QMF_IM(X[l + offset][2*n]) = out_imag[n]; -#else - QMF_RE(X[l + offset][2*n]) = 2. * out_real[n]; - QMF_IM(X[l + offset][2*n]) = 2. 
* out_imag[n]; -#endif - } - else { - QMF_RE(X[l + offset][2*n]) = 0; - QMF_IM(X[l + offset][2*n]) = 0; - } - QMF_RE(X[l + offset][2*n+1]) = 0; - QMF_IM(X[l + offset][2*n+1]) = 0; - } + /* Step 1: Calculate all non-zero pairs */ + pX = X[l + offset]; + for (n = 0; n < kx/2; n++) { + idx0 = 2*n; idx1 = idx0 + 1; + QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n ]); + QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n ]); + QMF_RE(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-imag[31-n]); + QMF_IM(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-real[31-n]); } -#endif + /* Step 2: Calculate a single pair with half zero'ed */ + if (kx&1) { + idx0 = 2*n; idx1 = idx0 + 1; + QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n]); + QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n]); + QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0; + n++; + } + /* Step 3: All other are zero'ed */ + for (; n < 16; n++) { + idx0 = 2*n; idx1 = idx0 + 1; + QMF_RE(pX[idx0]) = QMF_IM(pX[idx0]) = 0; + QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0; + } +#endif /* #ifdef SBR_LOW_POWER */ } } @@ -297,7 +299,7 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 for (k = 0; k < 32; k++) { output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) + + MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[ 64 + 2*k]) + MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) + MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) + MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) + @@ -384,17 +386,26 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 qmfs->v_index = (1280-128); } } -#else +#else /* #ifdef SBR_LOW_POWER */ + +#define FAAD_CMPLX_PRETWIDDLE_SUB(k) \ + (MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - \ + MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k]))) \ + +#define FAAD_CMPLX_PRETWIDDLE_ADD(k) \ + (MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + \ + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k]))) 
\ + void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], real_t *output) { - ALIGN real_t x1[32], x2[32]; + ALIGN real_t x1[32]; + ALIGN real_t x2[32]; #ifndef FIXED_POINT real_t scale = 1.f/64.f; #endif - int16_t n, k, out = 0; - uint8_t l; - + int32_t n, k, idx0, idx1, out = 0; + uint32_t l; /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) @@ -405,43 +416,43 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 /* calculate 64 samples */ /* complex pre-twiddle */ - for (k = 0; k < 32; k++) + for (k = 0; k < 32;) { - x1[k] = MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k])); - x2[k] = MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k])); - -#ifndef FIXED_POINT - x1[k] *= scale; - x2[k] *= scale; -#else - x1[k] >>= 1; - x2[k] >>= 1; -#endif + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; } /* transform */ DCT4_32(x1, x1); DST4_32(x2, x2); - for (n = 0; n < 32; n++) + idx0 = qmfs->v_index; + idx1 = qmfs->v_index + 63; + for (n = 0; n < 32; n+=2) { - qmfs->v[qmfs->v_index + n] = qmfs->v[qmfs->v_index + 640 + n] = -x1[n] + x2[n]; - qmfs->v[qmfs->v_index + 63 - n] = qmfs->v[qmfs->v_index + 640 + 63 - n] = x1[n] + x2[n]; + qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n ] + x2[n ]; idx0++; + qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n ] + x2[n ]; idx1--; + qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n+1] + x2[n+1]; idx0++; + qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n+1] + x2[n+1]; idx1--; } /* calculate 32 output samples and window */ for (k = 0; k < 32; k++) { - output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) + 
- MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 352 + k], qmf_c[320 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 384 + k], qmf_c[384 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 480 + k], qmf_c[448 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 512 + k], qmf_c[512 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 608 + k], qmf_c[576 + 2*k]); + idx0 = qmfs->v_index + k; idx1 = 2*k; + output[out++] = FAAD_SYNTHESIS_SCALE( + MUL_F(qmfs->v[idx0 ], qmf_c[idx1 ]) + + MUL_F(qmfs->v[idx0 + 96], qmf_c[idx1 + 64]) + + MUL_F(qmfs->v[idx0 + 128], qmf_c[idx1 + 128]) + + MUL_F(qmfs->v[idx0 + 224], qmf_c[idx1 + 192]) + + MUL_F(qmfs->v[idx0 + 256], qmf_c[idx1 + 256]) + + MUL_F(qmfs->v[idx0 + 352], qmf_c[idx1 + 320]) + + MUL_F(qmfs->v[idx0 + 384], qmf_c[idx1 + 384]) + + MUL_F(qmfs->v[idx0 + 480], qmf_c[idx1 + 448]) + + MUL_F(qmfs->v[idx0 + 512], qmf_c[idx1 + 512]) + + MUL_F(qmfs->v[idx0 + 608], qmf_c[idx1 + 576])); } /* update ringbuffer index */ @@ -454,31 +465,18 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], real_t *output) { -// ALIGN real_t x1[64], x2[64]; -#ifndef SBR_LOW_POWER - static ALIGN real_t in_real1[32], in_imag1[32], out_real1[32], out_imag1[32]; - static ALIGN real_t in_real2[32], in_imag2[32], out_real2[32], out_imag2[32]; -#endif + ALIGN real_t real1[32]; + ALIGN real_t imag1[32]; + ALIGN real_t real2[32]; + ALIGN real_t imag2[32]; qmf_t * pX; - real_t * pring_buffer_1, * pring_buffer_3; -// real_t * ptemp_1, * ptemp_2; -#ifdef PREFER_POINTERS - // These pointers are used if target platform has autoinc address generators - real_t * pring_buffer_2, * pring_buffer_4; - real_t * pring_buffer_5, * pring_buffer_6; - real_t * pring_buffer_7, * 
pring_buffer_8; - real_t * pring_buffer_9, * pring_buffer_10; - const real_t * pqmf_c_1, * pqmf_c_2, * pqmf_c_3, * pqmf_c_4; - const real_t * pqmf_c_5, * pqmf_c_6, * pqmf_c_7, * pqmf_c_8; - const real_t * pqmf_c_9, * pqmf_c_10; -#endif // #ifdef PREFER_POINTERS + real_t * p_buf_1, * p_buf_3; #ifndef FIXED_POINT real_t scale = 1.f/64.f; #endif - int16_t n, k, out = 0; - uint8_t l; - - + int32_t n, k, idx0, idx1, out = 0; + uint32_t l; + /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) { @@ -487,139 +485,47 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 //memmove(qmfs->v + 128, qmfs->v, (1280-128)*sizeof(real_t)); /* calculate 128 samples */ -#ifndef FIXED_POINT - pX = X[l]; - - in_imag1[31] = scale*QMF_RE(pX[1]); - in_real1[0] = scale*QMF_RE(pX[0]); - in_imag2[31] = scale*QMF_IM(pX[63-1]); - in_real2[0] = scale*QMF_IM(pX[63-0]); - for (k = 1; k < 31; k++) - { - in_imag1[31 - k] = scale*QMF_RE(pX[2*k + 1]); - in_real1[ k] = scale*QMF_RE(pX[2*k ]); - in_imag2[31 - k] = scale*QMF_IM(pX[63 - (2*k + 1)]); - in_real2[ k] = scale*QMF_IM(pX[63 - (2*k )]); - } - in_imag1[0] = scale*QMF_RE(pX[63]); - in_real1[31] = scale*QMF_RE(pX[62]); - in_imag2[0] = scale*QMF_IM(pX[63-63]); - in_real2[31] = scale*QMF_IM(pX[63-62]); - -#else - - pX = X[l]; - - in_imag1[31] = QMF_RE(pX[1]) >> 1; - in_real1[0] = QMF_RE(pX[0]) >> 1; - in_imag2[31] = QMF_IM(pX[62]) >> 1; - in_real2[0] = QMF_IM(pX[63]) >> 1; - for (k = 1; k < 31; k++) + pX = X[l]; /* select the QMF row of subsample l; must be set before the reorder loop reads pX[] */ + for (k = 0; k < 32; k++) { - in_imag1[31 - k] = QMF_RE(pX[2*k + 1]) >> 1; - in_real1[ k] = QMF_RE(pX[2*k ]) >> 1; - in_imag2[31 - k] = QMF_IM(pX[63 - (2*k + 1)]) >> 1; - in_real2[ k] = QMF_IM(pX[63 - (2*k )]) >> 1; + idx0 = 2*k; idx1 = idx0+1; + real1[ k] = QMF_RE(pX[idx0]); imag2[ k] = QMF_IM(pX[idx0]); + imag1[31-k] = QMF_RE(pX[idx1]); real2[31-k] = QMF_IM(pX[idx1]); } - in_imag1[0] = QMF_RE(pX[63]) >> 1; - in_real1[31] = QMF_RE(pX[62]) >> 1; - in_imag2[0] = QMF_IM(pX[0]) >> 1; - in_real2[31] = QMF_IM(pX[1])
>> 1; - -#endif - - + // dct4_kernel is DCT_IV without reordering which is done before and after FFT - dct4_kernel(in_real1, in_imag1, out_real1, out_imag1); - dct4_kernel(in_real2, in_imag2, out_real2, out_imag2); - - - pring_buffer_1 = qmfs->v + qmfs->v_index; - pring_buffer_3 = pring_buffer_1 + 1280; -#ifdef PREFER_POINTERS - pring_buffer_2 = pring_buffer_1 + 127; - pring_buffer_4 = pring_buffer_1 + (1280 + 127); -#endif // #ifdef PREFER_POINTERS -// ptemp_1 = x1; -// ptemp_2 = x2; -#ifdef PREFER_POINTERS - for (n = 0; n < 32; n ++) - { - //real_t x1 = *ptemp_1++; - //real_t x2 = *ptemp_2++; - // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer - *pring_buffer_1++ = *pring_buffer_3++ = out_real2[n] - out_real1[n]; - *pring_buffer_2-- = *pring_buffer_4-- = out_real2[n] + out_real1[n]; - //x1 = *ptemp_1++; - //x2 = *ptemp_2++; - *pring_buffer_1++ = *pring_buffer_3++ = out_imag2[31-n] + out_imag1[31-n]; - *pring_buffer_2-- = *pring_buffer_4-- = out_imag2[31-n] - out_imag1[31-n]; - } -#else // #ifdef PREFER_POINTERS + dct4_kernel(real1, imag1); + dct4_kernel(real2, imag2); + + p_buf_1 = qmfs->v + qmfs->v_index; + p_buf_3 = p_buf_1 + 1280; + idx0 = 0; idx1 = 127; for (n = 0; n < 32; n++) { - // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer - pring_buffer_1[2*n] = pring_buffer_3[2*n] = out_real2[n] - out_real1[n]; - pring_buffer_1[127-2*n] = pring_buffer_3[127-2*n] = out_real2[n] + out_real1[n]; - pring_buffer_1[2*n+1] = pring_buffer_3[2*n+1] = out_imag2[31-n] + out_imag1[31-n]; - pring_buffer_1[127-(2*n+1)] = pring_buffer_3[127-(2*n+1)] = out_imag2[31-n] - out_imag1[31-n]; + p_buf_1[idx0] = p_buf_3[idx0] = real2[ n] - real1[ n]; idx0++; + p_buf_1[idx1] = p_buf_3[idx1] = real2[ n] + real1[ n]; idx1--; + p_buf_1[idx0] = p_buf_3[idx0] = imag2[31-n] + imag1[31-n]; idx0++; + p_buf_1[idx1] = p_buf_3[idx1] = imag2[31-n] - imag1[31-n]; idx1--; } -#endif // #ifdef PREFER_POINTERS - - pring_buffer_1 = qmfs->v + qmfs->v_index; 
-#ifdef PREFER_POINTERS - pring_buffer_2 = pring_buffer_1 + 192; - pring_buffer_3 = pring_buffer_1 + 256; - pring_buffer_4 = pring_buffer_1 + (256 + 192); - pring_buffer_5 = pring_buffer_1 + 512; - pring_buffer_6 = pring_buffer_1 + (512 + 192); - pring_buffer_7 = pring_buffer_1 + 768; - pring_buffer_8 = pring_buffer_1 + (768 + 192); - pring_buffer_9 = pring_buffer_1 + 1024; - pring_buffer_10 = pring_buffer_1 + (1024 + 192); - pqmf_c_1 = qmf_c; - pqmf_c_2 = qmf_c + 64; - pqmf_c_3 = qmf_c + 128; - pqmf_c_4 = qmf_c + 192; - pqmf_c_5 = qmf_c + 256; - pqmf_c_6 = qmf_c + 320; - pqmf_c_7 = qmf_c + 384; - pqmf_c_8 = qmf_c + 448; - pqmf_c_9 = qmf_c + 512; - pqmf_c_10 = qmf_c + 576; -#endif // #ifdef PREFER_POINTERS + p_buf_1 = qmfs->v + qmfs->v_index; /* calculate 64 output samples and window */ for (k = 0; k < 64; k++) { -#ifdef PREFER_POINTERS - output[out++] = - MUL_F(*pring_buffer_1++, *pqmf_c_1++) + - MUL_F(*pring_buffer_2++, *pqmf_c_2++) + - MUL_F(*pring_buffer_3++, *pqmf_c_3++) + - MUL_F(*pring_buffer_4++, *pqmf_c_4++) + - MUL_F(*pring_buffer_5++, *pqmf_c_5++) + - MUL_F(*pring_buffer_6++, *pqmf_c_6++) + - MUL_F(*pring_buffer_7++, *pqmf_c_7++) + - MUL_F(*pring_buffer_8++, *pqmf_c_8++) + - MUL_F(*pring_buffer_9++, *pqmf_c_9++) + - MUL_F(*pring_buffer_10++, *pqmf_c_10++); -#else // #ifdef PREFER_POINTERS - output[out++] = - MUL_F(pring_buffer_1[k+0], qmf_c[k+0]) + - MUL_F(pring_buffer_1[k+192], qmf_c[k+64]) + - MUL_F(pring_buffer_1[k+256], qmf_c[k+128]) + - MUL_F(pring_buffer_1[k+(256+192)], qmf_c[k+192]) + - MUL_F(pring_buffer_1[k+512], qmf_c[k+256]) + - MUL_F(pring_buffer_1[k+(512+192)], qmf_c[k+320]) + - MUL_F(pring_buffer_1[k+768], qmf_c[k+384]) + - MUL_F(pring_buffer_1[k+(768+192)], qmf_c[k+448]) + - MUL_F(pring_buffer_1[k+1024], qmf_c[k+512]) + - MUL_F(pring_buffer_1[k+(1024+192)], qmf_c[k+576]); -#endif // #ifdef PREFER_POINTERS + output[out++] = FAAD_SYNTHESIS_SCALE( + MUL_F(p_buf_1[k ], qmf_c[k ]) + + MUL_F(p_buf_1[k+ 192 ], qmf_c[k+ 64]) + + MUL_F(p_buf_1[k+ 
256 ], qmf_c[k+128]) + + MUL_F(p_buf_1[k+ 256+192], qmf_c[k+192]) + + MUL_F(p_buf_1[k+ 512 ], qmf_c[k+256]) + + MUL_F(p_buf_1[k+ 512+192], qmf_c[k+320]) + + MUL_F(p_buf_1[k+ 768 ], qmf_c[k+384]) + + MUL_F(p_buf_1[k+ 768+192], qmf_c[k+448]) + + MUL_F(p_buf_1[k+1024 ], qmf_c[k+512]) + + MUL_F(p_buf_1[k+1024+192], qmf_c[k+576])); } /* update ringbuffer index */ @@ -628,6 +533,6 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6 qmfs->v_index = (1280 - 128); } } -#endif +#endif /* #ifdef SBR_LOW_POWER */ -#endif +#endif /* #ifdef SBR_DEC */ diff --git a/apps/codecs/libfaad/sbr_qmf_c.h b/apps/codecs/libfaad/sbr_qmf_c.h index 19592a7ff..150d72e1a 100644 --- a/apps/codecs/libfaad/sbr_qmf_c.h +++ b/apps/codecs/libfaad/sbr_qmf_c.h @@ -38,7 +38,7 @@ extern "C" { #pragma warning(disable:4244) #endif -ALIGN static const real_t qmf_c[640] = { +ALIGN static const real_t qmf_c[640] ICONST_ATTR_FAAD_LARGE_IRAM = { FRAC_CONST(0), FRAC_CONST(-0.00055252865047), FRAC_CONST(-0.00056176925738), FRAC_CONST(-0.00049475180896), FRAC_CONST(-0.00048752279712), FRAC_CONST(-0.00048937912498), diff --git a/apps/codecs/libfaad/specrec.c b/apps/codecs/libfaad/specrec.c index 74bf1f36f..d21a92338 100644 --- a/apps/codecs/libfaad/specrec.c +++ b/apps/codecs/libfaad/specrec.c @@ -458,14 +458,14 @@ static INLINE real_t iquant(int16_t q, const real_t *tab, uint8_t *error) if (q < 0) { /* tab contains a value for all possible q [0,8192] */ - if (-q < IQ_TABLE_SIZE) + if (LIKELY(-q < IQ_TABLE_SIZE)) return -tab[-q]; *error = 17; return 0; } else { /* tab contains a value for all possible q [0,8192] */ - if (q < IQ_TABLE_SIZE) + if (LIKELY(q < IQ_TABLE_SIZE)) return tab[q]; *error = 17; @@ -523,17 +523,17 @@ ALIGN static const real_t pow2sf_tab[] = { - Within a scalefactor window band, the coefficients are in ascending spectral order. 
*/ +ALIGN static const real_t pow2_table[] ICONST_ATTR = +{ + COEF_CONST(1.0), + COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */ + COEF_CONST(1.4142135623730950488016887242097), /* 2^0.50 */ + COEF_CONST(1.6817928305074290860622509524664) /* 2^0.75 */ +}; static uint8_t quant_to_spec(NeAACDecHandle hDecoder, ic_stream *ics, int16_t *quant_data, real_t *spec_data, uint16_t frame_len) { - ALIGN static const real_t pow2_table[] ICONST_ATTR = - { - COEF_CONST(1.0), - COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */ - COEF_CONST(1.4142135623730950488016887242097), /* 2^0.5 */ - COEF_CONST(1.6817928305074290860622509524664) /* 2^0.75 */ - }; const real_t *tab = iq_table; (void)frame_len; -- 2.11.4.GIT