From 67ff5d0a2e46ebc267b86442641c86935deef65a Mon Sep 17 00:00:00 2001 From: Buschel Date: Sun, 30 Aug 2009 14:14:22 +0000 Subject: [PATCH] Further performance optimization of the atrac3 decoder. Rework the internal sample representation and usage of dsp routines. For now a quick and dirty solution is used to add a fract part of 2 bits. Through this several buffers and functions as well as copy loops could be removed. Furthermore add some ASM for coldfire and place some additional data in IRAM on PP5022/24 and X5/M5. Speedup on ARM: +3%, speedup on Coldfire: +639%. Both ARM and Coldfire can decode in realtime now. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22561 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/atrac3_rm.c | 9 +++---- apps/codecs/libatrac/atrac3.c | 46 ++++++++++++------------------- apps/codecs/libatrac/atrac3.h | 11 ++++++-- apps/codecs/libatrac/fixp_math.h | 58 +++++++++++++++++++++++----------------- 4 files changed, 64 insertions(+), 60 deletions(-) diff --git a/apps/codecs/atrac3_rm.c b/apps/codecs/atrac3_rm.c index a8610f5764..f3bfa2f801 100644 --- a/apps/codecs/atrac3_rm.c +++ b/apps/codecs/atrac3_rm.c @@ -41,7 +41,6 @@ enum codec_status codec_main(void) static size_t buff_size; int datasize, res, consumed, i, time_offset; uint8_t *bit_buffer; - int16_t outbuf[2048] __attribute__((aligned(32))); uint16_t fs,sps,h; uint32_t packet_count; int scrambling_unit_size, num_units, elapsed = 0; @@ -62,9 +61,9 @@ next_track: init_rm(&rmctx); ci->configure(DSP_SET_FREQUENCY, ci->id3->frequency); - ci->configure(DSP_SET_SAMPLE_DEPTH, 16); + ci->configure(DSP_SET_SAMPLE_DEPTH, 17); /* Remark: atrac3 uses s15.0 by default, s15.2 was hacked. */ ci->configure(DSP_SET_STEREO_MODE, rmctx.nb_channels == 1 ? - STEREO_MONO : STEREO_INTERLEAVED); + STEREO_MONO : STEREO_NONINTERLEAVED); packet_count = rmctx.nb_packets; rmctx.audio_framesize = rmctx.block_align; @@ -145,7 +144,7 @@ seek_start : ci->seek_complete(); } if(pkt.length) - res = atrac3_decode_frame(&rmctx,&q, outbuf, &datasize, pkt.frames[i], rmctx.block_align); + res = atrac3_decode_frame(&rmctx, &q, &datasize, pkt.frames[i], rmctx.block_align); else /* indicates that there are no remaining frames */ goto done; @@ -155,7 +154,7 @@ seek_start : } if(datasize) - ci->pcmbuf_insert(outbuf, NULL, q.samples_per_frame / rmctx.nb_channels); + ci->pcmbuf_insert(q.outSamples, q.outSamples + 1024, q.samples_per_frame / rmctx.nb_channels); elapsed = rmctx.audiotimestamp+(1000*8*sps/rmctx.bit_rate)*i; ci->set_elapsed(elapsed); rmctx.frame_number++; diff --git a/apps/codecs/libatrac/atrac3.c b/apps/codecs/libatrac/atrac3.c index dd00224e48..cd49aec348 100644 --- a/apps/codecs/libatrac/atrac3.c +++ b/apps/codecs/libatrac/atrac3.c @@ -55,18 +55,9 @@ #define FFMIN(a,b) ((a) > (b) ? (b) : (a)) #define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) -/** - * Clips a signed integer value into the -32768,32767 range. - */ -static inline int16_t av_clip_int16(int a) -{ - if ((a+32768) & ~65535) return (a>>31) ^ 32767; - else return a; -} - static int32_t qmf_window[48] IBSS_ATTR; static VLC spectral_coeff_tab[7]; -static channel_unit channel_units[2]; +static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM; /** * Matrixing within quadrature mirror synthesis filter. @@ -516,7 +507,6 @@ static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOu int32_t gain1, gain2, gain_inc; int cnt, numdata, nsample, startLoc, endLoc; - if (pGain2->num_gain_data == 0) gain1 = ONE_16; else @@ -735,7 +725,16 @@ static int decodeChannelSoundUnit (GetBitContext *gb, channel_unit *pSnd, int32_ numBands = (subbandTab[numSubbands] - 1) >> 8; if (lastTonal >= 0) numBands = FFMAX((lastTonal + 256) >> 8, numBands); - + + /* Remark: Hardcoded hack to add 2 bits (empty) fract part to internal sample + * representation. Needed for higher accuracy in internal calculations as + * well as for DSP configuration. See also: ../atrac3_rm.c, DSP_SET_SAMPLE_DEPTH + * Todo: Check spectral requantisation for using and outputting samples with + * fract part. */ + int32_t i; + for (i=0; i<1024; ++i) { + pSnd->spectrum[i] <<= 2; + } /* Reconstruct time domain samples. */ for (band=0; band<4; band++) { @@ -863,11 +862,9 @@ static int decodeFrame(ATRAC3Context *q, const uint8_t* databuf, int off) */ int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q, - void *data, int *data_size, - const uint8_t *buf, int buf_size) { - int result = 0, off = 0, i; + int *data_size, const uint8_t *buf, int buf_size) { + int result = 0, off = 0; const uint8_t* databuf; - int16_t* samples = data; if (buf_size < rmctx->block_align) return buf_size; @@ -887,19 +884,10 @@ int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q, return -1; } - if (q->channels == 1) { - /* mono */ - for (i = 0; i<1024; i++) - samples[i] = av_clip_int16(q->outSamples[i]); - *data_size = 1024 * sizeof(int16_t); - } else { - /* stereo */ - for (i = 0; i < 1024; i++) { - samples[i*2] = av_clip_int16(q->outSamples[i]); - samples[i*2+1] = av_clip_int16(q->outSamples[1024+i]); - } - *data_size = 2048 * sizeof(int16_t); - } + if (q->channels == 1) + *data_size = 1024 * sizeof(int32_t); + else + *data_size = 2048 * sizeof(int32_t); return rmctx->block_align; } diff --git a/apps/codecs/libatrac/atrac3.h b/apps/codecs/libatrac/atrac3.h index f81fc0a734..a817db2b55 100644 --- a/apps/codecs/libatrac/atrac3.h +++ b/apps/codecs/libatrac/atrac3.h @@ -1,6 +1,14 @@ #include "ffmpeg_bitstream.h" #include "../librm/rm.h" +#if (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) || (CONFIG_CPU == MCF5250) +/* PP5022/24 and MCF5250 have larger IRAM */ +#define IBSS_ATTR_LARGE_IRAM IBSS_ATTR +#else +/* other CPUs IRAM is not large enough */ +#define IBSS_ATTR_LARGE_IRAM +#endif + /* These structures are needed to store the parsed gain control data. */ typedef struct { int num_gain_data; @@ -75,6 +83,5 @@ typedef struct { int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx); int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q, - void *data, int *data_size, - const uint8_t *buf, int buf_size); + int *data_size, const uint8_t *buf, int buf_size); diff --git a/apps/codecs/libatrac/fixp_math.h b/apps/codecs/libatrac/fixp_math.h index 88cb5e4b66..5174cc7cc6 100644 --- a/apps/codecs/libatrac/fixp_math.h +++ b/apps/codecs/libatrac/fixp_math.h @@ -36,17 +36,38 @@ : "r"(X),"r"(Y)); \ low; \ }) - - #define fixmul32(X,Y) \ - ({ \ - int32_t low; \ - int32_t high; \ - asm volatile ( /* calculates: result = (X*Y)>>32 */ \ - "smull %0,%1,%2,%3 \n\t" /* 64 = 32x32 multiply */ \ - : "=&r"(low), "=&r" (high) \ - : "r"(X),"r"(Y)); \ - high; \ - }) +#elif defined(CPU_COLDFIRE) + #define fixmul16(X,Y) \ + ({ \ + int32_t t1, t2; \ + asm volatile ( \ + "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \ + "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \ + "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \ + "moveq.l #15,%[t2] \n\t" \ + "asl.l %[t2],%[t1] \n\t" /* hi <<= 15, plus one free */ \ + "moveq.l #16,%[t2] \n\t" \ + "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 16 */ \ + "or.l %[x],%[t1] \n\t" /* combine result */ \ + : /* outputs */ \ + [t1]"=&d"(t1), \ + [t2]"=&d"(t2) \ + : /* inputs */ \ + [x] "d" ((X)), \ + [y] "d" ((Y))); \ + t1; \ + }) + + #define fixmul31(X,Y) \ + ({ \ + int32_t t; \ + asm volatile ( \ + "mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \ + "movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \ + : [t] "=d" (t) \ + : [x] "r" ((X)), [y] "r" ((Y))); \ + t; \ + }) #else static inline int32_t fixmul16(int32_t x, int32_t y) { @@ -69,17 +90,6 @@ return (int32_t)temp; } - - static inline int32_t fixmul32(int32_t x, int32_t y) - { - int64_t temp; - temp = x; - temp *= y; - - temp >>= 32; //16+31-16 = 31 bits - - return (int32_t)temp; - } #endif static inline int32_t fixdiv16(int32_t x, int32_t y) @@ -104,13 +114,13 @@ static inline int32_t fastSqrt(int32_t n) /* * Logically, these are unsigned. * We need the sign bit to test - * whether (op - res - one) underflowed. + * whether (op - res - one) underflowed. */ int32_t op, res, one; op = n; res = 0; /* "one" starts at the highest power of four <= than the argument. */ - one = 1 << 30; /* second-to-top bit set */ + one = 1 << 30; /* second-to-top bit set */ while (one > op) one >>= 2; while (one != 0) { -- 2.11.4.GIT