From 40ccde910fdd25ca5d40288349423cc4c8d38caa Mon Sep 17 00:00:00 2001 From: dave Date: Tue, 7 Feb 2006 22:16:35 +0000 Subject: [PATCH] Patch #1426489 - Shorten codec optimisations from Mark Arigo git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8615 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libffmpegFLAC/shndec.c | 190 +++++++++++++++++++++++++++---------- apps/codecs/libffmpegFLAC/shndec.h | 44 ++++++--- apps/codecs/shorten.c | 92 ++++++++---------- 3 files changed, 212 insertions(+), 114 deletions(-) diff --git a/apps/codecs/libffmpegFLAC/shndec.c b/apps/codecs/libffmpegFLAC/shndec.c index 6dde8f7a7..d7fc6a15a 100644 --- a/apps/codecs/libffmpegFLAC/shndec.c +++ b/apps/codecs/libffmpegFLAC/shndec.c @@ -28,12 +28,6 @@ #include "golomb.h" #include "shndec.h" -/* These seem reasonable from my test files. - Does MAX_HEADER_SIZE really need to be 16384? */ -#define MAX_PRED_ORDER 16 -#define MAX_HEADER_SIZE DEFAULT_BLOCK_SIZE*4 -//#define MAX_HEADER_SIZE 16384 - #define ULONGSIZE 2 #define WAVE_FORMAT_PCM 0x0001 @@ -54,16 +48,6 @@ #define V2LPCQOFFSET (1 << LPCQUANT) #define FNSIZE 2 -#define FN_DIFF0 0 -#define FN_DIFF1 1 -#define FN_DIFF2 2 -#define FN_DIFF3 3 -#define FN_QUIT 4 -#define FN_BLOCKSIZE 5 -#define FN_BITSHIFT 6 -#define FN_QLPC 7 -#define FN_ZERO 8 -#define FN_VERBATIM 9 #define VERBATIM_CKSIZE_SIZE 5 #define VERBATIM_BYTE_SIZE 8 @@ -76,22 +60,21 @@ #define get_le16(gb) bswap_16(get_bits_long(gb, 16)) #define get_le32(gb) bswap_32(get_bits_long(gb, 32)) -static inline uint32_t bswap_32(uint32_t x){ +static uint32_t bswap_32(uint32_t x){ x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); return (x>>16) | (x<<16); } -static inline uint16_t bswap_16(uint16_t x){ +static uint16_t bswap_16(uint16_t x){ return (x>>8) | (x<<8); } /* converts fourcc string to int */ -static inline int ff_get_fourcc(const char *s){ +static int ff_get_fourcc(const char *s){ //assert( strlen(s)==4 ); return (s[0]) + (s[1]<<8) + (s[2]<<16) + (s[3]<<24); } -static unsigned int 
get_uint(ShortenContext *s, int k) ICODE_ATTR; static unsigned int get_uint(ShortenContext *s, int k) { if (s->version != 0) @@ -99,10 +82,77 @@ static unsigned int get_uint(ShortenContext *s, int k) return get_ur_golomb_shorten(&s->gb, k); } -static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, - int residual_size, int pred_order) ICODE_ATTR; -static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, - int residual_size, int pred_order) +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) +static void coldfire_lshift_samples(int n, int shift, int32_t *samples) ICODE_ATTR; +static void coldfire_lshift_samples(int n, int shift, int32_t *samples) +{ +/* + for (i = 0; i < n; i++) + samples[i] <<= shift; +*/ + asm volatile ( + "move.l %[n], %%d0 \n" /* d0 = loop counter */ + "asr.l #2, %%d0 \n" + "beq l1_shift \n" + "l2_shift:" /* main loop (unroll by 4) */ + "movem.l (%[x]), %%d4-%%d7 \n" + "asl.l %[s], %%d4 \n" + "asl.l %[s], %%d5 \n" + "asl.l %[s], %%d6 \n" + "asl.l %[s], %%d7 \n" + "movem.l %%d4-%%d7, (%[x]) \n" + "add.l #16, %[x] \n" + + "subq.l #1, %%d0 \n" + "bne l2_shift \n" + "l1_shift:" /* any loops left? */ + "and.l #3, %[n] \n" + "beq l4_shift \n" + "l3_shift:" /* remaining loops */ + "move.l (%[x]), %%d4 \n" + "asl.l %[s], %%d4 \n" + "move.l %%d4, (%[x])+ \n" + + "subq.l #1, %[n] \n" + "bne l3_shift \n" + "l4_shift:" /* exit */ + : [n] "+d" (n), /* d1 */ + [s] "+d" (shift), /* d2 */ + [x] "+a" (samples) /* a0 */ + : + : "%d0", "%d4", "%d5", "%d6", "%d7" + ); +} +#endif + +static inline void fix_bitshift(ShortenContext *s, int32_t *samples) +{ + int i; + + /* Wrapped samples don't get bitshifted, so we'll do them during + the next iteration. 
*/ + if (s->bitshift != 0) { +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) + coldfire_lshift_samples(s->blocksize, s->bitshift, samples - s->nwrap); +#else + for (i = -s->nwrap; i < (s->blocksize - s->nwrap); i++) + samples[i] <<= s->bitshift; +#endif + } + + /* Also, we have to remember to fix the wrapped samples when + the bitshift changes. */ + if (s->bitshift != s->last_bitshift) { + if (s->last_bitshift != 0) + for (i = -s->nwrap; i < 0; i++) + samples[i] <<= s->last_bitshift; + + s->last_bitshift = s->bitshift; + } +} + +static inline void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, + int residual_size, int pred_order) { int sum, i, j; int coeffs[MAX_PRED_ORDER]; @@ -121,18 +171,12 @@ static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, } } -int shorten_decode_frame(ShortenContext *s, - int32_t *decoded, - int32_t *offset, - uint8_t *buf, - int buf_size) +static inline int shorten_decode_frame(ShortenContext *s, int32_t *decoded, + int32_t *offset) { int i; int32_t sum; - init_get_bits(&s->gb, buf, buf_size*8); - get_bits(&s->gb, s->bitindex); - int cmd = get_ur_golomb_shorten(&s->gb, FNSIZE); switch (cmd) { case FN_ZERO: @@ -201,10 +245,6 @@ int shorten_decode_frame(ShortenContext *s, case FN_QLPC: { int pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE); - if (pred_order > MAX_PRED_ORDER) { - return -2; - } - for (i=0; inwrap; i<0; i++) - decoded[i] = decoded[i + s->blocksize]; - - int scale = s->bitshift + SHN_OUTPUT_DEPTH - s->bits_per_sample; - for (i = 0; i < s->blocksize; i++) - decoded[i] <<= scale; + fix_bitshift(s, decoded); break; } @@ -244,29 +279,88 @@ int shorten_decode_frame(ShortenContext *s, i = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE); while (i--) get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE); - return 4; break; case FN_BITSHIFT: s->bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE); - return 3; break; case FN_BLOCKSIZE: s->blocksize = get_uint(s, av_log2(s->blocksize)); - return 2; 
break; case FN_QUIT: - return 1; break; default: - return -1; + return FN_ERROR; break; } - return 0; + return cmd; +} + +int shorten_decode_frames(ShortenContext *s, int *nsamples, + int32_t *decoded0, int32_t *decoded1, + int32_t *offset0, int32_t *offset1, + uint8_t *buf, int buf_size, + void (*yield)(void)) +{ + int32_t *decoded, *offset; + int cmd; + + *nsamples = 0; + + init_get_bits(&s->gb, buf, buf_size*8); + get_bits(&s->gb, s->bitindex); + + int n = 0; + while (n < NUM_DEC_LOOPS) { + int chan = n%2; + if (chan == 0) { + decoded = decoded0 + s->nwrap + *nsamples; + offset = offset0; + } else { + decoded = decoded1 + s->nwrap + *nsamples; + offset = offset1; + } + + yield(); + + cmd = shorten_decode_frame(s, decoded, offset); + + if (cmd == FN_VERBATIM || cmd == FN_BITSHIFT || cmd == FN_BLOCKSIZE) { + continue; + } else if (cmd == FN_QUIT || cmd == FN_ERROR) { + break; + } + + *nsamples += chan * s->blocksize; + n++; + } + + if (*nsamples) { + /* Wrap the samples for the next loop */ + int i; + for (i = 0; i < s->nwrap; i++) { + decoded0[i] = decoded0[*nsamples + i]; + decoded1[i] = decoded1[*nsamples + i]; + } + + /* Scale the samples for the pcmbuf */ + int scale = SHN_OUTPUT_DEPTH - s->bits_per_sample; +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) + coldfire_lshift_samples(*nsamples, scale, decoded0 + s->nwrap); + coldfire_lshift_samples(*nsamples, scale, decoded1 + s->nwrap); +#else + for (i = 0; i < *nsamples; i++) { + decoded0[i + s->nwrap] <<= scale; + decoded1[i + s->nwrap] <<= scale; + } +#endif + } + + return cmd; } static int decode_wave_header(ShortenContext *s, diff --git a/apps/codecs/libffmpegFLAC/shndec.h b/apps/codecs/libffmpegFLAC/shndec.h index 6b830dcaf..713a5b56d 100644 --- a/apps/codecs/libffmpegFLAC/shndec.h +++ b/apps/codecs/libffmpegFLAC/shndec.h @@ -1,11 +1,31 @@ #include "bitstream.h" #define SHN_OUTPUT_DEPTH 28 + +#define MAX_CHANNELS 2 +#define MAX_PRED_ORDER 16 +#define MAX_NWRAP MAX_PRED_ORDER +#define MAX_NMEAN 4 + +/* 
NUM_DEC_LOOPS should be an even number */ +#define NUM_DEC_LOOPS 26 #define DEFAULT_BLOCK_SIZE 256 -#define MAX_FRAMESIZE 1024 -#define MAX_CHANNELS 2 -#define MAX_NWRAP 3 -#define MAX_NMEAN 4 +#define MAX_HEADER_SIZE DEFAULT_BLOCK_SIZE*4 +#define MAX_BUFFER_SIZE 2*DEFAULT_BLOCK_SIZE*NUM_DEC_LOOPS +#define MAX_DECODE_SIZE ((DEFAULT_BLOCK_SIZE*NUM_DEC_LOOPS/2) + MAX_NWRAP) +#define MAX_OFFSET_SIZE MAX_NMEAN + +#define FN_DIFF0 0 +#define FN_DIFF1 1 +#define FN_DIFF2 2 +#define FN_DIFF3 3 +#define FN_QUIT 4 +#define FN_BLOCKSIZE 5 +#define FN_BITSHIFT 6 +#define FN_QLPC 7 +#define FN_ZERO 8 +#define FN_VERBATIM 9 +#define FN_ERROR 10 typedef struct ShortenContext { GetBitContext gb; @@ -17,20 +37,16 @@ typedef struct ShortenContext { int bits_per_sample; int version; int bitshift; + int last_bitshift; int nmean; int nwrap; int blocksize; int bitindex; -/* Not needed... - int bit_rate; - int block_align; - int chunk_size; -*/ } ShortenContext; int shorten_init(ShortenContext* s, uint8_t *buf, int buf_size); -int shorten_decode_frame(ShortenContext *s, - int32_t *decoded, - int32_t *offset, - uint8_t *buf, - int buf_size) ICODE_ATTR; +int shorten_decode_frames(ShortenContext *s, int *nsamples, + int32_t *decoded0, int32_t *decoded1, + int32_t *offset0, int32_t *offset1, + uint8_t *buf, int buf_size, + void (*yield)(void)) ICODE_ATTR; diff --git a/apps/codecs/shorten.c b/apps/codecs/shorten.c index 290686e96..3edc14377 100644 --- a/apps/codecs/shorten.c +++ b/apps/codecs/shorten.c @@ -33,13 +33,13 @@ extern char iend[]; struct codec_api* rb; struct codec_api* ci; -#define MAX_DECODED (DEFAULT_BLOCK_SIZE + MAX_NWRAP) -int32_t decoded0[MAX_DECODED] IBSS_ATTR; -int32_t decoded1[MAX_DECODED] IBSS_ATTR; +int32_t decoded0[MAX_DECODE_SIZE] IBSS_ATTR; +int32_t decoded1[MAX_DECODE_SIZE] IBSS_ATTR; -#define MAX_OFFSETS MAX_NMEAN -int32_t offset0[MAX_OFFSETS] IBSS_ATTR; -int32_t offset1[MAX_OFFSETS] IBSS_ATTR; +int32_t offset0[MAX_OFFSET_SIZE] IBSS_ATTR; +int32_t 
offset1[MAX_OFFSET_SIZE] IBSS_ATTR; + +int8_t ibuf[MAX_BUFFER_SIZE] IBSS_ATTR; /* this is the codec entry point */ enum codec_status codec_start(struct codec_api* api) @@ -48,9 +48,8 @@ enum codec_status codec_start(struct codec_api* api) uint32_t samplesdone; uint32_t elapsedtime; int8_t *buf; - int cur_chan, consumed, res; + int consumed, res, nsamples; long bytesleft; - int retval; /* Generic codec initialisation */ rb = api; @@ -72,9 +71,8 @@ enum codec_status codec_start(struct codec_api* api) next_track: /* Codec initialization */ if (codec_init(api)) { - LOGF("Shorten: Error initialising codec\n"); - retval = CODEC_ERROR; - goto exit; + LOGF("Shorten: codec_init error\n"); + return CODEC_ERROR; } while (!*ci->taginfo_ready) @@ -90,12 +88,11 @@ next_track: } /* Read the shorten & wave headers */ - buf = ci->request_buffer(&bytesleft, MAX_FRAMESIZE); + buf = ci->request_buffer(&bytesleft, MAX_HEADER_SIZE); res = shorten_init(&sc, (unsigned char *)buf, bytesleft); if (res < 0) { - LOGF("shorten_init error: %d\n", res); - retval = CODEC_ERROR; - goto exit; + LOGF("Shorten: shorten_init error: %d\n", res); + return CODEC_ERROR; } ci->id3->frequency = sc.sample_rate; @@ -117,14 +114,13 @@ next_track: seek_start: /* The main decoding loop */ - ci->memset(&decoded0, 0, sizeof(int32_t)*MAX_DECODED); - ci->memset(&decoded1, 0, sizeof(int32_t)*MAX_DECODED); - ci->memset(&offset0, 0, sizeof(int32_t)*MAX_OFFSETS); - ci->memset(&offset1, 0, sizeof(int32_t)*MAX_OFFSETS); + ci->memset(&decoded0, 0, sizeof(int32_t)*MAX_DECODE_SIZE); + ci->memset(&decoded1, 0, sizeof(int32_t)*MAX_DECODE_SIZE); + ci->memset(&offset0, 0, sizeof(int32_t)*MAX_OFFSET_SIZE); + ci->memset(&offset1, 0, sizeof(int32_t)*MAX_OFFSET_SIZE); - cur_chan = 0; samplesdone = 0; - buf = ci->request_buffer(&bytesleft, MAX_FRAMESIZE); + buf = ci->request_buffer(&bytesleft, MAX_BUFFER_SIZE); while (bytesleft) { ci->yield(); if (ci->stop_codec || ci->reload_codec) { @@ -143,51 +139,43 @@ seek_start: } /* Decode a 
frame */ - ci->yield(); - if (cur_chan == 0) { - res = shorten_decode_frame(&sc, decoded0 + sc.nwrap, offset0, - (unsigned char *)buf, bytesleft); + ci->memcpy(ibuf, buf, bytesleft); /* copy buf to iram */ + res = shorten_decode_frames(&sc, &nsamples, decoded0, decoded1, + offset0, offset1, (unsigned char *)ibuf, + bytesleft, ci->yield); + + if (res == FN_ERROR) { + LOGF("Shorten: shorten_decode_frames error (%d)\n", samplesdone); + return CODEC_ERROR; } else { - res = shorten_decode_frame(&sc, decoded1 + sc.nwrap, offset1, - (unsigned char *)buf, bytesleft); - } - cur_chan++; - - if (res == 0 && cur_chan == sc.channels) { - cur_chan = 0; - /* Insert decoded samples in pcmbuf */ - ci->yield(); - while (!ci->pcmbuf_insert_split((char*)(decoded0 + sc.nwrap), - (char*)(decoded1 + sc.nwrap), sc.blocksize*4)) { + if (nsamples) { ci->yield(); + while (!ci->pcmbuf_insert_split((char*)(decoded0 + sc.nwrap), + (char*)(decoded1 + sc.nwrap), + 4*nsamples)) { + ci->yield(); + } + + /* Update the elapsed-time indicator */ + samplesdone += nsamples; + elapsedtime = (samplesdone*10) / (sc.sample_rate/100); + ci->set_elapsed(elapsedtime); } - /* Update the elapsed-time indicator */ - samplesdone += sc.blocksize; - elapsedtime = (samplesdone*10) / (sc.sample_rate/100); - ci->set_elapsed(elapsedtime); - } else if (res == 1) { /* End of shorten stream...go to next track */ - break; - } else if (res < 0) { - LOGF("shorten_decode_frame error: \n", res); - retval = CODEC_ERROR; - goto exit; + if (res == FN_QUIT) + break; } consumed = sc.gb.index/8; ci->advance_buffer(consumed); + buf = ci->request_buffer(&bytesleft, MAX_BUFFER_SIZE); sc.bitindex = sc.gb.index - 8*consumed; - buf = ci->request_buffer(&bytesleft, MAX_FRAMESIZE); } - LOGF("Shorten: Decoded %d samples\n", samplesdone); - if (ci->request_next_track()) goto next_track; - retval = CODEC_OK; -exit: - return retval; + return CODEC_OK; } -- 2.11.4.GIT