From d236065b3690e8c7d9508a1483f3353d188d0786 Mon Sep 17 00:00:00 2001 From: jethead71 Date: Sun, 2 May 2010 02:44:45 +0000 Subject: [PATCH] Do some SPC codec optimizing for ARMv6 (as a training exercise), tweak realtime BRR for all CPU that use it, add Gaussian ASM optimization for all ARM that can use it. Add some LIKELY/UNLIKELY branch hints. On Gigabeat-S gives +22% speedup. For Gigabeat F, about +5% speedup. For less-powerful players, no real change aside possibly from branch hints. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25771 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libspc/spc_codec.h | 32 ++- apps/codecs/libspc/spc_dsp.c | 510 ++++++++++++++++++++++++++++++++++------- 2 files changed, 452 insertions(+), 90 deletions(-) diff --git a/apps/codecs/libspc/spc_codec.h b/apps/codecs/libspc/spc_codec.h index cf72f90af..95d09fa09 100644 --- a/apps/codecs/libspc/spc_codec.h +++ b/apps/codecs/libspc/spc_codec.h @@ -37,6 +37,10 @@ /** Basic configuration options **/ +#ifndef ARM_ARCH +#define ARM_ARCH 0 +#endif + #define SPC_DUAL_CORE 1 #if !defined(SPC_DUAL_CORE) || NUM_CORES == 1 @@ -293,6 +297,15 @@ enum FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) }; #elif defined (CPU_ARM) +#if ARM_ARCH >= 6 +enum +{ + FIR_BUF_CNT = FIR_BUF_HALF * 2, + FIR_BUF_SIZE = FIR_BUF_CNT * sizeof ( int32_t ), + FIR_BUF_ALIGN = FIR_BUF_SIZE, + FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1)) +}; +#else enum { FIR_BUF_CNT = FIR_BUF_HALF * 2 * 2, @@ -300,6 +313,7 @@ enum FIR_BUF_ALIGN = FIR_BUF_SIZE, FIR_BUF_MASK = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) }; +#endif /* ARM_ARCH */ #endif /* CPU_* */ struct Spc_Dsp @@ -318,7 +332,8 @@ struct Spc_Dsp uint16_t noise; /* also read as int16_t */ #if defined(CPU_COLDFIRE) - /* circularly hardware masked address */ + /* FIR history is interleaved. Hardware handles wrapping by mask. 
+ * |LR|LR|LR|LR|LR|LR|LR|LR| */ int32_t *fir_ptr; /* wrapped address just behind current position - allows mac.w to increment and mask fir_ptr */ @@ -328,9 +343,22 @@ struct Spc_Dsp #elif defined (CPU_ARM) /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ int32_t *fir_ptr; +#if ARM_ARCH >= 6 + /* FIR history is interleaved with guard to eliminate wrap checking + * when convolving. + * |LR|LR|LR|LR|LR|LR|LR|LR|--|--|--|--|--|--|--|--| */ + /* copy of echo FIR constants as int16_t, loaded as int32 for + * halfword, packed multiplies */ + int16_t fir_coeff [VOICE_COUNT]; +#else + /* FIR history is interleaved with guard to eliminate wrap checking + * when convolving. + * |LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|... + * |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--| */ /* copy of echo FIR constants as int32_t, for faster access */ int32_t fir_coeff [VOICE_COUNT]; -#else +#endif /* ARM_ARCH */ +#else /* Unoptimized CPU */ /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */ int fir_pos; /* (0 to 7) */ int fir_buf [FIR_BUF_HALF * 2] [2]; diff --git a/apps/codecs/libspc/spc_dsp.c b/apps/codecs/libspc/spc_dsp.c index 5ea651478..0d07e5f04 100644 --- a/apps/codecs/libspc/spc_dsp.c +++ b/apps/codecs/libspc/spc_dsp.c @@ -57,6 +57,16 @@ void DSP_write( struct Spc_Dsp* this, int i, int data ) } } +#if ARM_ARCH >= 6 +/* if ( n < -32768 ) out = -32768; */ +/* if ( n > 32767 ) out = 32767; */ +#define CLAMP16( n ) \ + ({ \ + asm ("ssat %0, #16, %1" \ + : "=r" ( n ) : "r"( n ) ); \ + n; \ + }) +#else /* if ( n < -32768 ) out = -32768; */ /* if ( n > 32767 ) out = 32767; */ #define CLAMP16( n ) \ @@ -65,6 +75,7 @@ void DSP_write( struct Spc_Dsp* this, int i, int data ) n = 0x7FFF ^ (n >> 31); \ n; \ }) +#endif #if SPC_BRRCACHE static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, @@ -418,7 +429,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) /* Key on events are delayed */ int key_on_delay =
voice->key_on_delay; - if ( --key_on_delay >= 0 ) /* <1% of the time */ + if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */ { key_on(this,voice,sd,raw_voice,key_on_delay,vbit); } @@ -438,13 +449,13 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) int env_mode = voice->env_mode; int adsr0 = raw_voice->adsr [0]; int env_timer; - if ( env_mode != state_release ) /* 99% of the time */ + if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */ { env_timer = voice->env_timer; - if ( adsr0 & 0x80 ) /* 79% of the time */ + if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */ { int adsr1 = raw_voice->adsr [1]; - if ( env_mode == state_sustain ) /* 74% of the time */ + if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */ { if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 ) goto write_env_timer; @@ -607,25 +618,12 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) goto skip_decode; } } - + /* header */ int const block_header = *addr; addr += 9; voice->addr = addr; voice->block_header = block_header; - int const filter = (block_header & 0x0C) - 0x08; - - /* scaling (invalid scaling gives -4096 for neg nybble, - 0 for pos) */ - static unsigned char const right_shifts [16] = { - 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29, 29, 29, - }; - static unsigned char const left_shifts [16] = { - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11 - }; - int const scale = block_header >> 4; - int const right_shift = right_shifts [scale]; - int const left_shift = left_shifts [scale]; /* previous samples */ int smp2 = voice->samples [BRR_BLOCK_SIZE + 1]; @@ -650,54 +648,117 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) /* force sample to end on next decode */ voice->block_header = 1; } - - do /* decode and filter 16 samples */ + + int const filter = block_header & 0x0c; + int const scale = block_header >> 4; + + if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */ { - /* Get nybble, 
sign-extend, then scale - get byte, select which nybble, sign-extend, then shift - based on scaling. also handles invalid scaling values.*/ - int delta = (int) (int8_t) (addr [offset >> 3] << - (offset & 4)) >> right_shift << left_shift; - - out [offset >> 2] = smp2; - - if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */ + /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */ + do /* decode and filter 16 samples */ { + /* Get nybble, sign-extend, then scale + get byte, select which nybble, sign-extend, then shift + based on scaling. */ + int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; + delta = (delta << scale) >> 1; + + if (scale > 0xc) + delta = (delta >> 17) << 11; + + out [offset >> 2] = smp2; + delta -= smp2 >> 1; delta += smp2 >> 5; - smp2 = smp1; delta += smp1; delta += (-smp1 - (smp1 >> 1)) >> 5; + + delta = CLAMP16( delta ); + smp2 = smp1; + smp1 = (int16_t) (delta * 2); /* sign-extend */ } - else + while ( (offset += 4) != 0 ); + } + else if ( filter == 0x04 ) /* filter 1 */ + { + /* y[n] = x[n] + 15/16 * y[n-1] */ + do /* decode and filter 16 samples */ { - if ( filter == -4 ) /* mode 0x04 */ - { - delta += smp1 >> 1; - delta += (-smp1) >> 5; - } - else if ( filter > -4 ) /* mode 0x0C */ - { - delta -= smp2 >> 1; - delta += (smp2 + (smp2 >> 1)) >> 4; - delta += smp1; - delta += (-smp1 * 13) >> 7; - } + /* Get nybble, sign-extend, then scale + get byte, select which nybble, sign-extend, then shift + based on scaling. 
*/ + int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; + delta = (delta << scale) >> 1; + + if (scale > 0xc) + delta = (delta >> 17) << 11; + + out [offset >> 2] = smp2; + + delta += smp1 >> 1; + delta += (-smp1) >> 5; + + delta = CLAMP16( delta ); smp2 = smp1; + smp1 = (int16_t) (delta * 2); /* sign-extend */ } - - delta = CLAMP16( delta ); - smp1 = (int16_t) (delta * 2); /* sign-extend */ + while ( (offset += 4) != 0 ); } - while ( (offset += 4) != 0 ); - + else if ( filter == 0x0c ) /* filter 3 */ + { + /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */ + do /* decode and filter 16 samples */ + { + /* Get nybble, sign-extend, then scale + get byte, select which nybble, sign-extend, then shift + based on scaling. */ + int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; + delta = (delta << scale) >> 1; + + if (scale > 0xc) + delta = (delta >> 17) << 11; + + out [offset >> 2] = smp2; + + delta -= smp2 >> 1; + delta += (smp2 + (smp2 >> 1)) >> 4; + delta += smp1; + delta += (-smp1 * 13) >> 7; + + delta = CLAMP16( delta ); + smp2 = smp1; + smp1 = (int16_t) (delta * 2); /* sign-extend */ + } + while ( (offset += 4) != 0 ); + } + else /* filter 0 */ + { + /* y[n] = x[n] */ + do /* decode and filter 16 samples */ + { + /* Get nybble, sign-extend, then scale + get byte, select which nybble, sign-extend, then shift + based on scaling. 
*/ + int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4; + delta = (delta << scale) >> 1; + + if (scale > 0xc) + delta = (delta >> 17) << 11; + + out [offset >> 2] = smp2; + + smp2 = smp1; + smp1 = delta * 2; + } + while ( (offset += 4) != 0 ); + } + out [0] = smp2; out [1] = smp1; skip_decode:; } - #endif - + #endif /* !SPC_BRRCACHE */ /* Get rate (with possible modulation) */ int rate = VOICE_RATE(vr); if ( this->r.g.pitch_mods & vbit ) @@ -754,13 +815,87 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) /* Use faster gaussian interpolation when exact result isn't needed by pitch modulator of next channel */ - int amp_0, amp_1; - if ( !(slow_gaussian & vbit) ) /* 99% of the time */ + int amp_0, amp_1; /* Also serve as temps _0, and _1 */ + if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */ { /* Main optimization is lack of clamping. Not a problem since output never goes more than +/- 16 outside 16-bit range and things are clamped later anyway. Other optimization is to preserve fractional accuracy, eliminating several masks. */ + #if defined (CPU_ARM) + int output; + int _2, _3; /* All-purpose temps */ + /* Multiple ASM blocks keep regs free and reduce result + * latency issues. */ + #if ARM_ARCH >= 6 + /* Interpolate */ + asm volatile ( + "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */ + "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */ + "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */ + "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */ + "smuad %[out], %[_0], %[_2] \r\n" /* out=f0*i0 + f1*i1 */ + "smladx %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */ + : [out]"=&r"(output), + [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), + [_2]"=&r"(_2), [_3]"=&r"(_3) + : [fwd]"r"(fwd), [rev]"r"(rev), + [interp]"r"(interp)); + /* Apply voice envelope */ + asm volatile ( + "mov %[_2], %[out], asr #(11-5) \r\n" /* To do >> 16 later */ + "mul %[out], %[_2], %[envx] \r\n" /* and avoid exp. 
shift */ + : [out]"+r"(output), [_2]"=&r"(_2) + : [envx]"r"((int)voice->envx)); + /* Apply left and right volume */ + asm volatile ( + "smulwb %[amp_0], %[out], %[vvol_0] \r\n" /* (32x16->48)[47:16]->[31:0] */ + "smulwb %[amp_1], %[out], %[vvol_1] \r\n" + : [out]"+r"(output), + [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) + : [vvol_0]"r"(voice->volume[0]), + [vvol_1]"r"(voice->volume[1])); + + raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */ + #else /* ARM_ARCH < 6 */ + /* Perform gaussian interpolation on four samples */ + asm volatile ( + "ldrsh %[_0], [%[interp]] \r\n" + "ldrsh %[_2], [%[fwd]] \r\n" + "ldrsh %[_1], [%[interp], #2] \r\n" + "ldrsh %[_3], [%[fwd], #2] \r\n" + "mul %[out], %[_0], %[_2] \r\n" /* out= fwd[0]*interp[0] */ + "ldrsh %[_0], [%[interp], #4] \r\n" + "ldrsh %[_2], [%[rev], #2] \r\n" + "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */ + "ldrsh %[_1], [%[interp], #6] \r\n" + "ldrsh %[_3], [%[rev]] \r\n" + "mla %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */ + "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */ + : [out]"=&r"(output), + [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), + [_2]"=&r"(_2), [_3]"=&r"(_3) + : [fwd]"r"(fwd), [rev]"r"(rev), + [interp]"r"(interp)); + /* Apply voice envelope */ + asm volatile ( + "mov %[_2], %[out], asr #11 \r\n" + "mul %[out], %[_2], %[envx] \r\n" + : [out]"+r"(output), [_2]"=&r"(_2) + : [envx]"r"((int)voice->envx)); + /* Reduce and apply left and right volume */ + asm volatile ( + "mov %[out], %[out], asr #11 \r\n" + "mul %[amp_0], %[out], %[vvol_0] \r\n" + "mul %[amp_1], %[out], %[vvol_1] \r\n" + : [out]"+r"(output), + [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) + : [vvol_0]"r"((int)voice->volume[0]), + [vvol_1]"r"((int)voice->volume[1])); + + raw_voice->outx = output >> 8; + #endif /* ARM_ARCH */ + #else /* Unoptimized CPU */ int output = (((fwd [0] * interp [0] + fwd [1] * interp [1] + rev [1] * interp [2] + @@ -769,11 +904,121 @@ void DSP_run_( struct 
Spc_Dsp* this, long count, int32_t* out_buf ) /* duplicated here to give compiler more to run in parallel */ amp_0 = voice->volume [0] * output; amp_1 = voice->volume [1] * output; + raw_voice->outx = output >> 8; + #endif /* CPU_* */ } - else + else /* slow gaussian */ { + #if defined(CPU_ARM) + #if ARM_ARCH >= 6 + int output = *(int16_t*) &this->noise; + + if ( !(this->r.g.noise_enables & vbit) ) + { + /* Interpolate */ + int _2, _3; + asm volatile ( + /* NOTE: often-unaligned accesses */ + "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */ + "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */ + "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */ + "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */ + "smulbb %[out], %[_0], %[_2] \r\n" /* out=f0*i0 */ + "smultt %[_0], %[_0], %[_2] \r\n" /* _0=f1*i1 */ + "smulbt %[_2], %[_1], %[_3] \r\n" /* _2=r1*i2 */ + "smultb %[_3], %[_1], %[_3] \r\n" /* _3=r0*i3 */ + : [out]"=r"(output), + [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), + [_2]"=&r"(_2), [_3]"=&r"(_3) + : [fwd]"r"(fwd), [rev]"r"(rev), + [interp]"r"(interp)); + asm volatile ( + "mov %[out], %[out], asr#12 \r\n" + "add %[_0], %[out], %[_0], asr #12 \r\n" + "add %[_2], %[_0], %[_2], asr #12 \r\n" + "pkhbt %[_0], %[_2], %[_3], asl #4 \r\n" /* _3[31:16], _2[15:0] */ + "sadd16 %[_0], %[_0], %[_0] \r\n" /* _3[31:16]*2, _2[15:0]*2 */ + "qsubaddx %[out], %[_0], %[_0] \r\n" /* out[15:0]= + * sat16(_3[31:16]+_2[15:0]) */ + : [out]"+r"(output), + [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3)); + } + /* Apply voice envelope */ + asm volatile ( + "smulbb %[out], %[out], %[envx] \r\n" + : [out]"+r"(output) + : [envx]"r"(voice->envx)); + /* Reduce and apply left and right volume */ + asm volatile ( + "mov %[out], %[out], asr #11 \r\n" + "bic %[out], %[out], #0x1 \r\n" + "mul %[amp_0], %[out], %[vvol_0] \r\n" + "mul %[amp_1], %[out], %[vvol_1] \r\n" + : [out]"+r"(output), + [amp_1]"=r"(amp_1), [amp_0]"=r"(amp_0) + : [vvol_0]"r"((int)voice->volume[0]), + [vvol_1]"r"((int)voice->volume[1])); + + prev_outx = output; +
raw_voice->outx = output >> 8; + #else /* ARM_ARCH < 6 */ + int output = *(int16_t*) &this->noise; + + if ( !(this->r.g.noise_enables & vbit) ) + { + /* Interpolate */ + int _2, _3; + asm volatile ( + "ldrsh %[_0], [%[interp]] \r\n" + "ldrsh %[_2], [%[fwd]] \r\n" + "ldrsh %[_1], [%[interp], #2] \r\n" + "ldrsh %[_3], [%[fwd], #2] \r\n" + "mul %[out], %[_2], %[_0] \r\n" /* fwd[0]*interp[0] */ + "ldrsh %[_2], [%[rev], #2] \r\n" + "mul %[_0], %[_3], %[_1] \r\n" /* fwd[1]*interp[1] */ + "ldrsh %[_1], [%[interp], #4] \r\n" + "mov %[out], %[out], asr #12 \r\n" + "ldrsh %[_3], [%[rev]] \r\n" + "mul %[_2], %[_1], %[_2] \r\n" /* rev[1]*interp[2] */ + "ldrsh %[_1], [%[interp], #6] \r\n" + "add %[_0], %[out], %[_0], asr #12 \r\n" + "mul %[_3], %[_1], %[_3] \r\n" /* rev[0]*interp[3] */ + "add %[_2], %[_0], %[_2], asr #12 \r\n" + "mov %[_2], %[_2], lsl #17 \r\n" + "mov %[_3], %[_3], asr #12 \r\n" + "mov %[_3], %[_3], asl #1 \r\n" + "add %[out], %[_3], %[_2], asr #16 \r\n" + : [out]"=r"(output), + [_0]"=&r"(amp_0), [_1]"=&r"(amp_1), + [_2]"=&r"(_2), [_3]"=&r"(_3) + : [fwd]"r"(fwd), [rev]"r"(rev), + [interp]"r"(interp)); + + output = CLAMP16(output); + } + /* Apply voice envelope */ + asm volatile ( + "mul %[_0], %[out], %[envx] \r\n" + : [_0]"=r"(amp_0) + : [out]"r"(output), [envx]"r"((int)voice->envx)); + /* Reduce and apply left and right volume */ + asm volatile ( + "mov %[out], %[amp_0], asr #11 \r\n" /* amp_0 = _0 */ + "bic %[out], %[out], #0x1 \r\n" + "mul %[amp_0], %[out], %[vvol_0] \r\n" + "mul %[amp_1], %[out], %[vvol_1] \r\n" + : [out]"+r"(output), [amp_0]"+r"(amp_0), + [amp_1]"=r"(amp_1) + : [vvol_0]"r"((int)voice->volume[0]), + [vvol_1]"r"((int)voice->volume[1])); + + prev_outx = output; + raw_voice->outx = output >> 8; + #endif /* ARM_ARCH >= 6 */ + #else /* Unoptimized CPU */ int output = *(int16_t*) &this->noise; + if ( !(this->r.g.noise_enables & vbit) ) { output = (fwd [0] * interp [0]) & ~0xFFF; @@ -788,8 +1033,10 @@ void DSP_run_( struct Spc_Dsp* this, long 
count, int32_t* out_buf ) /* duplicated here to give compiler more to run in parallel */ amp_0 = voice->volume [0] * output; amp_1 = voice->volume [1] * output; + prev_outx = output; - raw_voice->outx = (int8_t) (output >> 8); + raw_voice->outx = output >> 8; + #endif /* CPU_* */ } #else /* SPCNOINTERP */ /* two-point linear interpolation */ @@ -826,16 +1073,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "asr.l %[sh], %[y1] \r\n" "add.l %[y0], %[y1] \r\n" : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0) - : [s]"a"(voice->samples), [sh]"d"(12) - ); + : [s]"a"(voice->samples), [sh]"d"(12)); } /* apply voice envelope to output */ asm volatile ( - "mac.w %[output]l, %[envx]l, %%acc0 \r\n" + "mac.w %[out]l, %[envx]l, %%acc0 \r\n" : - : [output]"r"(amp_0), [envx]"r"(voice->envx) - ); + : [out]"r"(amp_0), [envx]"r"(voice->envx)); /* advance voice position */ voice->position += rate; @@ -843,15 +1088,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) /* fetch output, scale and apply left and right voice volume */ asm volatile ( - "movclr.l %%acc0, %[output] \r\n" - "asr.l %[sh], %[output] \r\n" - "mac.l %[vvol_0], %[output], %%acc0 \r\n" - "mac.l %[vvol_1], %[output], %%acc1 \r\n" - : [output]"=&d"(amp_0) + "movclr.l %%acc0, %[out] \r\n" + "asr.l %[sh], %[out] \r\n" + "mac.l %[vvol_0], %[out], %%acc0 \r\n" + "mac.l %[vvol_1], %[out], %%acc1 \r\n" + : [out]"=&d"(amp_0) : [vvol_0]"r"((int)voice->volume[0]), [vvol_1]"r"((int)voice->volume[1]), - [sh]"d"(11) - ); + [sh]"d"(11)); /* save this output into previous, scale and save in output register */ @@ -862,14 +1106,16 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) asm volatile ( "movclr.l %%acc0, %[amp_0] \r\n" "movclr.l %%acc1, %[amp_1] \r\n" - : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1) - ); + : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)); #elif defined (CPU_ARM) int amp_0, amp_1; - if ( (this->r.g.noise_enables & vbit) != 0 ) { + if ( (this->r.g.noise_enables 
& vbit) != 0 ) + { amp_0 = *(int16_t *)&this->noise; - } else { + } + else + { uint32_t f = voice->position; amp_0 = (uint32_t)voice->samples; @@ -882,8 +1128,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "sub %[y1], %[y1], %[y0] \r\n" "mul %[f], %[y1], %[f] \r\n" "add %[y0], %[y0], %[f], asr #12 \r\n" - : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1) - ); + : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)); } voice->position += rate; @@ -893,8 +1138,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "mov %[amp_0], %[amp_1], asr #11 \r\n" "mov %[amp_1], %[amp_0], asr #8 \r\n" : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1) - : [envx]"r"(voice->envx) - ); + : [envx]"r"(voice->envx)); prev_outx = amp_0; raw_voice->outx = (int8_t)amp_1; @@ -904,8 +1148,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) "mul %[amp_0], %[vol_0], %[amp_0] \r\n" : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) : [vol_0]"r"((int)voice->volume[0]), - [vol_1]"r"((int)voice->volume[1]) - ); + [vol_1]"r"((int)voice->volume[1])); #else /* Unoptimized CPU */ int output; @@ -1089,25 +1332,116 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) echo_pos = 0; this->echo_pos = echo_pos; + #if ARM_ARCH >= 6 + int32_t *fir_ptr, *fir_coeff; + int fb_0, fb_1; + + /* Apply FIR */ + fb_0 = *(uint32_t *)echo_ptr; + + /* Keep last 8 samples */ + asm volatile ( + "add %[fir_p], %[t_fir_p], #4 \r\n" + "bic %[t_fir_p], %[fir_p], %[mask] \r\n" + "str %[fb_0], [%[fir_p], #-4] \r\n" + /* duplicate at +8 eliminates wrap checking below */ + "str %[fb_0], [%[fir_p], #28] \r\n" + : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr) + : [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK)); + + fir_coeff = (int32_t *)this->fir_coeff; + + /* Fugly, but the best version found. 
*/ + int _0; + asm volatile ( /* L0R0 = acc0 */ + "ldmia %[fir_p]!, { r2-r5 } \r\n" /* L1R1-L4R4 = r2-r5 */ + "ldmia %[fir_c]!, { r0-r1 } \r\n" /* C0C1-C2C3 = r0-r1 */ + "pkhbt %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */ + "pkhtb r2, r2, %[acc0], asr #16 \r\n" + "smuad %[acc0], %[_0], r0 \r\n" /* acc0=L0*C0+L1*C1 */ + "smuad %[acc1], r2, r0 \r\n" /* acc1=R0*C0+R1*C1 */ + "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L2R2,L3R3->L2L3,R2R3 */ + "pkhtb r4, r4, r3, asr #16 \r\n" + "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3 */ + "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R2*C2+R3*C3 */ + "ldmia %[fir_p], { r2-r4 } \r\n" /* L5R5-L7R7 = r2-r4 */ + "ldmia %[fir_c], { r0-r1 } \r\n" /* C4C5-C6C7 = r0-r1 */ + "pkhbt %[_0], r5, r2, asl #16 \r\n" /* L4R4,L5R5->L4L5,R4R5 */ + "pkhtb r2, r2, r5, asr #16 \r\n" + "smlad %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5 */ + "smlad %[acc1], r2, r0, %[acc1] \r\n" /* acc1+=R4*C4+R5*C5 */ + "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L6R6,L7R7->L6L7,R6R7 */ + "pkhtb r4, r4, r3, asr #16 \r\n" + "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7 */ + "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R6*C6+R7*C7 */ + : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0), + [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) + : + : "r0", "r1", "r2", "r3", "r4", "r5"); + + /* Generate output */ + int amp_0, amp_1; + + asm volatile ( + "mul %[amp_0], %[gvol_0], %[chans_0] \r\n" + "mul %[amp_1], %[gvol_1], %[chans_1] \r\n" + : [amp_0]"=&r"(amp_0), [amp_1]"=&r"(amp_1) + : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1), + [chans_0]"r"(chans_0), [chans_1]"r"(chans_1)); + asm volatile ( + "mla %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n" + "mla %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n" + : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1) + : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), + [ev_0]"r"((int)this->r.g.echo_volume_0), + [ev_1]"r"((int)this->r.g.echo_volume_1)); + + out_buf [ 0] = amp_0 >> global_muting; + out_buf 
[WAV_CHUNK_SIZE] = amp_1 >> global_muting; + out_buf ++; + + if ( !(this->r.g.flags & 0x20) ) + { + /* Feedback into echo buffer */ + int e0, e1; + + asm volatile ( + "mov %[e0], %[echo_0], asl #7 \r\n" + "mov %[e1], %[echo_1], asl #7 \r\n" + "mla %[e0], %[fb_0], %[efb], %[e0] \r\n" + "mla %[e1], %[fb_1], %[efb], %[e1] \r\n" + : [e0]"=&r"(e0), [e1]"=&r"(e1) + : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1), + [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), + [efb]"r"((int)this->r.g.echo_feedback)); + asm volatile ( + "ssat %[e0], #16, %[e0], asr #14 \r\n" + "ssat %[e1], #16, %[e1], asr #14 \r\n" + "pkhbt %[e0], %[e0], %[e1], lsl #16 \r\n" + "str %[e0], [%[echo_p]] \r\n" + : [e0]"+r"(e0), [e1]"+r"(e1) + : [echo_p]"r"(echo_ptr)); + } + #else /* ARM_ARCH < 6 */ int fb_0 = GET_LE16SA( echo_ptr ); int fb_1 = GET_LE16SA( echo_ptr + 2 ); + int32_t *fir_ptr, *fir_coeff; /* Keep last 8 samples */ - int32_t *fir_ptr = this->fir_ptr; /* Apply FIR */ asm volatile ( - "str %[fb_0], [%[fir_p]], #4 \r\n" - "str %[fb_1], [%[fir_p]], #4 \r\n" + "add %[fir_p], %[t_fir_p], #8 \r\n" + "bic %[t_fir_p], %[fir_p], %[mask] \r\n" + "str %[fb_0], [%[fir_p], #-8] \r\n" + "str %[fb_1], [%[fir_p], #-4] \r\n" /* duplicate at +8 eliminates wrap checking below */ - "str %[fb_0], [%[fir_p], #56] \r\n" - "str %[fb_1], [%[fir_p], #60] \r\n" - : [fir_p]"+r"(fir_ptr) - : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1) - ); + "str %[fb_0], [%[fir_p], #56] \r\n" + "str %[fb_1], [%[fir_p], #60] \r\n" + : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr) + : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK)); - this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK); - int32_t *fir_coeff = this->fir_coeff; + fir_coeff = this->fir_coeff; asm volatile ( "ldmia %[fir_c]!, { r0-r1 } \r\n" @@ -1137,8 +1471,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1), [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff) : - : "r0", "r1", "r2", "r3", "r4", "r5" - ); + : "r0", "r1", "r2", 
"r3", "r4", "r5"); /* Generate output */ int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0) @@ -1160,6 +1493,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) e1 = CLAMP16( e1 ); SET_LE16A( echo_ptr + 2, e1 ); } + #endif /* ARM_ARCH */ #else /* Unoptimized CPU */ /* Read feedback from echo buffer */ int echo_pos = this->echo_pos; -- 2.11.4.GIT