From 00da21d789a975a3628de71fa79a0db15fdf12a7 Mon Sep 17 00:00:00 2001 From: Buschel Date: Sat, 14 Jun 2008 11:22:31 +0000 Subject: [PATCH] Reorder operands in mul/smull/mla/smlal to use D[] as second multiplication operand. Additionally, do not pre-scale D[] for the 64-bit precision synthesizer. This speeds up the 64-bit multiplication by 1.5MHz without loss of precision. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17719 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmusepack/synth_filter.c | 13 ++- apps/codecs/libmusepack/synth_filter_arm.S | 123 +++++++++++++++-------------- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c index ae9474136..9c8d27eed 100644 --- a/apps/codecs/libmusepack/synth_filter.c +++ b/apps/codecs/libmusepack/synth_filter.c @@ -54,9 +54,16 @@ // in this configuration a post-shift by >>1 is needed after synthesis #else - // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 - #define D(value) (value << (14)) - + #if defined(CPU_ARM) + // do not up-scale D-values to achieve higher speed in smull/mlal + // operations. saves ~14/8 = 1.75 cycles per multiplication + #define D(value) (value) + + // in this configuration a post-shift by >>16 is needed after synthesis + #else + // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 + #define D(value) (value << (14)) + #endif // do not perform pre-shift #define MPC_V_PRESHIFT(X) (X) #endif diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 7b2d2dfd2..8bc6bd3c5 100755 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S @@ -26,7 +26,8 @@ * * 2nd step within synthesis filter. Does the dewindowing. * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) - * Uses pre-shifted V[] and D[] values. + * Uses pre-shifted V[] and D[] values. 
D[] will always be the second operand + * of mul/mla to achieve higher speed as D[] has lower amplitude than V[]. ****************************************************************************/ #if defined(OPTIMIZE_FOR_SPEED) .align 2 @@ -42,40 +43,40 @@ mpc_decoder_windowing_D: mov lr, #32 .loop32: - ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ + ldmia r2!, { r3-r10 } /* load D[00..07] */ ldr r11, [r1] /* 0 */ - mul r12, r3, r11 + mul r12, r11, r3 ldr r11, [r1, #96*4] /* 1 */ - mla r12, r4, r11, r12 + mla r12, r11, r4, r12 ldr r11, [r1, #128*4] /* 2 */ - mla r12, r5, r11, r12 + mla r12, r11, r5, r12 ldr r11, [r1, #224*4] /* 3 */ - mla r12, r6, r11, r12 + mla r12, r11, r6, r12 ldr r11, [r1, #256*4] /* 4 */ - mla r12, r7, r11, r12 + mla r12, r11, r7, r12 ldr r11, [r1, #352*4] /* 5 */ - mla r12, r8, r11, r12 + mla r12, r11, r8, r12 ldr r11, [r1, #384*4] /* 6 */ - mla r12, r9, r11, r12 + mla r12, r11, r9, r12 ldr r11, [r1, #480*4] /* 7 */ - mla r12, r10, r11, r12 - ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ + mla r12, r11, r10, r12 + ldmia r2!, { r3-r10 } /* load D[08..15] */ ldr r11, [r1, #512*4] /* 8 */ - mla r12, r3, r11, r12 + mla r12, r11, r3, r12 ldr r11, [r1, #608*4] /* 9 */ - mla r12, r4, r11, r12 + mla r12, r11, r4, r12 ldr r11, [r1, #640*4] /* 10 */ - mla r12, r5, r11, r12 + mla r12, r11, r5, r12 ldr r11, [r1, #736*4] /* 11 */ - mla r12, r6, r11, r12 + mla r12, r11, r6, r12 ldr r11, [r1, #768*4] /* 12 */ - mla r12, r7, r11, r12 + mla r12, r11, r7, r12 ldr r11, [r1, #864*4] /* 13 */ - mla r12, r8, r11, r12 + mla r12, r11, r8, r12 ldr r11, [r1, #896*4] /* 14 */ - mla r12, r9, r11, r12 + mla r12, r11, r9, r12 ldr r11, [r1, #992*4] /* 15 */ - mla r12, r10, r11, r12 + mla r12, r11, r10, r12 mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ str r12, [r0], #4 /* store Data */ add r1, r1, #4 /* V++ */ @@ -92,9 +93,8 @@ mpc_decoder_windowing_D: * * 2nd step within synthesis filter. Does the dewindowing. 
* 64=32x32 multiplies - * Drops lo-part of 64bit multiply results and will therefor loose 1 bit - * accuracy. The decoder output is binary identical as this imprecision is - * far below the output's 16bit resolution. + * Uses un-shifted D[]-values. D[] will always be the second operand of + * smull/smlal to achieve higher speed as D[] has lower amplitude than V[]. ****************************************************************************/ .align 2 .global mpc_decoder_windowing_D @@ -105,52 +105,55 @@ mpc_decoder_windowing_D: /* r2 = D[] */ /* lr = counter */ - stmfd sp!, {r4-r12, lr} + stmfd sp!, {r4-r9, lr} mov lr, #32 .loop32: - ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ - ldr r11, [r1] /* 0 */ - smull r11, r12, r3, r11 - ldr r11, [r1, #96*4] /* 1 */ - smlal r11, r12, r4, r11 - ldr r11, [r1, #128*4] /* 2 */ - smlal r11, r12, r5, r11 - ldr r11, [r1, #224*4] /* 3 */ - smlal r11, r12, r6, r11 - ldr r11, [r1, #256*4] /* 4 */ - smlal r11, r12, r7, r11 - ldr r11, [r1, #352*4] /* 5 */ - smlal r11, r12, r8, r11 - ldr r11, [r1, #384*4] /* 6 */ - smlal r11, r12, r9, r11 - ldr r11, [r1, #480*4] /* 7 */ - smlal r11, r12, r10, r11 - ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ - ldr r11, [r1, #512*4] /* 8 */ - smlal r11, r12, r3, r11 - ldr r11, [r1, #608*4] /* 9 */ - smlal r11, r12, r4, r11 - ldr r11, [r1, #640*4] /* 10 */ - smlal r11, r12, r5, r11 - ldr r11, [r1, #736*4] /* 11 */ - smlal r11, r12, r6, r11 - ldr r11, [r1, #768*4] /* 12 */ - smlal r11, r12, r7, r11 - ldr r11, [r1, #864*4] /* 13 */ - smlal r11, r12, r8, r11 - ldr r11, [r1, #896*4] /* 14 */ - smlal r11, r12, r9, r11 - ldr r11, [r1, #992*4] /* 15 */ - smlal r11, r12, r10, r11 - mov r4, r12, lsl #2 /* get result from hi-part, loose 2 bits */ - str r4, [r0], #4 /* store Data */ + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldr r7, [r1] /* 0 */ + smull r8, r9, r7, r3 + ldr r7, [r1, #96*4] /* 1 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #128*4] /* 2 */ + smlal r8, r9, r7, r5 + ldr r7, 
[r1, #224*4] /* 3 */ + smlal r8, r9, r7, r6 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r7, [r1, #256*4] /* 4 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #352*4] /* 5 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #384*4] /* 6 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #480*4] /* 7 */ + smlal r8, r9, r7, r6 + ldmia r2!, { r3-r6 } /* load D[08..11] */ + ldr r7, [r1, #512*4] /* 8 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #608*4] /* 9 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #640*4] /* 10 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #736*4] /* 11 */ + smlal r8, r9, r7, r6 + ldmia r2!, { r3-r6 } /* load D[12..15] */ + ldr r7, [r1, #768*4] /* 12 */ + smlal r8, r9, r7, r3 + ldr r7, [r1, #864*4] /* 13 */ + smlal r8, r9, r7, r4 + ldr r7, [r1, #896*4] /* 14 */ + smlal r8, r9, r7, r5 + ldr r7, [r1, #992*4] /* 15 */ + smlal r8, r9, r7, r6 + mov r8, r8, lsr #16 + orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ + str r8, [r0], #4 /* store Data */ add r1, r1, #4 /* V++ */ subs lr, lr, #1 bgt .loop32 - ldmfd sp!, {r4-r12, pc} + ldmfd sp!, {r4-r9, pc} .mpc_dewindowing_end: .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D #endif -- 2.11.4.GIT