From ec6a9fc7ab246fa1caf9925ccd2660bf44c61400 Mon Sep 17 00:00:00 2001 From: amiconn Date: Tue, 2 Feb 2010 22:50:21 +0000 Subject: [PATCH] APE codec: Speed up decoding of -c2000 and higher on ARMv4 and coldfire by fusing vector math for the filters. Speedup is roughly 3.5% for -c2000, 8% for -c3000 and 12% for -c4000. To be extended to other architectures. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24473 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/filter.c | 28 ++ apps/codecs/demac/libdemac/vector_math16_cf.h | 388 ++++++++++++----------- apps/codecs/demac/libdemac/vector_math32_armv4.h | 194 ++++++------ apps/codecs/lib/udiv32_arm.S | 2 +- 4 files changed, 335 insertions(+), 277 deletions(-) diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 93edf39cb..ed6f3c8dc 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c @@ -134,6 +134,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, while(LIKELY(count--)) { +#ifdef FUSED_VECTOR_MATH + if (LIKELY(*data != 0)) { + if (*data < 0) + res = vector_sp_add(f->coeffs, f->delay - ORDER, + f->adaptcoeffs - ORDER); + else + res = vector_sp_sub(f->coeffs, f->delay - ORDER, + f->adaptcoeffs - ORDER); + } else { + res = scalarproduct(f->coeffs, f->delay - ORDER); + } + res = FP_TO_INT(res); +#else res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); if (LIKELY(*data != 0)) { @@ -142,6 +155,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, else vector_sub(f->coeffs, f->adaptcoeffs - ORDER); } +#endif res += *data; @@ -193,6 +207,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, while(LIKELY(count--)) { +#ifdef FUSED_VECTOR_MATH + if (LIKELY(*data != 0)) { + if (*data < 0) + res = vector_sp_add(f->coeffs, f->delay - ORDER, + f->adaptcoeffs - ORDER); + else + res = vector_sp_sub(f->coeffs, f->delay - ORDER, + f->adaptcoeffs - ORDER); + } else { + res = scalarproduct(f->coeffs, f->delay - ORDER); + } + res = FP_TO_INT(res); +#else res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER)); if (LIKELY(*data != 0)) { @@ -201,6 +228,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, else vector_sub(f->coeffs, f->adaptcoeffs - ORDER); } +#endif /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an integer (rounding to nearest) and add the input value to diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h index 11e7f07ad..6e8216c9c 100644 --- a/apps/codecs/demac/libdemac/vector_math16_cf.h +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h @@ -24,19 +24,27 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */ -/* This version fetches data as 32 bit words, and *recommends* v1 to be - * 32 bit aligned, otherwise performance will suffer. */ -static inline void vector_add(int16_t* v1, int16_t* v2) +#define FUSED_VECTOR_MATH + +#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ + +/* Calculate scalarproduct, then add a 2nd vector (fused for performance) + * This version fetches data as 32 bit words, and *recommends* v1 to be + * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit + * aligned or both unaligned. Performance will suffer if either condition + * isn't met. It also needs EMAC in signed integer mode. 
*/ +static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) { + int res; #if ORDER > 16 int cnt = ORDER>>4; #endif -#define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \ - "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \ - "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \ - "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \ - "add.l %%d4 , " #sum "\n" \ +#define ADDHALFREGS(s1, s2, sum) /* Add register halves straight. */ \ + "move.l " #s1 ", " #sum "\n" /* 's1' and 's2' can be A or D */ \ + "add.l " #s2 ", " #s1 "\n" /* regs, 'sum' must be a D reg. */ \ + "clr.w " #sum " \n" /* 's1' is clobbered! */ \ + "add.l " #s2 ", " #sum "\n" \ "move.w " #s1 ", " #sum "\n" #define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \ @@ -47,94 +55,115 @@ static inline void vector_add(int16_t* v1, int16_t* v2) "move.w " #s1 ", " #sum "\n" asm volatile ( - "move.l %[v2], %%d0 \n" - "and.l #2, %%d0 \n" - "jeq 20f \n" - - "10: \n" - "move.w (%[v2])+, %%d0 \n" - "swap %%d0 \n" - "1: \n" - "movem.l (%[v1]), %%a0-%%a3 \n" - "movem.l (%[v2]), %%d1-%%d4 \n" - ADDHALFXREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - ADDHALFXREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - ADDHALFXREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - ADDHALFXREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" - "lea.l (16, %[v2]), %[v2] \n" - "move.l %%d4, %%d0 \n" - - "movem.l (%[v1]), %%a0-%%a3 \n" - "movem.l (%[v2]), %%d1-%%d4 \n" - ADDHALFXREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - ADDHALFXREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - ADDHALFXREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - ADDHALFXREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" + "move.l %[f2], %%d0 \n" + "and.l #2, %%d0 \n" + "jeq 20f \n" + + "10: \n" + "move.w (%[f2])+, %%d0 \n" + "move.w (%[s2])+, %%d1 \n" + "swap %%d1 \n" + "1: \n" + ".rept 2 \n" + "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" + ADDHALFXREGS(%%d6, %%d2, %%d1) + "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n" + "move.l %%d1, (%[v1])+ \n" + ADDHALFXREGS(%%d7, %%d6, %%d2) + "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFXREGS(%%a0, %%d7, %%d6) + "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n" + "move.l %%d6, (%[v1])+ \n" + ADDHALFXREGS(%%a1, %%d1, %%d7) + "move.l %%d7, (%[v1])+ \n" + ".endr \n" + #if ORDER > 16 - "lea.l (16, %[v2]), %[v2] \n" - "move.l %%d4, %%d0 \n" + "subq.l #1, %[res] \n" + "bne.w 1b \n" +#endif + "jra 99f \n" - "subq.l #1, %[cnt] \n" - "jne 1b \n" + "20: \n" + "move.l (%[f2])+, %%d0 \n" + "1: \n" + "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" + ADDHALFREGS(%%d6, %%d1, %%d2) + "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%d7, %%d1, %%d2) + "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%a0, %%d1, %%d2) + "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%a1, %%d1, %%d2) + "move.l %%d2, (%[v1])+ \n" + + "movem.l (%[v1]), 
%%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" + ADDHALFREGS(%%d6, %%d1, %%d2) + "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%d7, %%d1, %%d2) + "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%a0, %%d1, %%d2) + "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" +#if ORDER > 16 + "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" +#else + "mac.w %%d0l, %%a1l, %%acc0 \n" #endif - "jra 99f \n" - - "20: \n" - "1: \n" - "movem.l (%[v2]), %%a0-%%a3 \n" - "movem.l (%[v1]), %%d0-%%d3 \n" - ADDHALFREGS(%%a0, %%d0) - "move.l %%d0, (%[v1])+ \n" - ADDHALFREGS(%%a1, %%d1) - "move.l %%d1, (%[v1])+ \n" - ADDHALFREGS(%%a2, %%d2) - "move.l %%d2, (%[v1])+ \n" - ADDHALFREGS(%%a3, %%d3) - "move.l %%d3, (%[v1])+ \n" - "lea.l (16, %[v2]), %[v2] \n" - - "movem.l (%[v2]), %%a0-%%a3 \n" - "movem.l (%[v1]), %%d0-%%d3 \n" - ADDHALFREGS(%%a0, %%d0) - "move.l %%d0, (%[v1])+ \n" - ADDHALFREGS(%%a1, %%d1) - "move.l %%d1, (%[v1])+ \n" - ADDHALFREGS(%%a2, %%d2) - "move.l %%d2, (%[v1])+ \n" - ADDHALFREGS(%%a3, %%d3) - "move.l %%d3, (%[v1])+ \n" + "move.l %%d2, (%[v1])+ \n" + ADDHALFREGS(%%a1, %%d1, %%d2) + "move.l %%d2, (%[v1])+ \n" #if ORDER > 16 - "lea.l (16, %[v2]), %[v2] \n" - - "subq.l #1, %[cnt] \n" - "jne 1b \n" + "subq.l #1, %[res] \n" + "bne.w 1b \n" #endif - "99: \n" + + "99: \n" + "movclr.l %%acc0, %[res] \n" : /* outputs */ + [v1]"+a"(v1), + [f2]"+a"(f2), + [s2]"+a"(s2), + [res]"=d"(res) + : /* inputs */ #if ORDER > 16 - [cnt]"+d"(cnt), + [cnt]"[res]"(cnt) #endif - [v1] "+a"(v1), - [v2] "+a"(v2) - : /* inputs */ : /* clobbers */ - "d0", "d1", "d2", "d3", "d4", - "a0", "a1", "a2", "a3", "memory" + "d0", "d1", "d2", "d6", "d7", + "a0", "a1", "memory" + ); + return res; } -/* This version fetches data as 32 bit words, and *recommends* v1 to be - * 32 bit aligned, otherwise performance will suffer. */ -static inline void vector_sub(int16_t* v1, int16_t* v2) +/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) + * This version fetches data as 32 bit words, and *recommends* v1 to be + * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit + * aligned or both unaligned. Performance will suffer if either condition + * isn't met. It also needs EMAC in signed integer mode. 
*/ +static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) { + int res; #if ORDER > 16 int cnt = ORDER>>4; #endif @@ -155,107 +184,116 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) "move.w " #min ", " #s1d "\n" asm volatile ( - "move.l %[v2], %%d0 \n" - "and.l #2, %%d0 \n" - "jeq 20f \n" - - "10: \n" - "move.w (%[v2])+, %%d0 \n" - "swap %%d0 \n" - "1: \n" - "movem.l (%[v2]), %%d1-%%d4 \n" - "movem.l (%[v1]), %%a0-%%a3 \n" - SUBHALFXREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - SUBHALFXREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - SUBHALFXREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - SUBHALFXREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" - "lea.l (16, %[v2]), %[v2] \n" - "move.l %%d4, %%d0 \n" - - "movem.l (%[v2]), %%d1-%%d4 \n" - "movem.l (%[v1]), %%a0-%%a3 \n" - SUBHALFXREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - SUBHALFXREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - SUBHALFXREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - SUBHALFXREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" + "move.l %[f2], %%d0 \n" + "and.l #2, %%d0 \n" + "jeq 20f \n" + + "10: \n" + "move.w (%[f2])+, %%d0 \n" + "move.w (%[s2])+, %%d1 \n" + "swap %%d1 \n" + "1: \n" + ".rept 2 \n" + "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n" + SUBHALFXREGS(%%d6, %%d2, %%d1) + "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n" + "move.l %%d1, (%[v1])+ \n" + SUBHALFXREGS(%%d7, %%d6, %%d2) + "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFXREGS(%%a0, %%d7, %%d6) + "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n" + "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n" + "move.l %%d6, (%[v1])+ \n" + SUBHALFXREGS(%%a1, %%d1, %%d7) + "move.l %%d7, (%[v1])+ \n" + ".endr \n" + #if ORDER > 16 - "lea.l (16, %[v2]), %[v2] \n" - "move.l %%d4, %%d0 \n" - - "subq.l #1, %[cnt] \n" - "bne.w 1b \n" + "subq.l #1, %[res] \n" + "bne.w 1b \n" #endif - "jra 99f \n" - - "20: \n" - "1: \n" - "movem.l (%[v2]), %%d1-%%d4 \n" - "movem.l (%[v1]), %%a0-%%a3 \n" - SUBHALFREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - SUBHALFREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - SUBHALFREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - SUBHALFREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" - "lea.l (16, %[v2]), %[v2] \n" - - "movem.l (%[v2]), %%d1-%%d4 \n" - "movem.l (%[v1]), %%a0-%%a3 \n" - SUBHALFREGS(%%a0, %%d1, %%d0) - "move.l %%d0, (%[v1])+ \n" - SUBHALFREGS(%%a1, %%d2, %%d1) - "move.l %%d1, (%[v1])+ \n" - SUBHALFREGS(%%a2, %%d3, %%d2) - "move.l %%d2, (%[v1])+ \n" - SUBHALFREGS(%%a3, %%d4, %%d3) - "move.l %%d3, (%[v1])+ \n" -#if ORDER > 16 - "lea.l (16, %[v2]), %[v2] \n" - "subq.l #1, %[cnt] \n" - "bne.w 1b \n" + "jra 99f \n" + + "20: \n" + "move.l (%[f2])+, %%d0 \n" + "1: \n" + "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" + SUBHALFREGS(%%d6, %%d1, %%d2) + "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%d7, %%d1, %%d2) + "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%a0, %%d1, %%d2) + "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, 
%%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%a1, %%d1, %%d2) + "move.l %%d2, (%[v1])+ \n" + + "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n" + "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n" + SUBHALFREGS(%%d6, %%d1, %%d2) + "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%d7, %%d1, %%d2) + "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n" + "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n" + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%a0, %%d1, %%d2) + "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n" +#if ORDER > 16 + "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n" +#else + "mac.w %%d0l, %%a1l, %%acc0 \n" +#endif + "move.l %%d2, (%[v1])+ \n" + SUBHALFREGS(%%a1, %%d1, %%d2) + "move.l %%d2, (%[v1])+ \n" +#if ORDER > 16 + "subq.l #1, %[res] \n" + "bne.w 1b \n" #endif - "99: \n" + "99: \n" + "movclr.l %%acc0, %[res] \n" : /* outputs */ + [v1]"+a"(v1), + [f2]"+a"(f2), + [s2]"+a"(s2), + [res]"=d"(res) + : /* inputs */ #if ORDER > 16 - [cnt]"+d"(cnt), + [cnt]"[res]"(cnt) #endif - [v1] "+a"(v1), - [v2] "+a"(v2) - : /* inputs */ : /* clobbers */ - "d0", "d1", "d2", "d3", "d4", - "a0", "a1", "a2", "a3", "memory" + "d0", "d1", "d2", "d6", "d7", + "a0", "a1", "memory" + ); + return res; } -#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ - /* This version fetches data as 32 bit words, and *recommends* v1 to be * 32 bit aligned, otherwise performance will suffer. It also needs EMAC - * in signed integer mode - call above macro before use. */ + * in signed integer mode. */ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) { int res; -#if ORDER > 32 - int cnt = ORDER>>5; -#endif - #if ORDER > 16 -#define MAC_BLOCKS "7" -#else -#define MAC_BLOCKS "3" + int cnt = ORDER>>4; #endif asm volatile ( @@ -267,20 +305,16 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "move.l (%[v1])+, %%d0 \n" "move.w (%[v2])+, %%d1 \n" "1: \n" - ".rept " MAC_BLOCKS "\n" - "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" - "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" + ".rept 7 \n" "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" ".endr \n" "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" - "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" - "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n" -#if ORDER > 32 +#if ORDER > 16 "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n" "subq.l #1, %[res] \n" - "bne.w 1b \n" + "bne.b 1b \n" #else "mac.w %%d0l, %%d1u, %%acc0 \n" #endif @@ -290,7 +324,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "move.l (%[v1])+, %%d0 \n" "move.l (%[v2])+, %%d1 \n" "1: \n" - ".rept " MAC_BLOCKS "\n" + ".rept 3 \n" "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" @@ -299,11 +333,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n" -#if ORDER > 32 +#if ORDER > 16 "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n" "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n" "subq.l #1, %[res] \n" - "bne.w 1b \n" + "bne.b 1b \n" #else "mac.w %%d2u, %%d1u, %%acc0 \n" "mac.w %%d2l, %%d1l, %%acc0 \n" @@ -316,7 +350,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) [v2]"+a"(v2), [res]"=d"(res) : /* inputs */ -#if ORDER > 32 +#if ORDER > 16 [cnt]"[res]"(cnt) #endif : /* clobbers 
*/ diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h index 89b24f2b0..207fca303 100644 --- a/apps/codecs/demac/libdemac/vector_math32_armv4.h +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h @@ -24,78 +24,134 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */ -static inline void vector_add(int32_t* v1, int32_t* v2) +#define FUSED_VECTOR_MATH + +#if ORDER > 32 +#define BLOCK_REPEAT "8" +#elif ORDER > 16 +#define BLOCK_REPEAT "7" +#else +#define BLOCK_REPEAT "3" +#endif + +/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */ +static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2) { + int res; #if ORDER > 32 int cnt = ORDER>>5; #endif -#if ORDER > 16 -#define ADD_SUB_BLOCKS "8" + asm volatile ( +#if ORDER > 32 + "mov %[res], #0 \n" + "1: \n" #else -#define ADD_SUB_BLOCKS "4" + "ldmia %[v1], {r0-r3} \n" + "ldmia %[f2]!, {r4-r7} \n" + "mul %[res], r4, r0 \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" + "mla %[res], r7, r3, %[res] \n" + "ldmia %[s2]!, {r4-r7} \n" + "add r0, r0, r4 \n" + "add r1, r1, r5 \n" + "add r2, r2, r6 \n" + "add r3, r3, r7 \n" + "stmia %[v1]!, {r0-r3} \n" #endif - - asm volatile ( - "1: \n" - ".rept " ADD_SUB_BLOCKS "\n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - "add r0, r0, r4 \n" - "add r1, r1, r5 \n" - "add r2, r2, r6 \n" - "add r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" - ".endr \n" + ".rept " BLOCK_REPEAT "\n" + "ldmia %[v1], {r0-r3} \n" + "ldmia %[f2]!, {r4-r7} \n" + "mla %[res], r4, r0, %[res] \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" + "mla %[res], r7, r3, %[res] \n" + "ldmia %[s2]!, {r4-r7} \n" + "add r0, r0, r4 \n" + "add r1, r1, r5 \n" + "add r2, r2, r6 \n" + "add r3, r3, r7 \n" + "stmia %[v1]!, {r0-r3} \n" + ".endr \n" #if ORDER > 32 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" #endif : /* outputs */ #if ORDER > 32 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), - [v2] "+r"(v2) + [f2] "+r"(f2), + [s2] "+r"(s2), + [res]"=r"(res) : /* inputs */ : /* clobbers */ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "memory" ); + return res; } -static inline void vector_sub(int32_t* v1, int32_t* v2) +/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */ +static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2) { + int res; #if ORDER > 32 int cnt = ORDER>>5; #endif asm volatile ( - "1: \n" - ".rept " ADD_SUB_BLOCKS "\n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - "sub r0, r0, r4 \n" - "sub r1, r1, r5 \n" - "sub r2, r2, r6 \n" - "sub r3, r3, r7 \n" - "stmia %[v1]!, {r0-r3} \n" - ".endr \n" #if ORDER > 32 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" + "mov %[res], #0 \n" + "1: \n" +#else + "ldmia %[v1], {r0-r3} \n" + "ldmia %[f2]!, {r4-r7} \n" + "mul %[res], r4, r0 \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" + "mla %[res], r7, r3, %[res] \n" + "ldmia %[s2]!, {r4-r7} \n" + "sub r0, r0, r4 \n" + "sub r1, r1, r5 \n" + "sub r2, r2, r6 \n" + "sub r3, r3, r7 \n" + "stmia %[v1]!, {r0-r3} \n" +#endif + ".rept " BLOCK_REPEAT "\n" + "ldmia %[v1], {r0-r3} \n" + "ldmia %[f2]!, {r4-r7} \n" + "mla %[res], r4, r0, %[res] \n" + "mla %[res], r5, r1, %[res] \n" + "mla %[res], r6, r2, %[res] \n" + "mla %[res], r7, r3, %[res] \n" + "ldmia %[s2]!, {r4-r7} \n" + "sub r0, r0, r4 \n" + "sub r1, r1, r5 \n" + "sub r2, r2, r6 \n" + "sub r3, r3, r7 \n" + "stmia %[v1]!, 
{r0-r3} \n" + ".endr \n" +#if ORDER > 32 + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" #endif : /* outputs */ #if ORDER > 32 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), - [v2] "+r"(v2) + [f2] "+r"(f2), + [s2] "+r"(s2), + [res]"=r"(res) : /* inputs */ : /* clobbers */ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "memory" ); + return res; } static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) @@ -106,78 +162,18 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) #endif asm volatile ( -#if ORDER > 16 #if ORDER > 32 "mov %[res], #0 \n" -#endif - "ldmia %[v2]!, {r6-r7} \n" "1: \n" - "ldmia %[v1]!, {r0,r1,r3-r5} \n" -#if ORDER > 32 - "mla %[res], r6, r0, %[res] \n" #else - "mul %[res], r6, r0 \n" -#endif - "mla %[res], r7, r1, %[res] \n" - "ldmia %[v2]!, {r0-r2,r6-r8} \n" - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "mla %[res], r2, r5, %[res] \n" - "ldmia %[v1]!, {r0-r4} \n" - "mla %[res], r6, r0, %[res] \n" - "mla %[res], r7, r1, %[res] \n" - "mla %[res], r8, r2, %[res] \n" - "ldmia %[v2]!, {r0,r1,r6-r8} \n" - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "ldmia %[v1]!, {r0-r5} \n" - "mla %[res], r6, r0, %[res] \n" - "mla %[res], r7, r1, %[res] \n" - "mla %[res], r8, r2, %[res] \n" - "ldmia %[v2]!, {r0-r2,r6,r7} \n" - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "mla %[res], r2, r5, %[res] \n" - "ldmia %[v1]!, {r0,r1,r3-r5} \n" - "mla %[res], r6, r0, %[res] \n" - "mla %[res], r7, r1, %[res] \n" - "ldmia %[v2]!, {r0-r2,r6-r8} \n" - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "mla %[res], r2, r5, %[res] \n" - "ldmia %[v1]!, {r0-r4} \n" - "mla %[res], r6, r0, %[res] \n" - "mla %[res], r7, r1, %[res] \n" - "mla %[res], r8, r2, %[res] \n" - "ldmia %[v2]!, {r0,r1,r6-r8} \n" - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "ldmia %[v1]!, {r0-r5} \n" - "mla %[res], r6, r0, %[res] \n" - "mla %[res], r7, r1, %[res] \n" - "mla %[res], r8, r2, %[res] \n" -#if ORDER > 32 - "ldmia %[v2]!, {r0-r2,r6,r7} \n" -#else - "ldmia %[v2]!, {r0-r2} \n" -#endif - "mla %[res], r0, r3, %[res] \n" - "mla %[res], r1, r4, %[res] \n" - "mla %[res], r2, r5, %[res] \n" -#if ORDER > 32 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - -#else /* ORDER <= 16 */ "ldmia %[v1]!, {r0-r3} \n" "ldmia %[v2]!, {r4-r7} \n" "mul %[res], r4, r0 \n" "mla %[res], r5, r1, %[res] \n" "mla %[res], r6, r2, %[res] \n" "mla %[res], r7, r3, %[res] \n" - - ".rept 3 \n" +#endif + ".rept " BLOCK_REPEAT "\n" "ldmia %[v1]!, {r0-r3} \n" "ldmia %[v2]!, {r4-r7} \n" "mla %[res], r4, r0, %[res] \n" @@ -185,7 +181,10 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) "mla %[res], r6, r2, %[res] \n" "mla %[res], r7, r3, %[res] \n" ".endr \n" -#endif /* ORDER <= 16 */ +#if ORDER > 32 + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" +#endif : /* outputs */ #if ORDER > 32 [cnt]"+r"(cnt), @@ -197,9 +196,6 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) : /* clobbers */ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7" -#if ORDER > 16 - ,"r8" -#endif ); return res; } diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S index 8efc92c2e..117b1789b 100644 --- a/apps/codecs/lib/udiv32_arm.S +++ b/apps/codecs/lib/udiv32_arm.S @@ -92,7 +92,7 @@ #if CONFIG_CPU == PP5020 .set recip_max, 8384 #elif CONFIG_CPU == PP5002 -.set recip_max, 4992 +.set recip_max, 4608 #else .set recip_max, 16384 #endif -- 2.11.4.GIT
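
Editorial note (not part of the commit): the fused coldfire and ARMv4 routines above are easier to review against a plain C reference. The sketch below is an assumption about the intended semantics of vector_sp_add(), inferred from the new call sites in do_apply_filter_3980()/do_apply_filter_3970(); the name vector_sp_add_ref and the loop body are illustrative only, while v1/f2/s2, ORDER and the int16_t element type come from the 16-bit (coldfire) header. The real asm additionally depends on EMAC signed integer mode and the alignment rules documented in its comment.

/* Hedged reference sketch: accumulate the scalar product of v1 and f2
 * while adding s2 to v1 in the same pass. vector_sp_sub() is identical
 * except that s2 is subtracted from v1 instead of added. */
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i]; /* scalarproduct(v1, f2) part */
        v1[i] += s2[i];                /* vector_add(v1, s2) part    */
    }
    return res;
}

Compared with the unfused path (scalarproduct() followed by vector_add() or vector_sub()), the fusion means the coefficient vector only has to be streamed through once per sample instead of twice, which is where the 3.5% to 12% decode speedups quoted in the commit message come from.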