From 4442bc539d916f279e8a9444848128f29a86ed22 Mon Sep 17 00:00:00 2001 From: saratoga Date: Mon, 29 Nov 2010 22:34:51 +0000 Subject: [PATCH] ARM9 optimized synth_full for libmad. Speeds up mp3 decoding by an even 2 MHz on all ARM9 and later devices. Note this is only optimized for arm9 (non-E), although it is faster on later devices. An arm9E/11 version will be needed for optimal performance on newer devices. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28710 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmad/synth_full_arm.S | 182 +++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 88 deletions(-) diff --git a/apps/codecs/libmad/synth_full_arm.S b/apps/codecs/libmad/synth_full_arm.S index dec437f66..8d312de64 100644 --- a/apps/codecs/libmad/synth_full_arm.S +++ b/apps/codecs/libmad/synth_full_arm.S @@ -31,7 +31,12 @@ ;; r1 = fo ;; r2 = fe ;; r3 = D0ptr - ;; r4 = D1ptr + ;; r4 = D1ptr + + /*;; r5 = loop counter + ;; r6,r7 accumulator1 + ;; r8,r9 accumulator2 */ + synth_full_odd_sbsample: stmdb sp!, {r4-r11, lr} ldr r4, [sp, #36] @@ -40,88 +45,89 @@ synth_full_odd_sbsample: mov r5, #15 add r2, r2, #32 .l: + /* ;; PROD_O and odd half of SB_SAMPLE*/ add r3, r3, #128 add r4, r4, #128 - ldmia r1!, {r10, r11, r12, lr} ldr r7, [r3, #4] - smull r6, r7, r10, r7 + ldmia r1!, {r10, r11, r12, lr} ldr r9, [r4, #120] + smull r6, r7, r10, r7 + ldr sp, [r3, #60] smull r8, r9, r10, r9 - - ldr r10, [r3, #60] - smlal r6, r7, r11, r10 ldr r10, [r3, #52] + smlal r6, r7, r11, sp + ldr sp, [r3, #44] smlal r6, r7, r12, r10 - ldr r10, [r3, #44] - smlal r6, r7, lr, r10 - ldr r10, [r4, #64] + smlal r6, r7, lr, sp + ldr sp, [r4, #72] smlal r8, r9, r11, r10 - ldr r10, [r4, #72] - smlal r8, r9, r12, r10 ldr r10, [r4, #80] + smlal r8, r9, r12, sp smlal r8, r9, lr, r10 - + ldr r10, [r3, #36] + ldmia r1!, {r11, r12, sp, lr} - ldr r10, [r3, #36] smlal r6, r7, r11, r10 + + ldr r10, [r4, #88] /*;;1 cycle stall on arm9, but we free up r11*/ + smlal r8, r9, r11, r10 + ldr r10, [r3, #28] + ldr r11, [r3, #20] smlal r6, r7, r12, r10 - ldr r10, [r3, #20] - smlal r6, r7, sp, r10 ldr r10, [r3, #12] + smlal r6, r7, sp, r11 + ldr r11, [r4, #96] smlal r6, r7, lr, r10 - - ldr r10, [r4, #88] - smlal r8, r9, r11, r10 - ldr r10, [r4, #96] - smlal r8, r9, r12, r10 ldr r10, [r4, #104] + smlal r8, r9, r12, r11 + ldr r11, [r4, #112] smlal r8, r9, sp, r10 - ldr r10, [r4, #112] - smlal r8, r9, lr, r10 + smlal r8, r9, lr, r11 rsbs r6, r6, #0 rsc r7, r7, #0 - - ldmia r2!, {r11, r12, sp, lr} - + + /* ;; PROD_A and even half of SB_SAMPLE*/ ldr r10, [r3, #0] + ldmia r2!, {r11, r12, sp, lr} smlal r6, r7, r11, r10 - ldr r10, [r3, #56] - smlal r6, r7, r12, r10 + + ldr r10, [r4, #60] /*;;1 cycle stall on arm9, but we free up r11*/ + smlal r8, r9, r11, r10 + ldr r11, [r3, #56] ldr r10, [r3, #48] + smlal r6, r7, r12, r11 + ldr r11, [r3, #40] smlal r6, r7, sp, r10 - ldr r10, [r3, #40] - smlal r6, r7, lr, r10 - - ldr r10, [r4, #60] - smlal r8, r9, r11, r10 ldr r10, [r4, #68] + smlal r6, r7, lr, r11 + ldr r11, [r4, #76] smlal r8, r9, r12, r10 - ldr r10, [r4, #76] - smlal r8, r9, sp, r10 - ldr r10, [r4, #84] + ldr r10, [r4, #84] + smlal r8, r9, sp, r11 smlal r8, r9, lr, r10 - - ldmia r2!, {r11, r12, sp, lr} + ldr r10, [r3, #32] + ldmia r2!, {r11, r12, sp, lr} smlal r6, r7, r11, r10 + + ldr r10, [r4, #92] /*;;1 cycle stall on arm9, but we free up r11*/ + smlal r8, r9, r11, r10 + ldr r10, [r3, #24] + ldr r11, [r3, #16] smlal r6, r7, r12, r10 - ldr r10, [r3, #16] - smlal r6, r7, sp, r10 ldr r10, [r3, #8] + smlal r6, r7, sp, r11 + ldr r11, [r4, #100] smlal r6, r7, lr, r10 - - ldr r10, [r4, #92] - smlal r8, r9, r11, r10 - ldr r10, [r4, #100] - smlal r8, r9, r12, r10 ldr r10, [r4, #108] + smlal r8, r9, r12, r11 + ldr r11, [r4, #116] smlal r8, r9, sp, r10 - ldr r10, [r4, #116] - smlal r8, r9, lr, r10 + smlal r8, r9, lr, r11 movs r6, r6, lsr #16 adc r6, r6, r7, lsl #16 @@ -146,88 +152,88 @@ synth_full_even_sbsample: mov r5, #15 add r2, r2, #32 .l2: + /* ;; PROD_O and odd half of SB_SAMPLE*/ add r3, r3, #128 add r4, r4, #128 - ldmia r1!, {r10, r11, r12, lr} ldr r7, [r3, #0] - smull r6, r7, r10, r7 + ldmia r1!, {r10, r11, r12, lr} ldr r9, [r4, #60] + smull r6, r7, r10, r7 + ldr sp, [r3, #56] smull r8, r9, r10, r9 - - ldr r10, [r3, #56] - smlal r6, r7, r11, r10 ldr r10, [r3, #48] + smlal r6, r7, r11, sp + ldr sp, [r3, #40] smlal r6, r7, r12, r10 - ldr r10, [r3, #40] - smlal r6, r7, lr, r10 - ldr r10, [r4, #68] + smlal r6, r7, lr, sp + + ldr sp, [r4, #76] smlal r8, r9, r11, r10 - ldr r10, [r4, #76] - smlal r8, r9, r12, r10 ldr r10, [r4, #84] + smlal r8, r9, r12, sp smlal r8, r9, lr, r10 - - ldmia r1!, {r11, r12, sp, lr} + ldr r10, [r3, #32] + ldmia r1!, {r11, r12, sp, lr} + smlal r6, r7, r11, r10 + ldr r10, [r4, #92] + smlal r8, r9, r11, r10 ldr r10, [r3, #24] + ldr r11, [r3, #16] smlal r6, r7, r12, r10 - ldr r10, [r3, #16] - smlal r6, r7, sp, r10 ldr r10, [r3, #8] + smlal r6, r7, sp, r11 + ldr r11, [r4, #100] smlal r6, r7, lr, r10 - - ldr r10, [r4, #92] - smlal r8, r9, r11, r10 - ldr r10, [r4, #100] - smlal r8, r9, r12, r10 ldr r10, [r4, #108] + smlal r8, r9, r12, r11 + ldr r11, [r4, #116] smlal r8, r9, sp, r10 - ldr r10, [r4, #116] - smlal r8, r9, lr, r10 + smlal r8, r9, lr, r11 rsbs r6, r6, #0 rsc r7, r7, #0 - ldmia r2!, {r11, r12, sp, lr} - ldr r10, [r3, #4] + ldmia r2!, {r11, r12, sp, lr} smlal r6, r7, r11, r10 + ldr r10, [r4, #120] /*;;1 cycle stall on arm9, but we free up r11*/ + smlal r8, r9, r11, r10 ldr r10, [r3, #60] + ldr r11, [r3, #52] smlal r6, r7, r12, r10 - ldr r10, [r3, #52] - smlal r6, r7, sp, r10 - ldr r10, [r3, #44] + ldr r10, [r3, #44] + smlal r6, r7, sp, r11 + ldr r11, [r4, #64] smlal r6, r7, lr, r10 - ldr r10, [r4, #120] - smlal r8, r9, r11, r10 - ldr r10, [r4, #64] - smlal r8, r9, r12, r10 ldr r10, [r4, #72] + smlal r8, r9, r12, r11 + ldr r11, [r4, #80] smlal r8, r9, sp, r10 - ldr r10, [r4, #80] - smlal r8, r9, lr, r10 - ldmia r2!, {r11, r12, sp, lr} + smlal r8, r9, lr, r11 + ldr r10, [r3, #36] + ldmia r2!, {r11, r12, sp, lr} smlal r6, r7, r11, r10 + ldr r10, [r4, #88] /*;;1 cycle stall on arm9, but we free up r11*/ + smlal r8, r9, r11, r10 + ldr r10, [r3, #28] + ldr r11, [r3, #20] smlal r6, r7, r12, r10 - ldr r10, [r3, #20] - smlal r6, r7, sp, r10 ldr r10, [r3, #12] + smlal r6, r7, sp, r11 + ldr r11, [r4, #96] smlal r6, r7, lr, r10 - - ldr r10, [r4, #88] - smlal r8, r9, r11, r10 - ldr r10, [r4, #96] - smlal r8, r9, r12, r10 ldr r10, [r4, #104] + smlal r8, r9, r12, r11 + ldr r11, [r4, #112] smlal r8, r9, sp, r10 - ldr r10, [r4, #112] - smlal r8, r9, lr, r10 + smlal r8, r9, lr, r11 movs r6, r6, lsr #16 adc r6, r6, r7, lsl #16 -- 2.11.4.GIT