From 30683f6e6a10ae0cca0fd951b06be4b51e9715d4 Mon Sep 17 00:00:00 2001 From: jethead71 Date: Tue, 11 May 2010 08:40:52 +0000 Subject: [PATCH] ARM DSP: Make things a little more pipeline friendly. Reduce nonvolatile register stacking where possible. Routines now handle odd sample counts properly and will not over-write in that case. Remove a few pointless labels. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25943 a1c6a512-1295-4272-9138-f99709370657 --- apps/dsp_arm.S | 364 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 218 insertions(+), 146 deletions(-) diff --git a/apps/dsp_arm.S b/apps/dsp_arm.S index f924569bc..b4871d150 100644 --- a/apps/dsp_arm.S +++ b/apps/dsp_arm.S @@ -33,24 +33,37 @@ .type channels_process_sound_chan_mono, %function channels_process_sound_chan_mono: @ input: r0 = count, r1 = buf - stmfd sp!, {r4-r5, lr} - ldmia r1, {r2-r3} @ r4 = buf[0], r5 = buf[1] - -.monoloop: - ldmia r2, {r4-r5} - ldmia r3, {r12,lr} - mov r4, r4, asr #1 @ r4 = r4/2 - add r4, r4, r12, asr #1 @ r4 = r4 + r12/2 = (buf[0]+buf[1])/2 - mov r5, r5, asr #1 @ r5 = r5/2 - add r5, r5, lr, asr #1 @ r5 = r5 + lr/2 = (buf[0]+buf[1])/2 - stmia r2!, {r4-r5} - stmia r3!, {r4-r5} - subs r0, r0, #2 - bgt .monoloop - - ldmfd sp!, {r4-r5, pc} -.monoend: - .size channels_process_sound_chan_mono,.monoend-channels_process_sound_chan_mono + stmfd sp!, { r4, lr } @ + @ + ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] + subs r0, r0, #1 @ odd: end at 0; even: end at -1 + beq .mono_singlesample @ Zero? Only one sample! 
+ @ +.monoloop: @ + ldmia r1, { r3, r4 } @ r3, r4 = Li0, Li1 + ldmia r2, { r12, r14 } @ r12, r14 = Ri0, Ri1 + mov r3, r3, asr #1 @ Mo0 = Li0 / 2 + Ri0 / 2 + mov r4, r4, asr #1 @ Mo1 = Li1 / 2 + Ri1 / 2 + add r12, r3, r12, asr #1 @ + add r14, r4, r14, asr #1 @ + subs r0, r0, #2 @ + stmia r1!, { r12, r14 } @ store Mo0, Mo1 + stmia r2!, { r12, r14 } @ store Mo0, Mo1 + bgt .monoloop @ + @ + ldmltfd sp!, { r4, pc } @ if count was even, we're done + @ +.mono_singlesample: @ + ldr r3, [r1] @ r3 = Ls + ldr r12, [r2] @ r12 = Rs + mov r3, r3, asr #1 @ Mo = Ls / 2 + Rs / 2 + add r12, r3, r12, asr #1 @ + str r12, [r1] @ store Mo + str r12, [r2] @ store Mo + @ + ldmfd sp!, { r4, pc } @ + .size channels_process_sound_chan_mono, \ + .-channels_process_sound_chan_mono /**************************************************************************** * void channels_process_sound_chan_karaoke(int count, int32_t *buf[]) @@ -64,26 +77,40 @@ channels_process_sound_chan_mono: .type channels_process_sound_chan_karaoke, %function channels_process_sound_chan_karaoke: @ input: r0 = count, r1 = buf - stmfd sp!, {r4-r5, lr} - ldmia r1, {r2-r3} @ r4 = buf[0], r5 = buf[1] - -.karaokeloop: - ldmia r2, {r4-r5} - ldmia r3, {r12,lr} - mov r12, r12, asr #1 @ r12 = r12/2 - rsb r4, r12, r4, asr #1 @ r4 = -r12 + r4/2 = (buf[0]-buf[1])/2 - rsb r12, r4, #0 @ r12 = -r4 - mov lr, lr, asr #1 @ lr = lr/2 - rsb r5, lr, r5, asr #1 @ r5 = -lr + r5/2 = (buf[0]-buf[1])/2 - rsb lr, r5, #0 @ lr = -r5 - stmia r2!, {r4-r5} - stmia r3!, {r12,lr} - subs r0, r0, #2 - bgt .karaokeloop - - ldmfd sp!, {r4-r5, pc} -.karaokeend: - .size channels_process_sound_chan_karaoke,.karaokeend-channels_process_sound_chan_karaoke + stmfd sp!, { r4, lr } @ + @ + ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1] + subs r0, r0, #1 @ odd: end at 0; even: end at -1 + beq .karaoke_singlesample @ Zero? Only one sample! 
+ @ +.karaokeloop: @ + ldmia r1, { r3, r4 } @ r3, r4 = Li0, Li1 + ldmia r2, { r12, r14 } @ r12, r14 = Ri0, Ri1 + mov r3, r3, asr #1 @ Lo0 = Li0 / 2 - Ri0 / 2 + mov r4, r4, asr #1 @ Lo1 = Li1 / 2 - Ri1 / 2 + sub r3, r3, r12, asr #1 @ + sub r4, r4, r14, asr #1 @ + rsb r12, r3, #0 @ Ro0 = -Lo0 = Ri0 / 2 - Li0 / 2 + rsb r14, r4, #0 @ Ro1 = -Lo1 = Ri1 / 2 - Li1 / 2 + subs r0, r0, #2 @ + stmia r1!, { r3, r4 } @ store Lo0, Lo1 + stmia r2!, { r12, r14 } @ store Ro0, Ro1 + bgt .karaokeloop @ + @ + ldmltfd sp!, { r4, pc } @ if count was even, we're done + @ +.karaoke_singlesample: @ + ldr r3, [r1] @ r3 = Li + ldr r12, [r2] @ r12 = Ri + mov r3, r3, asr #1 @ Lo = Li / 2 - Ri / 2 + sub r3, r3, r12, asr #1 @ + rsb r12, r3, #0 @ Ro = -Lo = Ri / 2 - Li / 2 + str r3, [r1] @ store Lo + str r12, [r2] @ store Ro + @ + ldmfd sp!, { r4, pc } @ + .size channels_process_sound_chan_karaoke, \ + .-channels_process_sound_chan_karaoke #if ARM_ARCH < 6 /**************************************************************************** @@ -99,42 +126,57 @@ channels_process_sound_chan_karaoke: .type sample_output_mono, %function sample_output_mono: @ input: r0 = count, r1 = data, r2 = src, r3 = dst - stmfd sp!, {r4-r7, lr} + stmfd sp!, { r4-r6, lr } - ldr r4, [r2] @ r4 = src[0] - ldr r5, [r1] @ lr = data->output_scale - sub r1, r5, #1 @ r1 = r5-1 - mov r2, #1 - mov r2, r2, asl r1 @ r2 = 1<<r1 = 1 << (scale-1) - mvn r1, #0x8000 @ r1 needed for clipping - mov r12, #0xff00 - orr r12, r12, #0xff @ r12 = 0xffff + ldr r1, [r1] @ r1 = data->output_scale + ldr r2, [r2] @ r2 = src[0] + + mov r4, #1 + mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) + mov r4, r4, lsr #1 + mvn r14, #0x8000 @ r14 = 0xffff7fff, needed for + @ clipping and masking + subs r0, r0, #1 @ + beq .som_singlesample @ Zero? Only one sample!
.somloop: - ldmia r4!, {r6-r7} - add r6, r6, r2 - mov r6, r6, asr r5 @ r6 = (r6 + 1<<(scale-1)) >> scale - mov lr, r6, asr #15 - teq lr, lr, asr #31 - eorne r6, r1, lr, asr #31 @ Clip (-32768...+32767) - add r7, r7, r2 - mov r7, r7, asr r5 @ r7 = (r7 + 1<<(scale-1)) >> scale - mov lr, r7, asr #15 - teq lr, lr, asr #31 - eorne r7, r1, lr, asr #31 @ Clip (-32768...+32767) + ldmia r2!, { r5, r6 } + add r5, r5, r4 @ r5 = (r5 + 1<<(scale-1)) >> scale + mov r5, r5, asr r1 + mov r12, r5, asr #15 + teq r12, r12, asr #31 + eorne r5, r14, r5, asr #31 @ Clip (-32768...+32767) + add r6, r6, r4 + mov r6, r6, asr r1 @ r6 = (r6 + 1<<(scale-1)) >> scale + mov r12, r6, asr #15 + teq r12, r12, asr #31 + eorne r6, r14, r6, asr #31 @ Clip (-32768...+32767) - and r6, r6, r12 - orr r6, r6, r6, asl #16 @ pack first 2 halfwords into 1 word - and r7, r7, r12 - orr r7, r7, r7, asl #16 @ pack last 2 halfwords into 1 word - stmia r3!, {r6-r7} + and r5, r5, r14, lsr #16 + and r6, r6, r14, lsr #16 + orr r5, r5, r5, lsl #16 @ pack first 2 halfwords into 1 word + orr r6, r6, r6, lsl #16 @ pack last 2 halfwords into 1 word + stmia r3!, { r5, r6 } subs r0, r0, #2 bgt .somloop - ldmfd sp!, {r4-r7, pc} -.somend: - .size sample_output_mono,.somend-sample_output_mono + ldmltfd sp!, { r4-r6, pc } @ even 'count'?
return + +.som_singlesample: + ldr r5, [r2] @ do odd sample + add r5, r5, r4 + mov r5, r5, asr r1 + mov r12, r5, asr #15 + teq r12, r12, asr #31 + eorne r5, r14, r5, asr #31 + + and r5, r5, r14, lsr #16 @ pack 2 halfwords into 1 word + orr r5, r5, r5, lsl #16 + str r5, [r3] + + ldmfd sp!, { r4-r6, pc } + .size sample_output_mono, .-sample_output_mono /**************************************************************************** * void sample_output_stereo(int count, struct dsp_data *data, @@ -149,54 +191,80 @@ sample_output_mono: .type sample_output_stereo, %function sample_output_stereo: @ input: r0 = count, r1 = data, r2 = src, r3 = dst - stmfd sp!, {r4-r10, lr} + stmfd sp!, { r4-r9, lr } - ldmia r2, {r4-r5} @ r4 = src[0], r5 = src[1] - ldr r6, [r1] @ r6 = data->output_scale - sub r1, r6, #1 @ r1 = r6-1 - mov r2, #1 - mov r2, r2, asl r1 @ r2 = 1<<r1 = 1 << (scale-1) - mvn r1, #0x8000 @ r1 needed for clipping - mov r12, #0xff00 - orr r12, r12, #0xff @ r12 = 0xffff + ldr r1, [r1] @ r1 = data->output_scale + ldmia r2, { r2, r5 } @ r2 = src[0], r5 = src[1] + + mov r4, #1 + mov r4, r4, lsl r1 @ r4 = 1 << (scale-1) + mov r4, r4, lsr #1 @ + + mvn r14, #0x8000 @ r14 = 0xffff7fff, needed for + @ clipping and masking + subs r0, r0, #1 @ + beq .sos_singlesample @ Zero? Only one sample!
.sosloop: - ldmia r4!, {r7-r8} - add r7, r7, r2 - mov r7, r7, asr r6 @ r7 = (r7 + 1<<(scale-1)) >> scale - mov lr, r7, asr #15 - teq lr, lr, asr #31 - eorne r7, r1, lr, asr #31 @ Clip (-32768...+32767) - add r8, r8, r2 - mov r8, r8, asr r6 @ r8 = (r8 + 1<<(scale-1)) >> scale - mov lr, r8, asr #15 - teq lr, lr, asr #31 - eorne r8, r1, lr, asr #31 @ Clip (-32768...+32767) + ldmia r2!, { r6, r7 } @ 2 left + ldmia r5!, { r8, r9 } @ 2 right + + add r6, r6, r4 @ r6 = (r6 + 1<<(scale-1)) >> scale + mov r6, r6, asr r1 + mov r12, r6, asr #15 + teq r12, r12, asr #31 + eorne r6, r14, r6, asr #31 @ Clip (-32768...+32767) + add r7, r7, r4 + mov r7, r7, asr r1 @ r7 = (r7 + 1<<(scale-1)) >> scale + mov r12, r7, asr #15 + teq r12, r12, asr #31 + eorne r7, r14, r7, asr #31 @ Clip (-32768...+32767) - ldmia r5!, {r9-r10} - add r9, r9, r2 - mov r9, r9, asr r6 @ r9 = (r9 + 1<<(scale-1)) >> scale - mov lr, r9, asr #15 - teq lr, lr, asr #31 - eorne r9, r1, lr, asr #31 @ Clip (-32768...+32767) - add r10, r10, r2 - mov r10, r10, asr r6 @ r10 = (r10 + 1<<(scale-1)) >> scale - mov lr, r10, asr #15 - teq lr, lr, asr #31 - eorne r10, r1, lr, asr #31 @ Clip (-32768...+32767) + add r8, r8, r4 @ r8 = (r8 + 1<<(scale-1)) >> scale + mov r8, r8, asr r1 + mov r12, r8, asr #15 + teq r12, r12, asr #31 + eorne r8, r14, r8, asr #31 @ Clip (-32768...+32767) + add r9, r9, r4 @ r9 = (r9 + 1<<(scale-1)) >> scale + mov r9, r9, asr r1 + mov r12, r9, asr #15 + teq r12, r12, asr #31 + eorne r9, r14, r9, asr #31 @ Clip (-32768...+32767) - and r7, r7, r12 - orr r9, r7, r9, asl #16 @ pack first 2 halfwords into 1 word - and r8, r8, r12 - orr r10, r8, r10, asl #16 @ pack last 2 halfwords into 1 word - stmia r3!, {r9-r10} + and r6, r6, r14, lsr #16 @ pack first 2 halfwords into 1 word + orr r8, r6, r8, asl #16 + and r7, r7, r14, lsr #16 @ pack last 2 halfwords into 1 word + orr r9, r7, r9, asl #16 + + stmia r3!, { r8, r9 } subs r0, r0, #2 bgt .sosloop - ldmfd sp!, {r4-r10, pc} -.sosend: - .size 
sample_output_stereo,.sosend-sample_output_stereo + ldmltfd sp!, { r4-r9, pc } @ even 'count'? return + +.sos_singlesample: + ldr r6, [r2] @ left odd sample + ldr r8, [r5] @ right odd sample + + add r6, r6, r4 @ r6 = (r6 + 1<<(scale-1)) >> scale + mov r6, r6, asr r1 + mov r12, r6, asr #15 + teq r12, r12, asr #31 + eorne r6, r14, r6, asr #31 @ Clip (-32768...+32767) + add r8, r8, r4 @ r8 = (r8 + 1<<(scale-1)) >> scale + mov r8, r8, asr r1 + mov r12, r8, asr #15 + teq r12, r12, asr #31 + eorne r8, r14, r8, asr #31 @ Clip (-32768...+32767) + + and r6, r6, r14, lsr #16 @ pack 2 halfwords into 1 word + orr r8, r6, r8, asl #16 + + str r8, [r3] + + ldmfd sp!, { r4-r9, pc } + .size sample_output_stereo, .-sample_output_stereo #endif /* ARM_ARCH < 6 */ /**************************************************************************** @@ -259,8 +327,7 @@ apply_crossfeed: str r0, [r12, #30*4] @ save delay line index add sp, sp, #8 @ remove temp variables from stack ldmia sp!, { r4-r11, pc } -.cfend: - .size apply_crossfeed,.cfend-apply_crossfeed + .size apply_crossfeed, .-apply_crossfeed /**************************************************************************** * int dsp_downsample(int count, struct dsp_data *data, @@ -317,8 +384,7 @@ dsp_downsample: sub r8, r8, r1 @ dst - &dst[0] mov r0, r8, lsr #2 @ convert bytes->samples ldmia sp!, { r4-r11, pc } @ ...
and we're out -.dsend: - .size dsp_downsample,.dsend-dsp_downsample + .size dsp_downsample, .-dsp_downsample /**************************************************************************** * int dsp_upsample(int count, struct dsp_data *dsp, @@ -327,23 +393,22 @@ dsp_downsample: .section .text .global dsp_upsample dsp_upsample: - stmdb sp!, { r4-r11, lr } @ stack modified regs + stmfd sp!, { r4-r11, lr } @ stack modified regs ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta sub r5, r5, #1 @ pre-decrement num_channels for use add r4, r1, #12 @ r4 = &resample_data.phase - stmdb sp!, { r0, r4 } @ stack count and &resample_data.phase + mov r6, r6, lsl #16 @ we'll use carry to detect pos increments + stmfd sp!, { r0, r4 } @ stack count and &resample_data.phase .uschannel_loop: ldr r12, [r4] @ r12 = resample_data.phase - mov r1, r12, ror #16 @ swap halfword positions, we'll use carry - @ to detect pos increments ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1] ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1] add r9, r4, #4 @ r9 = &last_sample[0] - ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] + mov r1, r12, lsl #16 @ we'll use carry to detect pos increments sub r11, r0, #1 ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ... 
+ ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1] str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample - add r9, r7, r0, lsl #2 @ r9 = src_end = &src[count] movs r14, r12, lsr #16 @ pos = resample_data.phase >> 16 beq .usstart_0 @ pos = 0 cmp r14, r0 @ if pos >= count, we're already done @@ -354,41 +419,38 @@ dsp_upsample: @ Register usage in loop: @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel, - @ r6 = delta, r7 = s, r8 = d, r9 = src_end, r10 = s[pos - 1], r11 = s[pos] + @ r6 = delta, r7 = s, r8 = d, r9 = diff, r10 = s[pos - 1], r11 = s[pos] .usloop_1: mov r10, r11 @ r10 = previous sample .usstart_0: ldr r11, [r7], #4 @ r11 = next sample - sub r0, r11, r10 @ r0 = s[pos] - s[pos - 1] + mov r4, r1, lsr #16 @ r4 = frac = phase >> 16 + sub r9, r11, r10 @ r9 = diff = s[pos] - s[pos - 1] .usloop_0: + smull r12, r14, r4, r9 + adds r1, r1, r6 @ phase += delta << 16 mov r4, r1, lsr #16 @ r4 = frac = phase >> 16 - smull r12, r14, r4, r0 add r14, r10, r14, lsl #16 add r14, r14, r12, lsr #16 @ r14 = out = s[pos - 1] + frac*diff str r14, [r8], #4 @ *d++ = out - adds r1, r1, r6, lsl #16 @ phase += delta << 16 bcc .usloop_0 @ if carry is set, pos is incremented - cmp r7, r9 @ if s < src_end, do another sample - blo .usloop_1 + subs r0, r0, #1 @ if count > 0, do another sample + bgt .usloop_1 .usloop_skip: subs r5, r5, #1 - ldmia sp, { r0, r4 } @ reload count and &resample_data.phase + ldmfd sp, { r0, r4 } @ reload count and &resample_data.phase bpl .uschannel_loop @ if (--ch) >= 0, do another channel - mov r1, r1, ror #16 @ wrap phase back to start of next frame - str r1, [r4] @ store back - ldr r1, [r3] @ r1 = &dst[0] - sub r8, r8, r1 @ dst - &dst[0] + mov r1, r1, lsr #16 @ wrap phase back to start of next frame + ldr r2, [r3] @ r2 = &dst[0] + str r1, [r4] @ store phase + sub r8, r8, r2 @ dst - &dst[0] mov r0, r8, lsr #2 @ convert bytes->samples add sp, sp, #8 @ adjust stack for temp variables - ldmia sp!, { r4-r11, pc } @ ...
and we're out -.usend: - .size dsp_upsample,.usend-dsp_upsample + ldmfd sp!, { r4-r11, pc } @ ... and we're out + .size dsp_upsample, .-dsp_upsample /**************************************************************************** * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]) - * NOTE: The following code processes two samples at once. When count is odd, - * there is an additional obsolete sample processed, which will not be - * used by the calling functions. */ .section .icode, "ax", %progbits .align 2 @@ -396,30 +458,40 @@ dsp_upsample: .type dsp_apply_gain, %function dsp_apply_gain: @ input: r0 = count, r1 = data, r2 = buf[] - stmfd sp!, {r4-r7, lr} + stmfd sp!, { r4-r8, lr } ldr r3, [r1, #4] @ r3 = data->num_channels ldr r4, [r1, #32] @ r5 = data->gain .dag_outerloop: ldr r1, [r2], #4 @ r1 = buf[0] and increment index of buf[] - mov r12, r0 @ r12 = r0 = count + subs r12, r0, #1 @ r12 = r0 = count - 1 + beq .dag_singlesample @ Zero? Only one sample! .dag_innerloop: - ldmia r1, {r5, r6} @ load r5, r6 from r1 - smull r7, lr, r5, r4 @ r5 = FRACMUL_SHL(r5, r4, 8) - mov lr, lr, asl #9 - orr r5, lr, r7, lsr #23 - smull r7, lr, r6, r4 @ r6 = FRACMUL_SHL(r6, r4, 8) - mov lr, lr, asl #9 - orr r6, lr, r7, lsr #23 - stmia r1!, {r5, r6} @ save r5, r6 to r1 and increment r1 + ldmia r1, { r5, r6 } @ load r5, r6 from r1 + smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) + smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8) subs r12, r12, #2 + mov r7, r7, lsr #23 + mov r14, r14, lsr #23 + orr r7, r7, r8, asl #9 + orr r14, r14, r5, asl #9 + stmia r1!, { r7, r14 } @ save r7, r14 to [r1] and increment r1 bgt .dag_innerloop @ end of inner loop + blt .dag_evencount @ < 0? 
even count + +.dag_singlesample: + ldr r5, [r1] @ handle odd sample + smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8) + mov r7, r7, lsr #23 + orr r7, r7, r8, asl #9 + str r7, [r1] + +.dag_evencount: subs r3, r3, #1 bgt .dag_outerloop @ end of outer loop - - ldmfd sp!, {r4-r7, pc} -.dagend: - .size dsp_apply_gain,.dagend-dsp_apply_gain + + ldmfd sp!, { r4-r8, pc } + .size dsp_apply_gain, .-dsp_apply_gain -- 2.11.4.GIT