From a0bc2b7550e92f7b0d71903d4b117d8a233dcdf0 Mon Sep 17 00:00:00 2001 From: funman Date: Mon, 10 May 2010 13:01:40 +0000 Subject: [PATCH] flac: remove ARM assembly Using current gcc it only makes decoding a tiny bit slower Using eabi gcc it makes no speed difference Tested on fuzev1 git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25929 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libffmpegFLAC/SOURCES | 2 - apps/codecs/libffmpegFLAC/arm.S | 271 ------------------------------------ apps/codecs/libffmpegFLAC/arm.h | 8 -- apps/codecs/libffmpegFLAC/decoder.c | 6 - 4 files changed, 287 deletions(-) delete mode 100644 apps/codecs/libffmpegFLAC/arm.S delete mode 100644 apps/codecs/libffmpegFLAC/arm.h diff --git a/apps/codecs/libffmpegFLAC/SOURCES b/apps/codecs/libffmpegFLAC/SOURCES index deed19bce..1bd92e8be 100644 --- a/apps/codecs/libffmpegFLAC/SOURCES +++ b/apps/codecs/libffmpegFLAC/SOURCES @@ -3,6 +3,4 @@ decoder.c shndec.c #if defined(CPU_COLDFIRE) coldfire.S -#elif defined(CPU_ARM) -arm.S #endif diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S deleted file mode 100644 index 2a2746eef..000000000 --- a/apps/codecs/libffmpegFLAC/arm.S +++ /dev/null @@ -1,271 +0,0 @@ -/*************************************************************************** - * __________ __ ___. - * Open \______ \ ____ ____ | | _\_ |__ _______ ___ - * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / - * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < - * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ - * \/ \/ \/ \/ \/ - * $Id$ - * - * Copyright (C) 2006 by Thom Johansen - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ****************************************************************************/ - -#include "config.h" - -/* The following is an assembler optimised version of the LPC filtering - routines needed for FLAC decoding. It is optimised for use with ARM - processors. - All LPC filtering up to order 9 is done in specially optimised unrolled - loops, while every order above this is handled by a slower default routine. - */ -#ifdef USE_IRAM - .section .icode,"ax",%progbits -#else - .text -#endif - .global lpc_decode_arm -lpc_decode_arm: - stmdb sp!, { r4-r11, lr } - ldr r4, [sp, #36] - /* r0 = blocksize, r1 = qlevel, r2 = pred_order - r3 = data, r4 = coeffs - */ - - /* the data pointer always lags behind history pointer by 'pred_order' - samples. since we have one loop for each order, we can hard code this - and free a register by not saving data pointer. - */ - sub r3, r3, r2, lsl #2 @ r3 = history - cmp r0, #0 @ no samples to process - beq .exit - cmp r2, #9 @ check if order is too high for unrolled loops - addls pc, pc, r2, lsl #2 @ jump to our unrolled decode loop if it exists -@ jumptable: - b .default @ order too high, go to default routine - b .exit @ zero order filter isn't possible, exit function - b .order1 - b .order2 - b .order3 - b .order4 - b .order5 - b .order6 - b .order7 - b .order8 - -@ last jump table entry coincides with target, so leave it out -.order9: - ldmia r4, { r5-r12, r14 } @ fetch coefs -.loop9: - ldr r4, [r3], #4 @ load first history sample - mul r2, r4, r14 @ multiply with last coef - ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2 - mla r2, r4, r12, r2 - ldr r4, [r3], #4 - mla r2, r4, r11, r2 - ldr r4, [r3], #4 - mla r2, r4, r10, r2 - ldr r4, [r3], #4 - mla r2, r4, r9, r2 - ldr r4, [r3], #4 - mla r2, r4, r8, r2 - ldr r4, [r3], #4 - mla r2, r4, r7, r2 - ldr r4, [r3], #4 - mla r2, r4, r6, r2 - ldr r4, [r3], #4 - mla r2, r4, r5, r2 - ldr r4, [r3] @ r4 = residual - add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual - str r2, [r3], #-8*4 @ save result and wrap history pointer back - subs r0, r0, #1 @ check if we're done - bne .loop9 @ nope, jump back - b .exit - -.order8: - ldmia r4, { r5-r12 } -.loop8: - @ we have more registers to spare here, so start block reading - ldmia r3!, { r4, r14 } - mul r2, r4, r12 - mla r2, r14, r11, r2 - ldmia r3!, { r4, r14 } - mla r2, r4, r10, r2 - mla r2, r14, r9, r2 - ldmia r3!, { r4, r14 } - mla r2, r4, r8, r2 - mla r2, r14, r7, r2 - ldmia r3!, { r4, r14 } - mla r2, r4, r6, r2 - mla r2, r14, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-7*4 - subs r0, r0, #1 - bne .loop8 - b .exit - -.order7: - ldmia r4, { r5-r11 } -.loop7: - ldmia r3!, { r4, r12, r14 } - mul r2, r4, r11 - mla r2, r12, r10, r2 - mla r2, r14, r9, r2 - ldmia r3!, { r4, r12, r14 } - mla r2, r4, r8, r2 - mla r2, r12, r7, r2 - mla r2, r14, r6, r2 - ldr r4, [r3], #4 - mla r2, r4, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-6*4 - subs r0, r0, #1 - bne .loop7 - b .exit - -.order6: - ldmia r4, { r5-r10 } -.loop6: - ldmia r3!, { r4, r11-r12, r14 } - mul r2, r4, r10 - mla r2, r11, r9, r2 - mla r2, r12, r8, r2 - mla r2, r14, r7, r2 - ldmia r3!, { r4, r11 } - mla r2, r4, r6, r2 - mla r2, r11, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-5*4 - subs r0, r0, #1 - bne .loop6 - b .exit - -.order5: - ldmia r4, { r5-r9 } -.loop5: - ldmia r3!, { r4, r10-r12, r14 } - mul r2, r4, r9 - mla r2, r10, r8, r2 - mla r2, r11, r7, r2 - mla r2, r12, r6, r2 - mla r2, r14, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-4*4 - subs r0, r0, #1 - bne .loop5 - b .exit - -.order4: - ldmia r4, { r5-r8 } -.loop4: - ldmia r3!, { r4, r11-r12, r14 } - mul r2, r4, r8 - mla r2, r11, r7, r2 - mla r2, r12, r6, r2 - mla r2, r14, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-3*4 - subs r0, r0, #1 - bne .loop4 - b .exit - -.order3: - ldmia r4, { r5-r7 } -.loop3: - ldmia r3!, { r4, r12, r14 } - mul r2, r4, r7 - mla r2, r12, r6, r2 - mla r2, r14, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-2*4 - subs r0, r0, #1 - bne .loop3 - b .exit - -.order2: - ldmia r4, { r5-r6 } -.loop2: - ldmia r3!, { r4, r14 } - mul r2, r4, r6 - mla r2, r14, r5, r2 - ldr r4, [r3] - add r2, r4, r2, asr r1 - str r2, [r3], #-1*4 - subs r0, r0, #1 - bne .loop2 - b .exit - -.order1: - ldr r5, [r4] @ load the one coef we need - ldr r4, [r3], #4 @ load one history sample, r3 now points to residual -.loop1: - mul r2, r4, r5 @ multiply coef by history sample - ldr r4, [r3] @ load residual - add r4, r4, r2, asr r1 @ add result to residual - str r4, [r3], #4 @ place r3 at next residual, we already have - subs r0, r0, #1 @ the current sample in r4 for the next iteration - bne .loop1 - b .exit - -.default: - /* we do the filtering in an unrolled by 4 loop as far as we can, and then - do the rest by jump table. */ - add r5, r4, r2, lsl #2 @ need to start in the other end of coefs - mov r7, r2, lsr #2 @ r7 = coefs/4 - mov r14, #0 @ init accumulator -.dloop1: - ldmdb r5!, { r8-r11 } - ldmia r3!, { r6, r12 } - mla r14, r6, r11, r14 - mla r14, r12, r10, r14 - ldmia r3!, { r6, r12 } - mla r14, r6, r9, r14 - mla r14, r12, r8, r14 - subs r7, r7, #1 - bne .dloop1 - - and r7, r2, #3 @ get remaining samples to be filtered - add pc, pc, r7, lsl #2 @ jump into accumulator chain -@ jumptable: - b .dsave @ padding - b .dsave - b .oneleft - b .twoleft -@ implicit .threeleft - ldr r12, [r5, #-4]! - ldr r8, [r3], #4 - mla r14, r12, r8, r14 -.twoleft: - ldr r12, [r5, #-4]! - ldr r8, [r3], #4 - mla r14, r12, r8, r14 -.oneleft: - ldr r12, [r5, #-4]! - ldr r8, [r3], #4 - mla r14, r12, r8, r14 - -.dsave: - ldr r12, [r3] @ load residual - add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual - str r14, [r3], #4 @ store result - sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos - subs r0, r0, #1 @ are we done? - bne .default @ no, prepare for next sample - -.exit: - ldmia sp!, { r4-r11, pc } - diff --git a/apps/codecs/libffmpegFLAC/arm.h b/apps/codecs/libffmpegFLAC/arm.h deleted file mode 100644 index 39080d7f7..000000000 --- a/apps/codecs/libffmpegFLAC/arm.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _FLAC_ARM_H -#define _FLAC_ARM_H - -#include "bitstream.h" - -void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs); - -#endif diff --git a/apps/codecs/libffmpegFLAC/decoder.c b/apps/codecs/libffmpegFLAC/decoder.c index ed175548f..e5c4b426d 100644 --- a/apps/codecs/libffmpegFLAC/decoder.c +++ b/apps/codecs/libffmpegFLAC/decoder.c @@ -44,8 +44,6 @@ #if defined(CPU_COLDFIRE) #include "coldfire.h" -#elif defined(CPU_ARM) -#include "arm.h" #endif #define FFMAX(a,b) ((a) > (b) ? (a) : (b)) @@ -264,10 +262,6 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order) (void)sum; lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order, decoded + pred_order, coeffs); - #elif defined(CPU_ARM) - (void)sum; - lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order, - decoded + pred_order, coeffs); #else for (i = pred_order; i < s->blocksize; i++) { -- 2.11.4.GIT