From abdc9d8b6c167990b6ee0eed869298fa3fed0164 Mon Sep 17 00:00:00 2001
From: amiconn
Date: Sat, 20 Jun 2009 14:05:15 +0000
Subject: [PATCH] Faster idct for ARMv6. Overall mpegplayer speedup is quite
 minimal though.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21392 a1c6a512-1295-4272-9138-f99709370657
---
 apps/plugins/mpegplayer/SOURCES      |   4 +
 apps/plugins/mpegplayer/decode.c     |   4 ++--
 apps/plugins/mpegplayer/idct_armv6.S | 337 +++++++++++++++++++++++++++++++++++
 3 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 apps/plugins/mpegplayer/idct_armv6.S

diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 5b3360cc5..5ca0fcd86 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -9,7 +9,11 @@ idct_coldfire.S
 motion_comp_coldfire_c.c
 motion_comp_coldfire_s.S
 #elif defined CPU_ARM
+#if ARM_ARCH >= 6
+idct_armv6.S
+#else
 idct_arm.S
+#endif
 motion_comp_arm_c.c
 motion_comp_arm_s.S
 #else /* other CPU or SIM */
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index a19b929be..91251206e 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -35,7 +35,7 @@
 
 #define BUFFER_SIZE (1194 * 1024)
 
-#ifdef CPU_COLDFIRE
-/* twice as large as on other targets because coldfire uses
+#if defined(CPU_COLDFIRE) || (defined(CPU_ARM) && ARM_ARCH >= 6)
+/* twice as large as on other targets because coldfire and ARMv6 use
  * a secondary, transposed buffer for optimisation */
 static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S
new file mode 100644
index 000000000..73feed478
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_armv6.S
@@ -0,0 +1,337 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2009 by Jens Arnold
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+
+    .global mpeg2_idct_copy
+    .type   mpeg2_idct_copy, %function
+    .global mpeg2_idct_add
+    .type   mpeg2_idct_add, %function
+
+/* Custom calling convention:
+ * r0 contains block pointer and is non-volatile
+ * all non-volatile C context saved and restored on its behalf
+ */
+.idct:
+    str    lr, [sp, #-4]!          @ lr is used (r14 = loop counter)
+    add    r1, r0, #128            @ secondary, transposed temp buffer
+    mov    r14, #8                 @ loop counter
+
+.row_loop:
+    ldmia  r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
+    ldrd   r4, L_W1357             @ load W1, W3, W5, W7
+
+    smuad  r6, r4, r10             @ b0 = W1 * f1 + W3 * f3
+    smlad  r6, r5, r11, r6         @    + W5 * f5 + W7 * f7
+
+    smultt r7, r5, r10             @ b1 = -W7 * f3
+    smlabb r7, r4, r11, r7         @    + -W1 * f5
+    smlabt r7, r5, r11, r7         @    + -W5 * f7
+    rsb    r7, r7, #0
+    smlatb r7, r4, r10, r7         @    + W3 * f1
+
+    smulbt r8, r4, r10             @ b2 = -W1 * f3
+    rsb    r8, r8, #0
+    smlabb r8, r5, r10, r8         @    + W5 * f1
+    smlatb r8, r5, r11, r8         @    + W7 * f5
+    smlatt r8, r4, r11, r8         @    + W3 * f7
+
+    smusdx r9, r10, r5             @ b3 = f1 * W7 - f3 * W5
+    smlsdx r9, r11, r4, r9         @    + f5 * W3 - f7 * W1
+
+    ldrd   r4, L_W0246             @ load W0, W2, W4, W6
+    add    r2, r2, #1              @ f0 += 1 (rounding: 1 * W0 == 1 << 11)
+
+    smulbb r10, r4, r2             @ a0' = W0 * f0
+    smlabb r10, r5, r3, r10        @     + W4 * f4
+    smultt r12, r4, r2             @ a3' = W2 * f2
+    smlatt r12, r5, r3, r12        @     + W6 * f6
+    add    r10, r10, r12           @ a0 = a0' + a3'
+    sub    r12, r10, r12, lsl #1   @ a3 = a0 - 2 * a3'
+
+    smulbb r11, r5, r3             @ a1' = -W4 * f4
+    rsb    r11, r11, #0
+    smlabb r11, r4, r2, r11        @     + W0 * f0
+    smultt r3, r4, r3              @ a2' = -W2 * f6
+    rsb    r3, r3, #0
+    smlatt r3, r5, r2, r3          @     + W6 * f2
+    add    r11, r11, r3            @ a1 = a1' + a2'
+    sub    r3, r11, r3, lsl #1     @ a2 = a1 - 2 * a2'
+
+    sub    r2, r10, r6             @ block[7] = (a0 - b0)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #7*16]
+    sub    r2, r11, r7             @ block[6] = (a1 - b1)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #6*16]
+    sub    r2, r3, r8              @ block[5] = (a2 - b2)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #5*16]
+    sub    r2, r12, r9             @ block[4] = (a3 - b3)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #4*16]
+    add    r2, r12, r9             @ block[3] = (a3 + b3)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #3*16]
+    add    r2, r3, r8              @ block[2] = (a2 + b2)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #2*16]
+    add    r2, r11, r7             @ block[1] = (a1 + b1)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1, #1*16]
+    add    r2, r10, r6             @ block[0] = (a0 + b0)
+    mov    r2, r2, asr #12         @            >> 12
+    strh   r2, [r1], #2            @ advance to next temp column
+
+    subs   r14, r14, #1
+    bne    .row_loop
+    b      .col_start
+
+    @ placed here because of ldrd's offset limit
+L_W1357:
+    .short 2841
+    .short 2408
+    .short 1609
+    .short 565
+
+L_W0246:
+    .short 2048
+    .short 2676
+    .short 2048
+    .short 1108
+
+.col_start:
+    @ r0 now points to the temp buffer, where we need it.
+    sub    r1, r1, #128+16         @ point r1 back to the input block
+    mov    r14, #8                 @ loop counter
+
+.col_loop:
+    ldmia  r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
+    ldrd   r4, L_W1357             @ load W1, W3, W5, W7
+
+    smuad  r6, r4, r10             @ b0 = W1 * f1 + W3 * f3
+    smlad  r6, r5, r11, r6         @    + W5 * f5 + W7 * f7
+
+    smultt r7, r5, r10             @ b1 = -W7 * f3
+    smlabb r7, r4, r11, r7         @    + -W1 * f5
+    smlabt r7, r5, r11, r7         @    + -W5 * f7
+    rsb    r7, r7, #0
+    smlatb r7, r4, r10, r7         @    + W3 * f1
+
+    smulbt r8, r4, r10             @ b2 = -W1 * f3
+    rsb    r8, r8, #0
+    smlabb r8, r5, r10, r8         @    + W5 * f1
+    smlatb r8, r5, r11, r8         @    + W7 * f5
+    smlatt r8, r4, r11, r8         @    + W3 * f7
+
+    smusdx r9, r10, r5             @ b3 = f1 * W7 - f3 * W5
+    smlsdx r9, r11, r4, r9         @    + f5 * W3 - f7 * W1
+
+    ldrd   r4, L_W0246             @ load W0, W2, W4, W6
+    add    r2, r2, #32             @ DC offset: 0.5
+
+    smulbb r10, r4, r2             @ a0' = W0 * f0
+    smlabb r10, r5, r3, r10        @     + W4 * f4
+    smultt r12, r4, r2             @ a3' = W2 * f2
+    smlatt r12, r5, r3, r12        @     + W6 * f6
+    add    r10, r10, r12           @ a0 = a0' + a3'
+    sub    r12, r10, r12, lsl #1   @ a3 = a0 - 2 * a3'
+
+    smulbb r11, r5, r3             @ a1' = -W4 * f4
+    rsb    r11, r11, #0
+    smlabb r11, r4, r2, r11        @     + W0 * f0
+    smultt r3, r4, r3              @ a2' = -W2 * f6
+    rsb    r3, r3, #0
+    smlatt r3, r5, r2, r3          @     + W6 * f2
+    add    r11, r11, r3            @ a1 = a1' + a2'
+    sub    r3, r11, r3, lsl #1     @ a2 = a1 - 2 * a2'
+
+    sub    r2, r10, r6             @ block[7] = (a0 - b0)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #7*16]
+    sub    r2, r11, r7             @ block[6] = (a1 - b1)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #6*16]
+    sub    r2, r3, r8              @ block[5] = (a2 - b2)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #5*16]
+    sub    r2, r12, r9             @ block[4] = (a3 - b3)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #4*16]
+    add    r2, r12, r9             @ block[3] = (a3 + b3)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #3*16]
+    add    r2, r3, r8              @ block[2] = (a2 + b2)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #2*16]
+    add    r2, r11, r7             @ block[1] = (a1 + b1)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1, #1*16]
+    add    r2, r10, r6             @ block[0] = (a0 + b0)
+    mov    r2, r2, asr #17         @            >> 17
+    strh   r2, [r1], #2            @ advance to next column
+
+    subs   r14, r14, #1
+    bne    .col_loop
+
+    sub    r0, r0, #256            @ point r0 back to the input block
+    ldr    pc, [sp], #4
+
+
+mpeg2_idct_copy:
+    stmfd  sp!, {r1-r2, r4-r12, lr}
+    bl     .idct
+    ldmfd  sp!, {r1-r2}
+
+    add    r12, r0, #128
+    ldrd   r4, [r0]
+    mov    r8, #0
+    mov    r9, #0
+    mov    r10, #0
+    mov    r11, #0
+1:
+    ldrd   r6, [r0, #8]
+    usat16 r4, #8, r4
+    strb   r4, [r1, #0]
+    mov    r4, r4, lsr #16
+    strb   r4, [r1, #1]
+    usat16 r5, #8, r5
+    strb   r5, [r1, #2]
+    mov    r5, r5, lsr #16
+    strb   r5, [r1, #3]
+    ldrd   r4, [r0, #16]
+    usat16 r6, #8, r6
+    strb   r6, [r1, #4]
+    mov    r6, r6, lsr #16
+    strb   r6, [r1, #5]
+    usat16 r7, #8, r7
+    strb   r7, [r1, #6]
+    mov    r7, r7, lsr #16
+    strb   r7, [r1, #7]
+    stmia  r0!, {r8-r11}           @ clear the block as we go
+    add    r1, r1, r2
+    cmp    r0, r12
+    blo    1b
+
+    ldmfd  sp!, {r4-r12, pc}
+
+mpeg2_idct_add:
+    cmp    r0, #129
+    mov    r0, r1
+    ldreqsh r1, [r0, #0]
+    bne    1f
+    and    r1, r1, #0x70
+    cmp    r1, #0x40
+    bne    3f
+1:
+    stmfd  sp!, {r2-r12, lr}
+    bl     .idct
+    ldmfd  sp!, {r1-r2}
+    mov    r11, #0
+    add    r12, r0, #128
+2:
+    ldmia  r0, {r3-r6}
+    ldrb   r7, [r1, #0]
+    ldrb   r8, [r1, #1]
+    ldrb   r9, [r1, #2]
+    ldrb   r10, [r1, #3]
+    str    r11, [r0], #4
+    orr    r7, r7, r8, lsl #16
+    sadd16 r3, r3, r7
+    usat16 r3, #8, r3
+    strb   r3, [r1, #0]
+    mov    r3, r3, lsr #16
+    strb   r3, [r1, #1]
+    str    r11, [r0], #4
+    orr    r9, r9, r10, lsl #16
+    sadd16 r4, r4, r9
+    usat16 r4, #8, r4
+    strb   r4, [r1, #2]
+    mov    r4, r4, lsr #16
+    strb   r4, [r1, #3]
+    ldrb   r7, [r1, #4]
+    ldrb   r8, [r1, #5]
+    ldrb   r9, [r1, #6]
+    ldrb   r10, [r1, #7]
+    str    r11, [r0], #4
+    orr    r7, r7, r8, lsl #16
+    sadd16 r5, r5, r7
+    usat16 r5, #8, r5
+    strb   r5, [r1, #4]
+    mov    r5, r5, lsr #16
+    strb   r5, [r1, #5]
+    str    r11, [r0], #4
+    orr    r9, r9, r10, lsl #16
+    sadd16 r6, r6, r9
+    usat16 r6, #8, r6
+    strb   r6, [r1, #6]
+    mov    r6, r6, lsr #16
+    strb   r6, [r1, #7]
+    add    r1, r1, r2
+    cmp    r0, r12
+    blo    2b
+    ldmfd  sp!, {r4-r12, pc}
+
+3:
+    stmfd  sp!, {r4-r7}
+    ldrsh  r1, [r0, #0]            /* r1 = block[0] */
+    mov    r12, #0                 /* zero in ip; r4-r7 are the only saved regs */
+    strh   r12, [r0, #0]           /* block[0] = 0 */
+    strh   r12, [r0, #126]         /* block[63] = 0 */
+    add    r1, r1, #64             /* r1 = DC << 7, plus rounding */
+    add    r0, r2, r3, asl #3
+4:
+    ldrb   r4, [r2, #0]
+    ldrb   r5, [r2, #1]
+    ldrb   r6, [r2, #2]
+    ldrb   r7, [r2, #3]
+    add    r4, r4, r1, asr #7
+    usat   r4, #8, r4
+    strb   r4, [r2, #0]
+    add    r5, r5, r1, asr #7
+    usat   r5, #8, r5
+    strb   r5, [r2, #1]
+    add    r6, r6, r1, asr #7
+    usat   r6, #8, r6
+    strb   r6, [r2, #2]
+    add    r7, r7, r1, asr #7
+    usat   r7, #8, r7
+    strb   r7, [r2, #3]
+    ldrb   r4, [r2, #4]
+    ldrb   r5, [r2, #5]
+    ldrb   r6, [r2, #6]
+    ldrb   r7, [r2, #7]
+    add    r4, r4, r1, asr #7
+    usat   r4, #8, r4
+    strb   r4, [r2, #4]
+    add    r5, r5, r1, asr #7
+    usat   r5, #8, r5
+    strb   r5, [r2, #5]
+    add    r6, r6, r1, asr #7
+    usat   r6, #8, r6
+    strb   r6, [r2, #6]
+    add    r7, r7, r1, asr #7
+    usat   r7, #8, r7
+    strb   r7, [r2, #7]
+    add    r2, r2, r3
+    cmp    r2, r0
+    blo    4b
+    ldmfd  sp!, {r4-r7}
+    bx     lr
-- 
2.11.4.GIT
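
Note (reviewer's addendum, not part of the patch): the assembly implements the
classic even/odd decomposition of the 8-point IDCT, with cosine constants
scaled by 2048 (L_W1357/L_W0246 hold Wn = round(2048 * sqrt(2) * cos(n*pi/16)),
except W0 = W4 = 2048). The C sketch below models one row pass so the
register-level comments are easier to follow; idct_row and its natural f0..f7
input order are illustrative only -- the real code reads coefficients in the
shuffled f0,f2,f4,f6,f1,f3,f5,f7 order noted in the ldmia comments and writes
into a transposed temp buffer, and the column pass is identical except that it
uses "f0 += 32" and ">> 17".

    #include <stdint.h>
    #include <stdio.h>

    /* Constants as in L_W1357/L_W0246 above. */
    enum { W0 = 2048, W1 = 2841, W2 = 2676, W3 = 2408,
           W4 = 2048, W5 = 1609, W6 = 1108, W7 = 565 };

    /* One 8-point row transform: even part a0..a3, odd part b0..b3,
     * butterfly, then scale down by >> 12.  The "+ 1" on f0 is the
     * rounding trick from the assembly: 1 * W0 == 1 << 11, i.e. half
     * an LSB of the >> 12 result. */
    static void idct_row(const int16_t *f, int32_t *out)
    {
        int32_t f0 = f[0] + 1;
        int32_t a0p = W0 * f0   + W4 * f[4];     /* a0' */
        int32_t a1p = W0 * f0   - W4 * f[4];     /* a1' */
        int32_t a3p = W2 * f[2] + W6 * f[6];     /* a3' */
        int32_t a2p = W6 * f[2] - W2 * f[6];     /* a2' */
        int32_t a0 = a0p + a3p, a3 = a0p - a3p;  /* a3 = a0 - 2 * a3' */
        int32_t a1 = a1p + a2p, a2 = a1p - a2p;  /* a2 = a1 - 2 * a2' */

        int32_t b0 = W1 * f[1] + W3 * f[3] + W5 * f[5] + W7 * f[7];
        int32_t b1 = W3 * f[1] - W7 * f[3] - W1 * f[5] - W5 * f[7];
        int32_t b2 = W5 * f[1] - W1 * f[3] + W7 * f[5] + W3 * f[7];
        int32_t b3 = W7 * f[1] - W5 * f[3] + W3 * f[5] - W1 * f[7];

        out[0] = (a0 + b0) >> 12;  out[7] = (a0 - b0) >> 12;
        out[1] = (a1 + b1) >> 12;  out[6] = (a1 - b1) >> 12;
        out[2] = (a2 + b2) >> 12;  out[5] = (a2 - b2) >> 12;
        out[3] = (a3 + b3) >> 12;  out[4] = (a3 - b3) >> 12;
    }

    int main(void)
    {
        /* DC-only row: the row pass scales by W0/4096 = 0.5, so a DC of
         * 23 spreads as round(11.5) = 12 across all eight outputs. */
        int16_t f[8] = { 23, 0, 0, 0, 0, 0, 0, 0 };
        int32_t out[8];
        idct_row(f, out);
        for (int i = 0; i < 8; i++)
            printf("%d ", (int)out[i]);
        printf("\n");
        return 0;
    }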