1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2009 by Jens Arnold
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
@ Exported IDCT entry points (presumably the libmpeg2 plugin API:
@ idct_copy writes the result into a pixel buffer, idct_add adds it to an
@ existing prediction — confirm signatures against the mpeg2dec headers).
23 .global mpeg2_idct_copy
24 .type mpeg2_idct_copy, %function
25 .global mpeg2_idct_add
26 .type mpeg2_idct_add, %function
28 /* Custom calling convention:
29 * r0 contains block pointer and is non-volatile
30 * all non-volatile C context is saved and restored on its behalf
@ (closing */ of the comment above is on a line not visible in this excerpt)
@ Shared-IDCT prologue: save lr, point r1 at a transposed scratch buffer
@ placed 128 bytes (64 halfwords) past the coefficient block, and set up
@ the 8-row loop counter in r14 (the loop label itself is not visible here).
33 str lr, [sp, #-4]! @ lr is used
34 add r1, r0, #128 @ secondary, transposed temp buffer
35 mov r14, #8 @ loop counter
@ ---- Pass 1: one 8-point 1-D IDCT per row, intermediate precision kept ----
@ ---- by shifting only >>12; results stored transposed via r1 so pass 2 ----
@ ---- walks columns with the same code shape. Input halfwords are packed ----
@ ---- two-per-register in the order f0,f2,f4,f6 (r2,r3) / f1,f3,f5,f7   ----
@ ---- (r10,r11), which the dual-16-bit DSP multiplies exploit directly. ----
@ NOTE(review): the loop label and the branch back to it fall on lines not
@ visible in this excerpt.
38 ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
39 ldrd r4, L_W1357 @ load W1, W3, W5, W7
@ Odd part: b0..b3 from the odd coefficients f1,f3,f5,f7. Negative terms
@ come from the sign convention of the packed constants (W1/W5/W7 stored
@ negated — presumably; confirm against the L_W1357 literal pool, which is
@ outside this excerpt).
41 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
42 smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7
44 smultt r7, r5, r10 @ b1 = -W7 * f3
45 smlabb r7, r4, r11, r7 @ + -W1 * f5
46 smlabt r7, r5, r11, r7 @ + -W5 * f7
48 smlatb r7, r4, r10, r7 @ + W3 * f1
50 smulbt r8, r4, r10 @ b2 = -W1 * f3
52 smlabb r8, r5, r10, r8 @ + W5 * f1
53 smlatb r8, r5, r11, r8 @ + W7 * f5
54 smlatt r8, r4, r11, r8 @ + W3 * f7
56 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
57 smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1
@ Even part: a0..a3 from f0,f2,f4,f6. The f0 += 1 injects the rounding
@ bias for the >>12 at the end of this pass.
59 ldrd r4, L_W0246 @ load W0, W2, W4, W6
60 add r2, r2, #1 @ f0 += 1
62 smulbb r10, r4, r2 @ a0' = W0 * f0
63 smlabb r10, r5, r3, r10 @ + W4 * f4
64 smultt r12, r4, r2 @ a3' = W2 * f2
65 smlatt r12, r5, r3, r12 @ + W6 * f6
66 add r10, r10, r12 @ a0 = a0' + a3'
67 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
69 smulbb r11, r5, r3 @ a1' = -W4 * f4
71 smlabb r11, r4, r2, r11 @ + W0 * f0
72 smultt r3, r4, r3 @ a2' = -W2 * f6
74 smlatt r3, r5, r2, r3 @ + W6 * f2
75 add r11, r11, r3 @ a1 = a1' + a2'
76 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
@ Butterfly: outputs 7..0 are (aN -/+ bN) >> 12, written as halfwords
@ through r1 into the transposed temp buffer. The strh instructions for
@ elements 7..1 fall on lines not visible in this excerpt; only the final
@ store (element 0, which advances r1 to the next temp column) is shown.
78 sub r2, r10, r6 @ block[7] = (a0 - b0)
79 mov r2, r2, asr #12 @ >> 12
81 sub r2, r11, r7 @ block[6] = (a1 - b1)
82 mov r2, r2, asr #12 @ >> 12
84 sub r2, r3, r8 @ block[5] = (a2 - b2)
85 mov r2, r2, asr #12 @ >> 12
87 sub r2, r12, r9 @ block[4] = (a3 - b3)
88 mov r2, r2, asr #12 @ >> 12
90 add r2, r12, r9 @ block[3] = (a3 + b3)
91 mov r2, r2, asr #12 @ >> 12
93 add r2, r3, r8 @ block[2] = (a2 + b2)
94 mov r2, r2, asr #12 @ >> 12
96 add r2, r11, r7 @ block[1] = (a1 + b1)
97 mov r2, r2, asr #12 @ >> 12
99 add r2, r10, r6 @ block[0] = (a0 + b0)
100 mov r2, r2, asr #12 @ >> 12
101 strh r2, [r1], #2 @ advance to next temp column
@ The constant literal pool (L_W1357 / L_W0246) sits between the passes:
107 @placed here because of ldrd's offset limit
@ Switch roles for pass 2: after pass 1, r0 has advanced through the whole
@ 128-byte block and now addresses the temp buffer; r1 (which advanced
@ 16 bytes per row through the temp columns) is rewound to the original
@ input block so pass 2 writes its results back in place.
121 @ r0 now points to the temp buffer, where we need it.
122 sub r1, r1, #128+16 @ point r1 back to the input block
123 mov r14, #8 @ loop counter
@ ---- Pass 2: identical 8-point IDCT structure applied to the transposed ----
@ ---- temp data (i.e. the columns of the original block). Differences    ----
@ ---- from pass 1: rounding bias is +32 on f0 (DC offset 0.5 at this     ----
@ ---- scale) and the final shift is >>17, completing the fixed-point     ----
@ ---- normalization of the 2-D transform. Results go back into the       ----
@ ---- input block via r1.                                                ----
@ NOTE(review): as in pass 1, the loop label, the branch back, and the
@ per-element strh instructions for outputs 7..1 are on lines not visible
@ in this excerpt.
126 ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
127 ldrd r4, L_W1357 @ load W1, W3, W5, W7
@ Odd part (same weight layout as pass 1):
129 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
130 smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7
132 smultt r7, r5, r10 @ b1 = -W7 * f3
133 smlabb r7, r4, r11, r7 @ + -W1 * f5
134 smlabt r7, r5, r11, r7 @ + -W5 * f7
136 smlatb r7, r4, r10, r7 @ + W3 * f1
138 smulbt r8, r4, r10 @ b2 = -W1 * f3
140 smlabb r8, r5, r10, r8 @ + W5 * f1
141 smlatb r8, r5, r11, r8 @ + W7 * f5
142 smlatt r8, r4, r11, r8 @ + W3 * f7
144 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
145 smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1
@ Even part — note the larger rounding bias matching the >>17 below:
147 ldrd r4, L_W0246 @ load W0, W2, W4, W6
148 add r2, r2, #32 @ DC offset: 0.5
150 smulbb r10, r4, r2 @ a0' = W0 * f0
151 smlabb r10, r5, r3, r10 @ + W4 * f4
152 smultt r12, r4, r2 @ a3' = W2 * f2
153 smlatt r12, r5, r3, r12 @ + W6 * f6
154 add r10, r10, r12 @ a0 = a0' + a3'
155 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
157 smulbb r11, r5, r3 @ a1' = -W4 * f4
159 smlabb r11, r4, r2, r11 @ + W0 * f0
160 smultt r3, r4, r3 @ a2' = -W2 * f6
162 smlatt r3, r5, r2, r3 @ + W6 * f2
163 add r11, r11, r3 @ a1 = a1' + a2'
164 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
@ Butterfly with final normalization (>>17), stored back through r1:
166 sub r2, r10, r6 @ block[7] = (a0 - b0)
167 mov r2, r2, asr #17 @ >> 17
169 sub r2, r11, r7 @ block[6] = (a1 - b1)
170 mov r2, r2, asr #17 @ >> 17
172 sub r2, r3, r8 @ block[5] = (a2 - b2)
173 mov r2, r2, asr #17 @ >> 17
175 sub r2, r12, r9 @ block[4] = (a3 - b3)
176 mov r2, r2, asr #17 @ >> 17
178 add r2, r12, r9 @ block[3] = (a3 + b3)
179 mov r2, r2, asr #17 @ >> 17
181 add r2, r3, r8 @ block[2] = (a2 + b2)
182 mov r2, r2, asr #17 @ >> 17
184 add r2, r11, r7 @ block[1] = (a1 + b1)
185 mov r2, r2, asr #17 @ >> 17
187 add r2, r10, r6 @ block[0] = (a0 + b0)
188 mov r2, r2, asr #17 @ >> 17
189 strh r2, [r1], #2 @ advance to next column
@ Epilogue of the shared IDCT core / prologue of mpeg2_idct_copy (the
@ function label and most of the body fall on lines not visible here).
194 sub r0, r0, #256 @ point r0 back to the input block
199 stmfd sp!, {r1-r2, r4-r12, lr}
@ NOTE(review): the pop list omits the r1-r2 slots pushed above — the two
@ extra stacked words are presumably consumed by code on the missing lines
@ (e.g. popped separately into other registers); confirm in the full file.
233 ldmfd sp!, {r4-r12, pc}
@ Fragment of mpeg2_idct_add (label and most of the body not visible):
@ saves the full non-volatile context, then packs pairs of values into
@ 32-bit words (low halfword | high halfword << 16), presumably to store
@ two pixels per word — the loads/stores around these orrs are on missing
@ lines; confirm in the full file.
244 stmfd sp!, {r2-r12, lr}
256 orr r7, r7, r8, lsl #16
263 orr r9, r9, r10, lsl #16
274 orr r7, r7, r8, lsl #16
281 orr r9, r9, r10, lsl #16
@ NOTE(review): pop list omits the r2-r3 slots pushed above — as in
@ idct_copy, the extra stacked words must be consumed on missing lines.
290 ldmfd sp!, {r4-r12, pc}
@ DC-only shortcut: when only block[0] is non-zero, the IDCT of the whole
@ 8x8 block is a constant, so skip the full transform and just add the
@ scaled DC term (r1 >> 7, rounded via the +64 bias) to every destination
@ value. block[0] and block[63] are cleared — r11 presumably holds zero
@ (set on a missing line) and block[63] is presumably cleared because the
@ caller used it as a "block was coded" marker; confirm in the full file.
294 ldrsh r1, [r0, #0] /* r1 = block[0] */
296 strh r11, [r0, #0] /* block[0] = 0 */
297 strh r11, [r0, #126] /* block[63] = 0 */
298 add r1, r1, #64 /* r1 = DC << 7 */
299 add r0, r2, r3, asl #3
@ Per-row add of the DC term to four loaded values (the surrounding loads,
@ stores, loop control, and a second similar group are on missing lines):
305 add r4, r4, r1, asr #7
308 add r5, r5, r1, asr #7
311 add r6, r6, r1, asr #7
314 add r7, r7, r1, asr #7
321 add r4, r4, r1, asr #7
324 add r5, r5, r1, asr #7
327 add r6, r6, r1, asr #7
330 add r7, r7, r1, asr #7