/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 *
 * Copyright (C) 2009 by Jens Arnold
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
/* Exported MPEG-2 IDCT entry points (ARMv6, pre-UAL GAS syntax).
 * Fix: stripped the stray line numbers an extraction had fused onto
 * the front of each directive, which broke assembler syntax. */
        .global mpeg2_idct_copy
        .type   mpeg2_idct_copy, %function
        .global mpeg2_idct_add
        .type   mpeg2_idct_add, %function
/* Custom calling convention:
 * r0 contains block pointer and is non-volatile
 * all non-volatile c context saved and restored on its behalf
 *
 * Row pass of the 8x8 IDCT, using the ARMv6 dual-16 multiply/accumulate
 * instructions (smuad/smlad/smusdx/...).  Each row's eight results are
 * shifted >>12 and written into a transposed temp buffer 128 bytes above
 * the block (r1), so the column pass can then read columns row-wise.
 *
 * NOTE(review): this extract has lost interior lines -- the '.idct:'
 * entry label, the per-coefficient 'strh' stores for block[1]..block[7]
 * (only the final block[0] store is visible), and the loop branch back
 * to the row loop.  Restore them from the upstream file before building.
 * Fix applied here: stripped the stray line numbers fused onto every line.
 */
        str     lr, [sp, #-4]!          @ lr is used
        add     r1, r0, #128            @ secondary, transposed temp buffer
        mov     r14, #8                 @ loop counter

        ldmia   r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
        ldrd    r4, L_W1357             @ load W1, W3, W5, W7

        @ Odd part: b0..b3 from f1, f3, f5, f7
        smuad   r6, r4, r10             @ b0 = W1 * f1 + W3 * f3
        smultt  r7, r5, r10             @ -b1 = W7 * f3
        smulbt  r8, r4, r10             @ -b2 = W1 * f3

        smusdx  r9, r10, r5             @ b3 = f1 * W7 - f3 * W5
        smlabb  r7, r4, r11, r7         @ -b1 += W1 * f5
        rsb     r8, r8, #0              @ b2 = -b2
        smlabb  r8, r5, r10, r8         @ b2 += W5 * f1

        smlad   r6, r5, r11, r6         @ b0 += W5 * f5 + W7 * f7
        smlabt  r7, r5, r11, r7         @ -b1 += W5 * f7
        smlatb  r8, r5, r11, r8         @ b2 += W7 * f5

        smlsdx  r9, r11, r4, r9         @ b3 += f5 * W3 - f7 * W1
        rsb     r7, r7, #0              @ b1 = -b1
        smlatb  r7, r4, r10, r7         @ b1 += W3 * f1
        smlatt  r8, r4, r11, r8         @ b2 += W3 * f7

        ldrd    r4, L_W0246             @ load W0, W2, W4, W6
        add     r2, r2, #1              @ f0 += 1 (rounding bias for >>12)

        @ Even part: a0..a3 from f0, f2, f4, f6
        smulbb  r10, r5, r3             @ a0' = W4 * f4
        smultt  r12, r5, r3             @ a3' = W6 * f6
        smultt  r3, r4, r3              @ -a2' = W2 * f6

        rsb     r11, r10, #0            @ a1' = -W4 * f4
        smlabb  r10, r4, r2, r10        @ a0' += W0 * f0
        smlabb  r11, r4, r2, r11        @ a1' += W0 * f0
        smlatt  r12, r4, r2, r12        @ a3' += W2 * f2
        rsb     r3, r3, #0              @ a2' = -a2'
        smlatt  r3, r5, r2, r3          @ a2' += W6 * f2

        add     r10, r10, r12           @ a0 = a0' + a3'
        sub     r12, r10, r12, lsl #1   @ a3 = a0 - 2 * a3'
        add     r11, r11, r3            @ a1 = a1' + a2'
        sub     r3, r11, r3, lsl #1     @ a2 = a1 - 2 * a2'

        subs    r14, r14, #1            @ decrease loop count

        @ Special store order for making the column pass calculate columns in
        @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages.
        @ NOTE(review): the 'strh' following each shift below is missing from
        @ this extract; as written the intermediate results are discarded.
        sub     r2, r10, r6             @ block[7] = (a0 - b0)
        mov     r2, r2, asr #12         @ >> 12
        sub     r2, r11, r7             @ block[6] = (a1 - b1)
        mov     r2, r2, asr #12         @ >> 12
        sub     r2, r3, r8              @ block[5] = (a2 - b2)
        mov     r2, r2, asr #12         @ >> 12
        sub     r2, r12, r9             @ block[4] = (a3 - b3)
        mov     r2, r2, asr #12         @ >> 12
        add     r2, r12, r9             @ block[3] = (a3 + b3)
        mov     r2, r2, asr #12         @ >> 12
        add     r2, r3, r8              @ block[2] = (a2 + b2)
        mov     r2, r2, asr #12         @ >> 12
        add     r2, r11, r7             @ block[1] = (a1 + b1)
        mov     r2, r2, asr #12         @ >> 12
        add     r2, r10, r6             @ block[0] = (a0 + b0)
        mov     r2, r2, asr #12         @ >> 12
        strh    r2, [r1], #2            @ advance to next temp column

        @ placed here because of ldrd's offset limit
        @ (the L_W1357 / L_W0246 literal pairs live between the two passes
        @  so both ldrd sites stay within the +/-255 byte doubleword range)
124 @ r0 now points to the temp buffer, where we need it.
125 sub r1, r1, #128+16 @ point r1 back to the input block
126 mov r14, #8 @ loop counter
129 ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
130 ldrd r4, L_W1357 @ load W1, W3, W5, W7
132 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
133 smultt r7, r5, r10 @ -b1 = W7 * f3
134 smulbt r8, r4, r10 @ -b2 = W1 * f3
136 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
137 smlabb r7, r4, r11, r7 @ -b1 += W1 * f5
138 rsb r8, r8, #0 @ b2 = -b2
139 smlabb r8, r5, r10, r8 @ b2 += W5 * f1
141 smlad r6, r5, r11, r6 @ b0 += W5 * f5 + W7 * f7
142 smlabt r7, r5, r11, r7 @ -b1 += W5 * f7
143 smlatb r8, r5, r11, r8 @ b2 += W7 * f5
145 smlsdx r9, r11, r4, r9 @ b3 += f5 * W3 - f7 * W1
146 rsb r7, r7, #0 @ b1 = -b1
147 smlatb r7, r4, r10, r7 @ b1 += W3 * f1
148 smlatt r8, r4, r11, r8 @ b2 += W3 * f7
150 ldrd r4, L_W0246 @ load W0, W2, W4, W6
151 add r2, r2, #32 @ DC offset: 0.5
153 smulbb r10, r5, r3 @ a0' = W4 * f4
154 smultt r12, r5, r3 @ a3' = W6 * f6
155 smultt r3, r4, r3 @ -a2' = W2 * f6
157 rsb r11, r10, #0 @ a1' = -W4 * f4
158 smlabb r10, r4, r2, r10 @ a0' += W0 * f0
159 smlabb r11, r4, r2, r11 @ a1' += W0 * f0
160 smlatt r12, r4, r2, r12 @ a3' += W2 * f2
161 rsb r3, r3, #0 @ a2' = -a2'
162 smlatt r3, r5, r2, r3 @ a2' += W6 * f2
164 add r10, r10, r12 @ a0 = a0' + a3'
165 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
166 add r11, r11, r3 @ a1 = a1' + a2'
167 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
169 subs r14, r14, #1 @ decrease loop count
171 sub r2, r10, r6 @ block[7] = (a0 - b0)
172 mov r2, r2, asr #17 @ >> 17
174 sub r2, r11, r7 @ block[6] = (a1 - b1)
175 mov r2, r2, asr #17 @ >> 17
177 sub r2, r3, r8 @ block[5] = (a2 - b2)
178 mov r2, r2, asr #17 @ >> 17
180 sub r2, r12, r9 @ block[4] = (a3 - b3)
181 mov r2, r2, asr #17 @ >> 17
183 add r2, r12, r9 @ block[3] = (a3 + b3)
184 mov r2, r2, asr #17 @ >> 17
186 add r2, r3, r8 @ block[2] = (a2 + b2)
187 mov r2, r2, asr #17 @ >> 17
189 add r2, r11, r7 @ block[1] = (a1 + b1)
190 mov r2, r2, asr #17 @ >> 17
192 add r2, r10, r6 @ block[0] = (a0 + b0)
193 mov r2, r2, asr #17 @ >> 17
194 strh r2, [r1], #2 @ advance to next column
198 sub r0, r0, #256 @ point r0 back to the input block
/* mpeg2_idct_copy fragment: IDCT the block, then pack pairs of result
 * words byte-wise and store 8 pixels at a time to the destination.
 *
 * NOTE(review): this extract is missing the 'mpeg2_idct_copy:' label, the
 * 'bl .idct' call, the 'ldmfd sp!, {r1-r2}' that balances the r1-r2 push
 * below (so the asymmetric final ldmfd is NOT a bug), the usat16
 * saturation steps before the orr packing, the block-zeroing stores, and
 * the loop control.  Restore from upstream before building.
 * Fix applied: stripped the stray fused line numbers.
 */
        stmfd   sp!, {r1-r2, r4-r11, lr}
1:                                      @ idct data is in order 0-2-1-3-4-6-5-7,
        ldmia   r0, {r4-r7}             @ see above
        orr     r4, r4, r5, lsl #8      @ merge into low/high bytes
        orr     r5, r6, r7, lsl #8      @ merge into low/high bytes
        strd    r4, [r1]                @ store r4, r5 (one 8-pixel row)
        ldmfd   sp!, {r4-r11, pc}       @ restore context and return
/* mpeg2_idct_add fragment: IDCT the block, add the results to the
 * destination pixels (uxtab16 adds the 16-bit idct values to
 * zero-extended dest bytes), and clear the coefficient block as it is
 * consumed.  'ldrlod' is pre-UAL syntax for a condition-LO 'ldrd'
 * (condition suffix precedes the size suffix) -- not a typo.
 *
 * NOTE(review): this extract is missing the 'mpeg2_idct_add:' label, the
 * 'bl .idct' call, the zero-initialisation of r10-r12/lr used by the
 * clearing stmia, the usat16 saturation steps, the stack rebalancing for
 * the r2-r3 push, and the loop compare/branch.  Restore from upstream.
 * Fix applied: stripped the stray fused line numbers.
 */
        stmfd   sp!, {r2-r11, lr}
        ldrd    r8, [r1]                @ preload r8, r9 (8 dest pixels)
2:                                      @ idct data is in order 0-2-1-3-4-6-5-7,
        ldmia   r0, {r4-r7}             @ see above
        stmia   r0!, {r10-r12, lr}      @ clear block as we go (r10-r12/lr presumed zero -- TODO confirm)
        uxtab16 r5, r5, r8, ror #8      @ add odd dest bytes to idct halfwords
        orr     r4, r4, r5, lsl #8      @ repack into 4 pixel bytes
        uxtab16 r7, r7, r9, ror #8      @ same for the second word
        orr     r5, r6, r7, lsl #8      @ repack into 4 pixel bytes
        strd    r4, [r1]                @ store r4, r5 (one 8-pixel row)
        ldrlod  r8, [r1]                @ if LO: load next r8, r9
        ldmfd   sp!, {r4-r11, pc}       @ restore context and return
/* DC-only shortcut fragment: when the block reduces to a single DC
 * coefficient the whole 8x8 IDCT output is one constant, so just add
 * that constant to every destination pixel row by row.
 *
 * NOTE(review): this extract is missing the branch/label into this path,
 * the usat16 saturation steps between each uxtab16 and its orr, and the
 * loop compare/branch paired with the conditional 'ldrlod' at the end
 * ('ldrlod' = pre-UAL condition-LO 'ldrd', not a typo).  Restore from
 * upstream.  Fix applied: stripped the stray fused line numbers.
 */
        ldrsh   r4, [r0, #0]            @ r4 = block[0]
        strh    r12, [r0, #0]           @ block[0] = 0 (r12 presumed zero -- TODO confirm)
        strh    r12, [r0, #126]         @ block[63] = 0
        mov     r4, r4, asr #7          @ r4 = DC
        mov     r4, r4, lsl #16         @ spread to 2 halfwords
        orr     r4, r4, r4, lsr #16
        ldrd    r0, [r2]                @ load r0, r1 (8 dest pixels)
        add     r12, r2, r3, asl #3     @ r12 = dest end (dest + 8 * stride)
        uxtab16 lr, r4, r0, ror #8      @ add DC to odd dest bytes
        orr     r0, r0, lr, lsl #8      @ repack into 4 pixel bytes
        uxtab16 lr, r4, r1, ror #8      @ add DC to odd dest bytes
        orr     r1, r1, lr, lsl #8      @ repack into 4 pixel bytes
        strd    r0, [r2]                @ store r0, r1 (one 8-pixel row)
        ldrlod  r0, [r2]                @ if LO: load next r0, r1