1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2008 by Andree Buschmann
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
24 #if defined(CPU_S5L870X)
25 /* Moving this asm routine to iram is (little) faster on S5L870x. */
26 .section .icode, "ax", %progbits
/* NOTE(review): two .section directives appear back to back here; upstream
 * presumably has an #else between them (iram vs. plain .text) — confirm
 * against the complete file, this excerpt looks truncated. */
28 .section .text, "ax", %progbits
31 /****************************************************************************
32 * void mpc_decoder_windowing_D(...)
34 * 2nd step within synthesis filter. Does the dewindowing.
36 * Uses un-shifted D[]-values. D[] will always be the second operand of
37 * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
38 ****************************************************************************/
40 .global mpc_decoder_windowing_D
41 .type mpc_decoder_windowing_D, %function
43 mpc_decoder_windowing_D:
48 /************************************************************************
49 * Reference implementation.
50 ***********************************************************************/
/* Register roles (from visible usage): r0 = output Data pointer (stored to,
 * post-incremented), r1 = V[] sample buffer (loaded at #n*4 offsets),
 * r2 = D[] window coefficients (streamed via ldmia) — TODO confirm against
 * the C prototype. r3-r6 hold four D[] coefficients per ldmia; r7 is the
 * V[] scratch load; r8/r12 carry the 64-bit accumulator halves. */
51 stmfd sp!, {r4-r8, lr}
55 ldmia r2!, { r3-r6 } /* load D[00..03] */
58 ldr r7, [r1, #96*4] /* 1 */
60 ldr r7, [r1, #128*4] /* 2 */
62 ldr r7, [r1, #224*4] /* 3 */
64 ldmia r2!, { r3-r6 } /* load D[04..07] */
65 ldr r7, [r1, #256*4] /* 4 */
67 ldr r7, [r1, #352*4] /* 5 */
69 ldr r7, [r1, #384*4] /* 6 */
71 ldr r7, [r1, #480*4] /* 7 */
73 ldmia r2!, { r3-r6 } /* load D[08..11] */
74 ldr r7, [r1, #512*4] /* 8 */
76 ldr r7, [r1, #608*4] /* 9 */
78 ldr r7, [r1, #640*4] /* 10 */
80 ldr r7, [r1, #736*4] /* 11 */
82 ldmia r2!, { r3-r6 } /* load D[12..15] */
83 ldr r7, [r1, #768*4] /* 12 */
85 ldr r7, [r1, #864*4] /* 13 */
87 ldr r7, [r1, #896*4] /* 14 */
89 ldr r7, [r1, #992*4] /* 15 */
/* NOTE(review): the smull/smlal that should consume each r7 load above, the
 * "mov r8, r8, lsr #16" before the orr below, the first smull that starts
 * the accumulator, and the loop label/counter/branch plus the function
 * epilogue (ldmfd) are all absent from this excerpt — reconcile with the
 * complete upstream file before assembling. */
92 orr r8, r8, r12, lsl #16 /* (lo>>16) || (hi<<16) */
93 str r8, [r0], #4 /* store Data */
94 add r1, r1, #4 /* V++ */
100 #elif defined(CPU_ARM7TDMI) /* arm7 only */
101 mpc_decoder_windowing_D:
106 /************************************************************************
107 * Further speed up through making use of symmetries within D[]-window.
108 * The row V[00] can be extracted as it has symmetries within this single
109 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
110 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
111 * saved at the cost of 15 x 4 + 1 add's.
112 * The row V[16] can be extracted as it has symmetries within this single
113 * row. 8 smull/mlal and 8 ldr's can be saved.
114 * Used for arm7 only. For arm9 and above see implementation below.
115 ***********************************************************************/
/* Register roles (from visible usage): r0 = Data out, r1 = V[] (forward),
 * r12 = mirrored V[] pointer (walks backward for the symmetric rows),
 * r2 = D[] coefficients, r3-r6 = four coefficients, r7/r10 = V[] loads,
 * r8:r9 = 64-bit accumulator for rows 01..15, r10:r11 for rows 31..17. */
116 stmfd sp!, {r4-r11, lr}
118 /******************************************
119 * row 0 with internal symmetry
120 *****************************************/
121 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
122 ldmia r2!, { r3-r6 } /* load D[01..04] */
123 ldr r7 , [r1, #96*4] /* 1 */
124 ldr r10, [r1, #992*4] /* 15 */
125 rsb r10, r10, r7 /* V[01] - V[15] */
126 smull r8, r9, r10, r3
127 ldr r7 , [r1, #128*4] /* 2 */
128 ldr r10, [r1, #896*4] /* 14 */
129 add r10, r10, r7 /* V[02] + V[14] */
130 smlal r8, r9, r10, r4
131 ldr r7 , [r1, #224*4] /* 3 */
132 ldr r10, [r1, #864*4] /* 13 */
133 rsb r10, r10, r7 /* V[03] - V[13] */
134 smlal r8, r9, r10, r5
135 ldr r7 , [r1, #256*4] /* 4 */
136 ldr r10, [r1, #768*4] /* 12 */
137 add r10, r10, r7 /* V[04] + V[12] */
138 smlal r8, r9, r10, r6
139 ldmia r2!, { r3-r6 } /* load D[05..08] */
140 ldr r7 , [r1, #352*4] /* 5 */
141 ldr r10, [r1, #736*4] /* 11 */
142 rsb r10, r10, r7 /* V[05] - V[11] */
143 smlal r8, r9, r10, r3
144 ldr r7 , [r1, #384*4] /* 6 */
145 ldr r10, [r1, #640*4] /* 10 */
146 add r10, r10, r7 /* V[06] + V[10] */
147 smlal r8, r9, r10, r4
148 ldr r7 , [r1, #480*4] /* 7 */
149 ldr r10, [r1, #608*4] /* 9 */
150 rsb r10, r10, r7 /* V[07] - V[09] */
151 smlal r8, r9, r10, r5
152 ldr r10, [r1, #512*4] /* 8 */
153 smlal r8, r9, r10, r6
/* NOTE(review): a "mov r8, r8, lsr #16" preceding this orr appears to have
 * been elided from the excerpt (cf. the same pattern at the row-16 tail). */
155 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
156 str r8, [r0], #4 /* store Data */
157 add r1, r1, #4 /* V+=1, r1 = V[01] */
158 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
160 /******************************************
161 * rows 01..15 are symmetric to rows 31..17
162 * r8 = lo, r9 = hi of 01..15
164 * r10 = lo, r11 = hi of 31..17
166 *****************************************/
/* NOTE(review): the loop counter initialisation, the "mov lr, #…" that makes
 * the [r0, lr] store below land on Data[31..17], and the loop label should
 * precede this point — absent from this excerpt, verify upstream. */
168 add r12, r1, #30*4 /* r12 = V[31] */
170 ldmia r2!, { r3-r6 } /* load D[00..03] */
171 ldr r7, [r12, #768*4] /* 12 */
172 smull r10, r11, r7, r6
173 ldr r7, [r12, #864*4] /* 13 */
174 smlal r10, r11, r7, r5
175 ldr r7, [r12, #896*4] /* 14 */
176 smlal r10, r11, r7, r4
177 ldr r7, [r12, #992*4] /* 15 */
178 smlal r10, r11, r7, r3
/* NOTE(review): the smull/smlal partners of the next eight r7 loads into the
 * r8:r9 accumulator are missing from this excerpt. */
181 ldr r7, [r1, #96*4] /* 1 */
183 ldr r7, [r1, #128*4] /* 2 */
185 ldr r7, [r1, #224*4] /* 3 */
187 ldmia r2!, { r3-r6 } /* load D[04..07] */
188 ldr r7, [r1, #256*4] /* 4 */
190 ldr r7, [r1, #352*4] /* 5 */
192 ldr r7, [r1, #384*4] /* 6 */
194 ldr r7, [r1, #480*4] /* 7 */
196 ldr r7, [r12, #512*4] /* 8 */
197 smlal r10, r11, r7, r6
198 ldr r7, [r12, #608*4] /* 9 */
199 smlal r10, r11, r7, r5
200 ldr r7, [r12, #640*4] /* 10 */
201 smlal r10, r11, r7, r4
202 ldr r7, [r12, #736*4] /* 11 */
203 smlal r10, r11, r7, r3
204 ldmia r2!, { r3-r6 } /* load D[08..11] */
205 ldr r7, [r12, #256*4] /* 4 */
206 smlal r10, r11, r7, r6
207 ldr r7, [r12, #352*4] /* 5 */
208 smlal r10, r11, r7, r5
209 ldr r7, [r12, #384*4] /* 6 */
210 smlal r10, r11, r7, r4
211 ldr r7, [r12, #480*4] /* 7 */
212 smlal r10, r11, r7, r3
213 ldr r7, [r1, #512*4] /* 8 */
215 ldr r7, [r1, #608*4] /* 9 */
217 ldr r7, [r1, #640*4] /* 10 */
219 ldr r7, [r1, #736*4] /* 11 */
221 ldmia r2!, { r3-r6 } /* load D[12..15] */
222 ldr r7, [r1, #768*4] /* 12 */
224 ldr r7, [r1, #864*4] /* 13 */
226 ldr r7, [r1, #896*4] /* 14 */
228 ldr r7, [r1, #992*4] /* 15 */
230 ldr r7, [r12] /* 0 */
231 smlal r10, r11, r7, r6
232 ldr r7, [r12, #96*4] /* 1 */
233 smlal r10, r11, r7, r5
234 ldr r7, [r12, #128*4] /* 2 */
235 smlal r10, r11, r7, r4
236 ldr r7, [r12, #224*4] /* 3 */
237 smlal r10, r11, r7, r3
238 /* store Data[01..15] */
/* NOTE(review): "mov r8, r8, lsr #16" before this orr is elided here. */
240 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
241 /* store Data[31..17] */
242 mov r10, r10, lsr #16
243 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
244 rsb r10, r10, #0 /* r10 = -r10 */
/* Mirrored output: Data[31..17] sits lr bytes above the forward pointer —
 * presumes lr was loaded with the mirror offset earlier (not visible here). */
245 str r10, [r0, lr] /* store Data */
246 str r8, [r0], #4 /* store Data */
247 /* correct addresses for next loop */
248 sub r12, r12, #4 /* r12 = V-- */
249 add r1, r1, #4 /* r1 = V++ */
/* NOTE(review): loop-counter decrement and conditional branch back to the
 * row loop belong here — absent from this excerpt. */
254 /******************************************
255 * V[16] with internal symmetry
256 *****************************************/
257 ldmia r2!, { r3-r6 } /* load D[00..03] */
258 ldr r7 , [r1] /* 0 */
259 ldr r10, [r1, #992*4] /* 15 */
260 rsb r10, r10, r7 /* V[00] - V[15] */
261 smull r8, r9, r10, r3
262 ldr r7 , [r1, #96*4] /* 1 */
263 ldr r10, [r1, #896*4] /* 14 */
264 rsb r10, r10, r7 /* V[01] - V[14] */
265 smlal r8, r9, r10, r4
266 ldr r7 , [r1, #128*4] /* 2 */
267 ldr r10, [r1, #864*4] /* 13 */
268 rsb r10, r10, r7 /* V[02] - V[13] */
269 smlal r8, r9, r10, r5
270 ldr r7 , [r1, #224*4] /* 3 */
271 ldr r10, [r1, #768*4] /* 12 */
272 rsb r10, r10, r7 /* V[03] - V[12] */
273 smlal r8, r9, r10, r6
274 ldmia r2!, { r3-r6 } /* load D[04..07] */
275 ldr r7 , [r1, #256*4] /* 4 */
276 ldr r10, [r1, #736*4] /* 11 */
277 rsb r10, r10, r7 /* V[04] - V[11] */
278 smlal r8, r9, r10, r3
279 ldr r7 , [r1, #352*4] /* 5 */
280 ldr r10, [r1, #640*4] /* 10 */
281 rsb r10, r10, r7 /* V[05] - V[10] */
282 smlal r8, r9, r10, r4
283 ldr r7 , [r1, #384*4] /* 6 */
284 ldr r10, [r1, #608*4] /* 9 */
285 rsb r10, r10, r7 /* V[06] - V[09] */
286 smlal r8, r9, r10, r5
287 ldr r7 , [r1, #480*4] /* 7 */
288 ldr r10, [r1, #512*4] /* 8 */
289 rsb r10, r10, r7 /* V[07] - V[08] */
290 smlal r8, r9, r10, r6
/* NOTE(review): "mov r8, r8, lsr #16" and the final ldmfd epilogue/return
 * appear to be elided from this excerpt. */
292 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
293 str r8, [r0], #4 /* store Data */
296 #elif ARM_ARCH < 6 /* arm9 and above */
297 mpc_decoder_windowing_D:
302 /************************************************************************
303 * Further speed up through making use of symmetries within D[]-window.
304 * The row V[00] can be extracted as it has symmetries within this single
305 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
306 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
307 * saved at the cost of 15 x 4 + 1 add's.
308 * The row V[16] can be extracted as it has symmetries within this single
309 * row. 8 smull/mlal and 8 ldr's can be saved.
310 * On arm9 (still armv4 architecture) reducing stalls after ldr/ldm speeds
311 * up decoding even though several ldm-calls are replaced with ldr to free
313 ***********************************************************************/
/* Same algorithm as the arm7 variant above, but loads are paired/staggered
 * (r7/r10 vs r11/r12 alternate) to hide arm9 load-use latency. */
314 stmfd sp!, {r4-r11, lr}
316 /******************************************
317 * row 0 with internal symmetry
318 *****************************************/
319 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
320 ldmia r2!, { r3-r6 } /* load D[01..04] */
321 ldr r7 , [r1, #96*4] /* 1 */
322 ldr r10, [r1, #992*4] /* 15 */
323 ldr r11, [r1, #128*4] /* 2 */
324 ldr r12, [r1, #896*4] /* 14 */
325 rsb r10, r10, r7 /* V[01] - V[15] */
326 smull r8, r9, r10, r3
327 ldr r7 , [r1, #224*4] /* 3 */
328 ldr r10, [r1, #864*4] /* 13 */
329 add r12, r12, r11 /* V[02] + V[14] */
330 smlal r8, r9, r12, r4
331 ldr r11, [r1, #256*4] /* 4 */
332 ldr r12, [r1, #768*4] /* 12 */
333 rsb r10, r10, r7 /* V[03] - V[13] */
334 smlal r8, r9, r10, r5
335 ldr r7 , [r1, #352*4] /* 5 */
336 ldr r10, [r1, #736*4] /* 11 */
337 add r12, r12, r11 /* V[04] + V[12] */
338 smlal r8, r9, r12, r6
339 ldmia r2!, { r3-r6 } /* load D[05..08] */
340 ldr r11, [r1, #384*4] /* 6 */
341 ldr r12, [r1, #640*4] /* 10 */
342 rsb r10, r10, r7 /* V[05] - V[11] */
343 smlal r8, r9, r10, r3
344 ldr r7 , [r1, #480*4] /* 7 */
345 ldr r10, [r1, #608*4] /* 9 */
346 add r12, r12, r11 /* V[06] + V[10] */
347 smlal r8, r9, r12, r4
348 ldr r11, [r1, #512*4] /* 8 */
349 rsb r10, r10, r7 /* V[07] - V[09] */
350 smlal r8, r9, r10, r5
351 smlal r8, r9, r11, r6
/* NOTE(review): "mov r8, r8, lsr #16" before this orr appears elided. */
353 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
354 str r8, [r0], #4 /* store Data */
355 add r1, r1, #4 /* V+=1, r1 = V[01] */
356 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
358 /******************************************
359 * rows 01..15 are symmetric to rows 31..17
360 * r8 = lo, r9 = hi of 01..15
362 * r10 = lo, r11 = hi of 31..17
364 *****************************************/
/* NOTE(review): loop counter setup, the "mov lr, #…" mirror-store offset
 * and the loop label are not visible in this excerpt. */
366 add r12, r1, #30*4 /* r12 = V[31] */
368 ldmia r2!, { r3-r4 } /* load D[00..01] */
369 ldr r7, [r12, #896*4] /* 14 */
370 ldr r5, [r12, #992*4] /* 15 */
371 smull r10, r11, r7, r4
373 smlal r10, r11, r5, r3
374 ldr r5, [r1, #96*4] /* 1 */
/* NOTE(review): several smull/smlal partners for the r8:r9 accumulator
 * (rows 01..15) between here and the stores appear elided. */
376 ldr r7, [r12, #768*4] /* 12 */
378 ldmia r2!, { r3-r4 } /* load D[02..03] */
379 ldr r5, [r12, #864*4] /* 13 */
380 smlal r10, r11, r7, r4
381 ldr r7, [r1, #128*4] /* 2 */
382 smlal r10, r11, r5, r3
383 ldr r5, [r1, #224*4] /* 3 */
385 ldr r7, [r1, #256*4] /* 4 */
387 ldmia r2!, { r3-r4 } /* load D[04..04] */
388 ldr r5, [r1, #352*4] /* 5 */
390 ldr r7, [r12, #640*4] /* 10 */
392 ldr r5, [r12, #736*4] /* 11 */
393 smlal r10, r11, r7, r4
394 ldr r7, [r1, #384*4] /* 6 */
395 smlal r10, r11, r5, r3
396 ldmia r2!, { r3-r4 } /* load D[06..07] */
397 ldr r5, [r1, #480*4] /* 7 */
399 ldr r7, [r12, #512*4] /* 8 */
401 ldr r5, [r12, #608*4] /* 9 */
402 smlal r10, r11, r7, r4
403 ldr r7, [r12, #384*4] /* 6 */
404 smlal r10, r11, r5, r3
405 ldmia r2!, { r3-r4 } /* load D[08..09] */
406 ldr r5, [r12, #480*4] /* 7 */
407 smlal r10, r11, r7, r4
408 ldr r7, [r1, #512*4] /* 8 */
409 smlal r10, r11, r5, r3
410 ldr r5, [r1, #608*4] /* 9 */
412 ldr r7, [r1, #640*4] /* 10 */
414 ldmia r2!, { r3-r4 } /* load D[10..11] */
415 ldr r5, [r1, #736*4] /* 11 */
417 ldr r7, [r12, #256*4] /* 4 */
419 ldr r5, [r12, #352*4] /* 5 */
420 smlal r10, r11, r7, r4
421 ldr r7, [r1, #768*4] /* 12 */
422 smlal r10, r11, r5, r3
423 ldmia r2!, { r3-r4 } /* load D[12..13] */
424 ldr r5, [r1, #864*4] /* 13 */
426 ldr r7, [r12, #128*4] /* 2 */
428 ldr r5, [r12, #224*4] /* 3 */
429 smlal r10, r11, r7, r4
430 ldr r7, [r12] /* 0 */
431 smlal r10, r11, r5, r3
432 ldmia r2!, { r3-r4 } /* load D[14..15] */
433 ldr r5, [r12, #96*4] /* 1 */
434 smlal r10, r11, r7, r4
435 ldr r7, [r1, #896*4] /* 14 */
436 smlal r10, r11, r5, r3
437 ldr r5, [r1, #992*4] /* 15 */
440 /* store Data[01..15] */
/* NOTE(review): final smlal pair for r8:r9 and its lsr #16 appear elided. */
442 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
443 /* store Data[31..17] */
444 mov r10, r10, lsr #16
445 orr r10, r10, r11, lsl #16 /* (lo>>16) || (hi<<16) */
446 rsb r10, r10, #0 /* r10 = -r10 */
/* Mirrored store: presumes lr holds the Data[31..17] byte offset — set-up
 * not visible in this excerpt, confirm upstream. */
447 str r10, [r0, lr] /* store Data */
448 str r8, [r0], #4 /* store Data */
449 /* correct addresses for next loop */
450 sub r12, r12, #4 /* r12 = V-- */
451 add r1, r1, #4 /* r1 = V++ */
/* NOTE(review): loop branch back is absent from this excerpt. */
456 /******************************************
457 * V[16] with internal symmetry
458 *****************************************/
459 ldmia r2!, { r3-r6 } /* load D[00..03] */
460 ldr r7 , [r1] /* 0 */
461 ldr r10, [r1, #992*4] /* 15 */
462 ldr r11, [r1, #96*4] /* 1 */
463 ldr r12, [r1, #896*4] /* 14 */
464 rsb r10, r10, r7 /* V[00] - V[15] */
465 smull r8, r9, r10, r3
466 ldr r7 , [r1, #128*4] /* 2 */
467 ldr r10, [r1, #864*4] /* 13 */
468 rsb r12, r12, r11 /* V[01] - V[14] */
469 smlal r8, r9, r12, r4
470 ldr r11, [r1, #224*4] /* 3 */
471 ldr r12, [r1, #768*4] /* 12 */
472 rsb r10, r10, r7 /* V[02] - V[13] */
473 smlal r8, r9, r10, r5
474 ldr r7 , [r1, #256*4] /* 4 */
475 ldr r10, [r1, #736*4] /* 11 */
476 rsb r12, r12, r11 /* V[03] - V[12] */
477 smlal r8, r9, r12, r6
478 ldmia r2!, { r3-r6 } /* load D[04..07] */
479 ldr r11, [r1, #352*4] /* 5 */
480 ldr r12, [r1, #640*4] /* 10 */
481 rsb r10, r10, r7 /* V[04] - V[11] */
482 smlal r8, r9, r10, r3
483 ldr r7 , [r1, #384*4] /* 6 */
484 ldr r10, [r1, #608*4] /* 9 */
485 rsb r12, r12, r11 /* V[05] - V[10] */
486 smlal r8, r9, r12, r4
487 ldr r11, [r1, #480*4] /* 7 */
488 ldr r12, [r1, #512*4] /* 8 */
489 rsb r10, r10, r7 /* V[06] - V[09] */
490 smlal r8, r9, r10, r5
491 rsb r12, r12, r11 /* V[07] - V[08] */
492 smlal r8, r9, r12, r6
/* NOTE(review): "mov r8, r8, lsr #16" and the ldmfd epilogue appear elided. */
494 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
495 str r8, [r0], #4 /* store Data */
/* NOTE(review): the "#else" (armv6 and above) that should precede this third
 * definition of the symbol is absent from this excerpt, as is the closing
 * "#endif" before the .size trailer — reconcile with the full file. */
499 mpc_decoder_windowing_D:
504 /************************************************************************
505 * Further speed up through making use of symmetries within D[]-window.
506 * The row V[00] can be extracted as it has symmetries within this single
507 * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
508 * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
509 * saved at the cost of 15 x 4 + 1 add's.
510 * The row V[16] can be extracted as it has symmetries within this single
511 * row. 8 smull/mlal and 8 ldr's can be saved.
512 * On armv6 use smmulr/smlalr which are faster than smull/smlal and only
513 * accumulate the top 32 bits of the result so that frees up 2
514 * registers so we can ldm larger blocks.
515 ***********************************************************************/
/* armv6 variant: smmlar (32x32 -> top-32 multiply-accumulate with rounding)
 * keeps each accumulator in a single register: r9 = rows 01..15, r11 =
 * rows 31..17, so no lo/hi recombination (lsr/orr) is needed on store. */
516 stmfd sp!, {r4-r11, lr}
518 /******************************************
519 * row 0 with internal symmetry
520 *****************************************/
521 add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
522 ldmia r2!, { r3-r6 } /* load D[01..04] */
523 ldr r7 , [r1, #96*4] /* 1 */
524 ldr r10, [r1, #992*4] /* 15 */
525 ldr r11, [r1, #128*4] /* 2 */
526 rsb r10, r10, r7 /* V[01] - V[15] */
527 ldr r12, [r1, #896*4] /* 14 */
/* NOTE(review): the initial smmulr that seeds r9 appears elided here. */
529 ldr r7 , [r1, #224*4] /* 3 */
530 add r12, r12, r11 /* V[02] + V[14] */
531 ldr r10, [r1, #864*4] /* 13 */
532 smmlar r9, r12, r4, r9
533 ldr r11, [r1, #256*4] /* 4 */
534 rsb r10, r10, r7 /* V[03] - V[13] */
535 ldr r12, [r1, #768*4] /* 12 */
536 smmlar r9, r10, r5, r9
537 ldr r7 , [r1, #352*4] /* 5 */
538 add r12, r12, r11 /* V[04] + V[12] */
539 ldr r10, [r1, #736*4] /* 11 */
540 smmlar r9, r12, r6, r9
541 ldmia r2!, { r3-r6 } /* load D[05..08] */
542 ldr r11, [r1, #384*4] /* 6 */
543 rsb r10, r10, r7 /* V[05] - V[11] */
544 ldr r12, [r1, #640*4] /* 10 */
545 smmlar r9, r10, r3, r9
546 ldr r7 , [r1, #480*4] /* 7 */
547 add r12, r12, r11 /* V[06] + V[10] */
548 ldr r10, [r1, #608*4] /* 9 */
549 smmlar r9, r12, r4, r9
550 rsb r10, r10, r7 /* V[07] - V[09] */
551 ldr r11, [r1, #512*4] /* 8 */
552 smmlar r9, r10, r5, r9
553 add r1, r1, #4 /* V+=1, r1 = V[01] */
554 smmlar r9, r11, r6, r9
555 add r2, r2, #7*4 /* D+=7, r2 = D[16] */
557 str r9, [r0], #4 /* store Data */
559 /******************************************
560 * rows 01..15 are symmetric to rows 31..17
563 * r11 = acc of 31..17
565 *****************************************/
/* NOTE(review): loop counter/label and the "mov lr, #…" for the mirrored
 * [r0, lr] store are not visible in this excerpt. */
567 add r12, r1, #30*4 /* r12 = V[31] */
569 ldmia r2!, { r3-r6 } /* load D[00..03] */
570 ldr r7, [r12, #896*4] /* 14 */
571 ldr r8, [r12, #992*4] /* 15 */
/* NOTE(review): the smmulr seeding r11 (and one seeding r9) appear elided
 * between the loads above and the first smmlar below. */
574 smmlar r11, r8, r3, r11
575 ldr r8, [r1, #96*4] /* 1 */
577 ldr r7, [r12, #768*4] /* 12 */
578 smmlar r9, r8, r4, r9
579 ldr r8, [r12, #864*4] /* 13 */
580 smmlar r11, r7, r6, r11
581 ldr r7, [r1, #128*4] /* 2 */
582 smmlar r11, r8, r5, r11
583 ldr r8, [r1, #224*4] /* 3 */
584 smmlar r9, r7, r5, r9
585 ldr r7, [r1, #256*4] /* 4 */
586 smmlar r9, r8, r6, r9
587 ldmia r2!, { r3-r6 } /* load D[04..07] */
588 ldr r8, [r1, #352*4] /* 5 */
589 smmlar r9, r7, r3, r9
590 ldr r7, [r12, #640*4] /* 10 */
591 smmlar r9, r8, r4, r9
592 ldr r8, [r12, #736*4] /* 11 */
593 smmlar r11, r7, r4, r11
594 ldr r7, [r1, #384*4] /* 6 */
595 smmlar r11, r8, r3, r11
596 ldr r8, [r1, #480*4] /* 7 */
597 smmlar r9, r7, r5, r9
598 ldr r7, [r12, #512*4] /* 8 */
599 smmlar r9, r8, r6, r9
600 ldr r8, [r12, #608*4] /* 9 */
601 smmlar r11, r7, r6, r11
602 ldr r7, [r12, #384*4] /* 6 */
603 smmlar r11, r8, r5, r11
604 ldmia r2!, { r3-r6 } /* load D[08..11] */
605 ldr r8, [r12, #480*4] /* 7 */
606 smmlar r11, r7, r4, r11
607 ldr r7, [r1, #512*4] /* 8 */
608 smmlar r11, r8, r3, r11
609 ldr r8, [r1, #608*4] /* 9 */
610 smmlar r9, r7, r3, r9
611 ldr r7, [r1, #640*4] /* 10 */
612 smmlar r9, r8, r4, r9
613 ldr r8, [r1, #736*4] /* 11 */
614 smmlar r9, r7, r5, r9
615 ldr r7, [r12, #256*4] /* 4 */
616 smmlar r9, r8, r6, r9
617 ldr r8, [r12, #352*4] /* 5 */
618 smmlar r11, r7, r6, r11
619 ldr r7, [r1, #768*4] /* 12 */
620 smmlar r11, r8, r5, r11
621 ldmia r2!, { r3-r6 } /* load D[12..15] */
622 ldr r8, [r1, #864*4] /* 13 */
623 smmlar r9, r7, r3, r9
624 ldr r7, [r12, #128*4] /* 2 */
625 smmlar r9, r8, r4, r9
626 ldr r8, [r12, #224*4] /* 3 */
627 smmlar r11, r7, r4, r11
628 ldr r7, [r12] /* 0 */
629 smmlar r11, r8, r3, r11
630 ldr r8, [r12, #96*4] /* 1 */
631 smmlar r11, r7, r6, r11
632 ldr r7, [r1, #896*4] /* 14 */
633 smmlar r11, r8, r5, r11
634 ldr r8, [r1, #992*4] /* 15 */
635 smmlar r9, r7, r5, r9
636 sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */
637 smmlar r9, r8, r6, r9
638 add r1, r1, #4 /* r1 = V++ correct addresses for next loop */
639 rsb r11, r11, #0 /* r11 = -r11 */
640 /* store Data[01..15] */
642 /* store Data[31..17] */
/* Mirrored store: presumes lr holds the Data[31..17] byte offset — set-up
 * not visible in this excerpt. */
644 str r11, [r0, lr] /* store Data */
645 str r9, [r0], #4 /* store Data */
/* NOTE(review): loop branch back is absent from this excerpt. */
650 /******************************************
651 * V[16] with internal symmetry
652 *****************************************/
653 ldmia r2!, { r3-r6 } /* load D[00..03] */
654 ldr r7 , [r1] /* 0 */
655 ldr r10, [r1, #992*4] /* 15 */
656 ldr r11, [r1, #96*4] /* 1 */
657 rsb r10, r10, r7 /* V[00] - V[15] */
658 ldr r12, [r1, #896*4] /* 14 */
/* NOTE(review): the smmulr seeding r9 with (V[00]-V[15])*D[00] appears
 * elided here. */
660 ldr r7 , [r1, #128*4] /* 2 */
661 rsb r12, r12, r11 /* V[01] - V[14] */
662 ldr r10, [r1, #864*4] /* 13 */
663 smmlar r9, r12, r4, r9
664 ldr r11, [r1, #224*4] /* 3 */
665 rsb r10, r10, r7 /* V[02] - V[13] */
666 ldr r12, [r1, #768*4] /* 12 */
667 smmlar r9, r10, r5, r9
668 ldr r7 , [r1, #256*4] /* 4 */
669 rsb r12, r12, r11 /* V[03] - V[12] */
670 ldr r10, [r1, #736*4] /* 11 */
671 smmlar r9, r12, r6, r9
672 ldmia r2!, { r3-r6 } /* load D[04..07] */
673 ldr r11, [r1, #352*4] /* 5 */
674 rsb r10, r10, r7 /* V[04] - V[11] */
675 ldr r12, [r1, #640*4] /* 10 */
676 smmlar r9, r10, r3, r9
677 ldr r7 , [r1, #384*4] /* 6 */
678 rsb r12, r12, r11 /* V[05] - V[10] */
679 ldr r10, [r1, #608*4] /* 9 */
680 smmlar r9, r12, r4, r9
681 ldr r11, [r1, #480*4] /* 7 */
682 rsb r10, r10, r7 /* V[06] - V[09] */
683 ldr r12, [r1, #512*4] /* 8 */
684 smmlar r9, r10, r5, r9
685 rsb r12, r12, r11 /* V[07] - V[08] */
686 smmlar r9, r12, r6, r9
/* NOTE(review): ldmfd epilogue/return appears elided before the store. */
688 str r9, [r0], #4 /* store Data */
692 .mpc_dewindowing_end:
693 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D