/***************************************************************************
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *
 * Copyright (C) 2009 Andrew Mahone
 *
 * ASM versions of the C IDCT algorithms used by the JPEG decoder.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
#include "apps/core_asmdefs.h"

.type jpeg_idct1h, %function
.type jpeg_idct2v, %function
.type jpeg_idct2h, %function
.type jpeg_idct4v, %function
.type jpeg_idct4h, %function
.type jpeg_idct8v, %function
.type jpeg_idct8h, %function
/* In the common case of one pass through the loop, the extra add should be
   cheaper than saving registers to stack and loading the value 4112. */
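/* For reference, a sketch in C of what this path computes. The constant
   4112 folds the rounding term for the final asr #5 together with the
   +128 level shift, since 4112 == 16 + (128 << 5):
       out = clamp_u8((d0 + 4112) >> 5);
   which equals clamp_u8(((d0 + 16) >> 5) + 128) exactly, because
   128 << 5 is a multiple of the shift divisor. */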
    mvnhi   r12, r12, asr #31
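/* The mvnhi above is the usual branchless clamp on cores without usat: a
   preceding unsigned compare against 255 sets "hi" when the value is
   either above 255 or negative. Roughly, in C:
       if ((unsigned)v > 255)
           v = ~(v >> 31);  // all-ones if v was too big, 0 if negative
   and the following strb keeps only the low byte, i.e. 255 or 0. */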
    usat    r12, #8, r12, asr #5
.size jpeg_idct1h, .-jpeg_idct1h
/* Use SWAR tricks to fake partitioned add and subtract. This is slightly
   faster than loading two values in each register and using shifts and strh,
   and requires fewer fixup operations than splitting the values,
   calculating, and re-merging the results. */
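/* One standard SWAR formulation of a partitioned 16-bit add, as a C
   sketch (the code here uses a variant of the same idea):
       #define H 0x80008000u
       uint32_t swar_add16(uint32_t x, uint32_t y)
       {
           return ((x & ~H) + (y & ~H)) ^ ((x ^ y) & H);
       }
   Masking the per-lane sign bits keeps a carry out of the low lane from
   spilling into the high lane; the final xor restores the sign bits. */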
/* ARMv6 offers partitioned adds and subtracts, used here to unroll the
   loop. */
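/* sadd16/ssub16 treat each operand as two independent signed halfwords;
   in C terms:
       lo  = (int16_t)a + (int16_t)b;
       hi  = (int16_t)(a >> 16) + (int16_t)(b >> 16);
       res = (uint16_t)lo | ((uint32_t)(uint16_t)hi << 16);
   so a single instruction performs the butterfly on two columns at once. */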
.size jpeg_idct2v, .-jpeg_idct2v

/* Using LDR and shifts here would cost two more ops, and is no faster, as
   results cannot be stored merged. */
    stmdb   sp!, { r4-r5, lr }
    mvnhi   r5, r5, asr #31
    mvnhi   r4, r4, asr #31
    strb    r4, [r1, #pix8_size]
    ldmia   sp!, { r4-r5, pc }

    stmdb   sp!, { r4, lr }
    saddsubx r12, r12, r12
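/* saddsubx (sasx in UAL) adds the halfwords crosswise: hi(rd) becomes
   hi(rn) + lo(rm) and lo(rd) becomes lo(rn) - hi(rm). With the same
   register as both operands this yields the 2-point IDCT butterfly,
   o0 = d0 + d1 and o1 = d0 - d1, in a single instruction. */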
    usat    r4, #8, r12, asr #21
    usat    r12, #8, r12, asr #5
    strb    r12, [r1, #pix8_size]
    ldmia   sp!, { r4, pc }
.size jpeg_idct2h, .-jpeg_idct2h

    stmdb   sp!, { r4-r7, lr }
    add     r6, r2, r4              /* r6 = tmp10 >> 2 = d0 + d2 */
    sub     r2, r2, r4              /* r2 = tmp12 >> 2 = d0 - d2 */
    add     r4, r3, r5              /* r4 = z1 = d1 + d3 */
    add     r7, r4, r4, lsl #3      /* r7 = z1 * 9 */
    rsb     r4, r4, r7, lsl #4      /* r4 = z1 * 143 */
    rsb     r4, r4, r4, lsl #5      /* z1 *= 4433 */
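/* The three instructions above synthesise the multiply by 4433 from
   shifts and adds, using 4433 == (9 * 16 - 1) * 31; in C:
       t  = z1 * 9;        // z1 + (z1 << 3)
       z1 = t * 16 - z1;   // (t << 4) - z1, giving z1 * 143
       z1 = z1 * 32 - z1;  // (z1 << 5) - z1, giving z1 * 4433 */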
    mla     r3, r12, r3, r4         /* r3 = tmp2 = z1 + z2 * 6270 */
    mla     r5, r14, r5, r4         /* r5 = tmp0 = z1 - z3 * 15137 */
    mov     r6, r6, lsl #2          /* r6 <<= 2 */
    mov     r2, r2, lsl #2          /* r2 <<= 2 */
    add     r7, r6, r3, asr #11     /* r7 = o0 */
    sub     r3, r6, r3, asr #11     /* r3 = o3 */
    add     r6, r2, r5, asr #11     /* r6 = o1 */
    sub     r2, r2, r5, asr #11     /* r2 = o2 */
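/* Taken together this is the 4-point scaled IDCT; a sketch in C, with
   names matching the comments above:
       tmp10 = (d0 + d2) << 2;      tmp12 = (d0 - d2) << 2;
       z1    = (d1 + d3) * 4433;
       tmp2  = z1 + d1 * 6270;      tmp0  = z1 - d3 * 15137;
       o0 = tmp10 + (tmp2 >> 11);   o3 = tmp10 - (tmp2 >> 11);
       o1 = tmp12 + (tmp0 >> 11);   o2 = tmp12 - (tmp0 >> 11); */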
    ldmia   sp!, { r4-r7, pc }

    stmdb   sp!, { r4-r8, lr }
    add     r6, r3, r14             /* r6 = z1 = d1 + d3 */
    add     r7, r2, r12             /* r7 = tmp10 >> 2 = d0 + d2 */
    smlabb  r6, r5, r6, r8          /* z1 *= 4433 */
    sub     r2, r2, r12             /* r2 = tmp12 >> 2 = d0 - d2 */
    smlatb  r3, r5, r3, r6          /* r3 = tmp2 = z1 + z2 * 6270 */
    smlabb  r14, r4, r14, r6        /* r14 = tmp0 = z1 - z3 * 15137 */
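/* The smla<x><y> forms pick 16-bit halves of the two multiplicands:
   smlabb is lo*lo, smlatb hi*lo, smlabt lo*hi, smlatt hi*hi, each
   accumulating into the last operand. E.g. the smlatb above is, in C:
       r3 = (int16_t)(r5 >> 16) * (int16_t)r3 + r6;
   which is what allows two constants to be packed into one register. */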
    add     r12, r7, r3, asr #11    /* r12 = o0 */
    sub     r7, r7, r3, asr #11     /* r7 = o3 */
    add     r3, r2, r14, asr #11    /* r3 = o1 */
    sub     r2, r2, r14, asr #11    /* r2 = o2 */
    ldmia   sp!, { r4-r8, pc }

    stmdb   sp!, { r4-r10, lr }
    /* this part is being done in parallel on two columns */
    sadd16  r8, r4, r6              /* r8 = d0 + d2 */
    ssub16  r4, r4, r6              /* r4 = d0 - d2 */
    sadd16  r6, r5, r7              /* r6 = d1 + d3 */
    /* there is no parallel shift operation, but we can fake it with bic to
       clear the bits that a later full-register shift would otherwise carry
       across the halfword boundary */
    /* multiplication expands values beyond 16 bits, so this part needs to be
       split. The values will be merged below so that the rest of the addition
       can be done in parallel */
    smlabb  r9, r3, r6, r12         /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
    smlabt  r6, r3, r6, r12         /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
    smlatb  r10, r3, r5, r9         /* r10 = tmp2[0] = z1 + d1 * 6270 */
    smlabb  r14, r2, r7, r9         /* r14 = tmp0[0] = z1 - d3 * 15137 */
    smlatt  r5, r3, r5, r6          /* r5 = tmp2[1] */
    smlabt  r6, r2, r7, r6          /* r6 = tmp0[1] */
    mov     r8, r8, lsl #2          /* complete the parallel shift started */
    mov     r4, r4, lsl #2          /* with the earlier bic instructions */
    /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
    /* tmp10, tmp12 are in r8, r4 */
    mov     r10, r10, asr #11
    mov     r14, r14, asr #11
    pkhbt   r5, r10, r5, lsl #5     /* parallel tmp2 */
    pkhbt   r6, r14, r6, lsl #5     /* parallel tmp0 */
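/* pkhbt rd, rn, rm, lsl #5 packs lo(rn) into the low half of rd and bits
   [31:16] of (rm << 5) into the high half. Because (rm << 5) >> 16 is
   rm >> 11, each pkhbt above performs the >> 11 descale of the second
   lane and the repack into halfwords in a single instruction. */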
    sadd16  r10, r8, r5             /* o0 */
    ssub16  r5, r8, r5              /* o3 */
    sadd16  r14, r4, r6             /* o1 */
    ssub16  r6, r4, r6              /* o2 */
    ldmia   sp!, { r4-r10, pc }
.size jpeg_idct4v, .-jpeg_idct4v
    stmdb   sp!, { r4-r10, lr }
    add     r8, r4, r6              /* r8 = tmp10 >> 2 = d0 + d2 */
    sub     r4, r4, r6              /* r4 = tmp12 >> 2 = d0 - d2 */
    add     r6, r5, r7              /* r6 = z1 = d1 + d3 */
    add     r9, r6, r6, lsl #3      /* r9 = z1 * 9 */
    rsb     r6, r6, r9, lsl #4      /* r6 = z1 * 143 */
    rsb     r6, r6, r6, lsl #5      /* z1 *= 4433 */
    mla     r7, r10, r7, r6         /* r7 = tmp0 = z1 - z3 * 15137 */
    mla     r5, r12, r5, r6         /* r5 = tmp2 = z1 + z2 * 6270 */
    add     r9, r5, r8, lsl #13     /* r9 = o0 */
    rsb     r5, r5, r8, lsl #13     /* r5 = o3 */
    add     r8, r7, r4, lsl #13     /* r8 = o1 */
    rsb     r4, r7, r4, lsl #13     /* r4 = o2 */
    mvnhi   r9, r9, asr #31
    mvnhi   r8, r8, asr #31
    mvnhi   r4, r4, asr #31
    mvnhi   r5, r5, asr #31
    strb    r8, [r1, #pix8_size]
    strb    r4, [r1, #2*pix8_size]
    strb    r5, [r1, #3*pix8_size]
    ldmia   sp!, { r4-r10, pc }

    stmdb   sp!, { r4-r9, lr }
    add     r8, r14, r7             /* r8 = z1 = d1 + d3 */
    add     r12, r12, r4, lsr #16
    smulbb  r8, r5, r8              /* z1 *= 4433 */
    add     r9, r12, r6             /* r9 = tmp10 >> 13 = d0 + d2 */
    smlatb  r14, r5, r14, r8        /* r14 = tmp2 = z1 + z2 * 6270 */
    smlabb  r7, r4, r7, r8          /* r7 = tmp0 = z1 - z3 * 15137 */
    sub     r12, r12, r6            /* r12 = tmp12 >> 13 = d0 - d2 */
    add     r6, r14, r9, lsl #13    /* r6 = o0 */
    rsb     r9, r14, r9, lsl #13    /* r9 = o3 */
    add     r14, r7, r12, lsl #13   /* r14 = o1 */
    rsb     r12, r7, r12, lsl #13   /* r12 = o2 */
    mov     r14, r14, asr #18
    mov     r12, r12, asr #18
    mvnhi   r6, r6, asr #31
    mvnhi   r14, r14, asr #31
    mvnhi   r12, r12, asr #31
    mvnhi   r9, r9, asr #31
    strb    r14, [r1, #pix8_size]
    strb    r12, [r1, #2*pix8_size]
    strb    r9, [r1, #3*pix8_size]
    ldmia   sp!, { r4-r9, pc }
    stmdb   sp!, { r4-r9, lr }
    ldmia   r0, { r12, r14 }
    sadd16  r6, r12, r14            /* r6lo = d0 + d2, r6hi = d1 + d3 */
    ssub16  r7, r12, r14            /* r7lo = d0 - d2 */
    smlatt  r12, r5, r12, r8        /* r12 = tmp2 = z1 + z2 * 6270 */
    smlabt  r14, r4, r14, r8        /* r14 = tmp0 = z1 - z3 * 15137 */
    add     r8, r12, r6, lsl #13    /* r8 = o0 */
    rsb     r6, r12, r6, lsl #13    /* r6 = o3 */
    add     r12, r14, r7, lsl #13   /* r12 = o1 */
    rsb     r14, r14, r7, lsl #13   /* r14 = o2 */
    usat    r8, #8, r8, asr #18
    usat    r6, #8, r6, asr #18
    usat    r12, #8, r12, asr #18
    usat    r14, #8, r14, asr #18
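/* usat rd, #8, rm, asr #18 shifts and saturates in one step; in C:
       rd = clamp(rm >> 18, 0, 255);
   replacing the cmp/mvnhi clamp sequences needed on pre-v6 cores. */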
    strb    r6, [r1, #3*pix8_size]
    strb    r12, [r1, #pix8_size]
    strb    r14, [r1, #2*pix8_size]
    ldmia   sp!, { r4-r9, pc }
.size jpeg_idct4h, .-jpeg_idct4h

    stmdb   sp!, { r4-r11, lr }
    orreqs  r9, r5, r4, lsr #16
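/* The orreqs above is part of a chain that ORs the remaining coefficient
   words together under the eq condition, so the flags stay "eq" only if
   every AC coefficient is zero. In that case only the (scaled, biased) DC
   value is computed and written to all outputs, skipping the full pass. */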
    ldmia   sp!, { r4-r11, pc }
    mov     r10, r10, asr #16       /* r10 = z2 = d2 */
    mov     r11, r11, asr #16       /* r11 = z3 = d6 */
    mov     r8, r8, asr #3          /* r8 = z4 = (d0 << 13) + 1024 */
    mul     r9, r14, r9             /* r9 = z1 = (z2 + z3) * 4433 */
    mla     r11, r12, r11, r9       /* r11 = tmp2 = z1 - z3 * 15137 */
    mla     r10, r14, r10, r9       /* r10 = tmp3 = z1 + z2 * 6270 */
    mov     r9, r6, lsl #16         /* r9 = z5 << 3 = d4 << 16 */
    add     r12, r8, r9, asr #3     /* r12 = tmp0 = z4 + z5 */
    sub     r14, r8, r9, asr #3     /* r14 = tmp1 = z4 - z5 */
    add     r8, r12, r10            /* r8 = tmp10 = tmp0 + tmp3 */
    sub     r9, r12, r10            /* r9 = tmp13 = tmp0 - tmp3 */
    add     r10, r14, r11           /* r10 = tmp11 = tmp1 + tmp2 */
    sub     r11, r14, r11           /* r11 = tmp12 = tmp1 - tmp2 */
    stmdb   sp, { r8-r11 }          /* tmp10 tmp13 tmp11 tmp12 */
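/* stmdb sp without writeback stashes the even-part results just below the
   stack pointer; they are reloaded later with ldmdb sp. This avoids
   adjusting sp, at the cost of assuming the area below sp stays untouched
   in between. A sketch of the even half computed above, in C:
       tmp0  = z4 + z5;       tmp1  = z4 - z5;
       tmp10 = tmp0 + tmp3;   tmp13 = tmp0 - tmp3;
       tmp11 = tmp1 + tmp2;   tmp12 = tmp1 - tmp2; */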
    mov     r4, r4, asr #16         /* r4 = tmp3 = d1 */
    mov     r5, r5, asr #16         /* r5 = tmp2 = d3 */
    mov     r6, r6, asr #16         /* r6 = tmp1 = d5 */
    mov     r7, r7, asr #16         /* r7 = tmp0 = d7 */
    add     r12, r5, r7             /* r12 = z3 = tmp0 + tmp2 */
    add     r14, r4, r6             /* r14 = z4 = tmp1 + tmp3 */
    add     r9, r12, r14            /* r9 = z3 + z4 */
    mul     r9, r10, r9             /* r9 = z5 = (z3 + z4) * 9633 */
    mla     r12, r11, r12, r9       /* r12 = z3 = z5 - z3 * 16069 */
    mla     r14, r10, r14, r9       /* r14 = z4 = z5 - z4 * 3196 */
    add     r9, r4, r7              /* r9 = tmp0 + tmp3 */
    mla     r8, r11, r9, r12        /* r8 = z1 + z3 */
    mla     r9, r11, r9, r14        /* r9 = z1 + z4 */
    mla     r7, r10, r7, r8         /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
    mla     r4, r11, r4, r9         /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
    add     r9, r5, r6              /* r9 = tmp1 + tmp2 */
    mla     r12, r10, r9, r12       /* r12 = z2 + z3 */
    mla     r14, r10, r9, r14       /* r14 = z2 + z4 */
    mla     r5, r11, r5, r12        /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
    mla     r6, r10, r6, r14        /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
    add     r12, r8, r4             /* o0 */
    sub     r14, r8, r4             /* o7 */
    add     r8, r9, r7              /* o3 */
    sub     r9, r9, r7              /* o4 */
    add     r4, r10, r5             /* o1 */
    sub     r5, r10, r5             /* o6 */
    add     r10, r11, r6            /* o2 */
    sub     r11, r11, r6            /* o5 */
    /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
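/* The final butterfly above, as a C sketch: the even-part sums saved
   earlier combine with the odd-part results tmp0..tmp3:
       o0 = tmp10 + tmp3;   o7 = tmp10 - tmp3;
       o1 = tmp11 + tmp2;   o6 = tmp11 - tmp2;
       o2 = tmp12 + tmp1;   o5 = tmp12 - tmp1;
       o3 = tmp13 + tmp0;   o4 = tmp13 - tmp0; */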
    mov     r12, r12, asr #11
    mov     r10, r10, asr #11
    mov     r11, r11, asr #11
    mov     r14, r14, asr #11
    orreqs  r9, r5, r4, lsr #16
    mov     r12, r12, asr #14
    ldmia   sp!, { r4-r11, pc }
    add     r10, r5, r7             /* r10[15:0] = d2 + d6 */
    sub     r14, r12, r6, lsl #16   /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
    smulbb  r10, r8, r10            /* r10 = z1 = (d2 + d6) * 4433 */
    add     r12, r12, r6, lsl #16   /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
    smlatb  r11, r8, r7, r10        /* r11 = tmp2 = z1 - d6 * 15137 */
    smlabb  r10, r9, r5, r10        /* r10 = tmp3 = z1 + d2 * 6270 */
    add     r8, r11, r14, asr #3    /* r8 = tmp11 */
    rsb     r11, r11, r14, asr #3   /* r11 = tmp12 */
    add     r14, r10, r12, asr #3   /* r14 = tmp10 */
    rsb     r12, r10, r12, asr #3   /* r12 = tmp13 */
    stmdb   sp, { r8, r11, r12, r14 } /* tmp11 tmp12 tmp13 tmp10 */
    mov     r6, r6, asr #16         /* r6 = tmp1 = d5 */
    mov     r7, r7, asr #16         /* r7 = tmp0 = d7 */
    add     r12, r6, r4, asr #16    /* r12 = z4 = tmp1 + tmp3 */
    add     r14, r7, r5, asr #16    /* r14 = z3 = tmp0 + tmp2 */
    add     r8, r12, r14            /* r8 = z3 + z4 */
    smultb  r8, r9, r8              /* r8 = z5 = (z3 + z4) * 9633 */
    add     r9, r7, r4, asr #16     /* r9 = z1 = tmp0 + tmp3 */
    smlabb  r14, r10, r14, r8       /* r14 = z3 = z5 - z3 * 16069 */
    smlatb  r12, r10, r12, r8       /* r12 = z4 = z5 - z4 * 3196 */
    smlabb  r8, r11, r9, r14        /* r8 = z3 - z1 * 7373 */
    smlabb  r9, r11, r9, r12        /* r9 = z4 - z1 * 7373 */
    add     r10, r6, r5, asr #16    /* r10 = z2 = tmp1 + tmp2 */
    smlatb  r12, r11, r10, r12      /* r12 = z4 - z2 * 20995 */
    smlatb  r14, r11, r10, r14      /* r14 = z3 - z2 * 20995 */
    smlabb  r7, r10, r7, r8         /* r7 = tmp0 */
    smlatt  r4, r10, r4, r9         /* r4 = tmp3 */
    smlabb  r6, r11, r6, r12        /* r6 = tmp1 */
    smlatt  r5, r11, r5, r14        /* r5 = tmp2 */
    ldmdb   sp, { r8-r11 }          /* tmp11 tmp12 tmp13 tmp10 */
    add     r12, r8, r5             /* o1 */
    sub     r14, r8, r5             /* o6 */
    add     r8, r9, r6              /* o2 */
    sub     r9, r9, r6              /* o5 */
    add     r6, r10, r7             /* o3 */
    sub     r7, r10, r7             /* o4 */
    add     r10, r11, r4            /* o0 */
    sub     r11, r11, r4            /* o7 */
    mov     r12, r12, asr #11
    mov     r14, r14, asr #11
    mov     r10, r10, asr #11
    mov     r11, r11, asr #11
    ldmia   sp!, { r4-r11, pc }
.size jpeg_idct8v, .-jpeg_idct8v
    stmdb   sp!, { r4-r11, lr }
    add     r8, r14, r4, lsl #16
    orreqs  r9, r5, r4, lsr #16
    mvnhi   r8, r8, asr #31
    strb    r8, [r1, #pix8_size]
    strb    r8, [r1, #2*pix8_size]
    strb    r8, [r1, #3*pix8_size]
    strb    r8, [r1, #4*pix8_size]
    strb    r8, [r1, #5*pix8_size]
    strb    r8, [r1, #6*pix8_size]
    strb    r8, [r1, #7*pix8_size]
    ldmia   sp!, { r4-r11, pc }
    mov     r10, r10, asr #16       /* r10 = z2 = d2 */
    mov     r11, r11, asr #16       /* r11 = z3 = d6 */
    mov     r8, r8, asr #3          /* r8 = z4 = (d0 + 4112) << 13 */
    mul     r9, r14, r9             /* r9 = z1 = (z2 + z3) * 4433 */
    mla     r11, r12, r11, r9       /* r11 = tmp2 = z1 - z3 * 15137 */
    mla     r10, r14, r10, r9       /* r10 = tmp3 = z1 + z2 * 6270 */
    mov     r9, r6, lsl #16         /* r9 = z5 << 3 = d4 << 16 */
    add     r12, r8, r9, asr #3     /* r12 = tmp0 = z4 + z5 */
    sub     r14, r8, r9, asr #3     /* r14 = tmp1 = z4 - z5 */
    add     r8, r12, r10            /* r8 = tmp10 = tmp0 + tmp3 */
    sub     r9, r12, r10            /* r9 = tmp13 = tmp0 - tmp3 */
    add     r10, r14, r11           /* r10 = tmp11 = tmp1 + tmp2 */
    sub     r11, r14, r11           /* r11 = tmp12 = tmp1 - tmp2 */
    stmdb   sp, { r8-r11 }          /* tmp10 tmp13 tmp11 tmp12 */
    mov     r4, r4, asr #16         /* r4 = tmp3 = d1 */
    mov     r5, r5, asr #16         /* r5 = tmp2 = d3 */
    mov     r6, r6, asr #16         /* r6 = tmp1 = d5 */
    mov     r7, r7, asr #16         /* r7 = tmp0 = d7 */
    add     r12, r5, r7             /* r12 = z3 = tmp0 + tmp2 */
    add     r14, r4, r6             /* r14 = z4 = tmp1 + tmp3 */
    add     r9, r12, r14            /* r9 = z3 + z4 */
    mul     r9, r10, r9             /* r9 = z5 = (z3 + z4) * 9633 */
    mla     r12, r11, r12, r9       /* r12 = z3 = z5 - z3 * 16069 */
    mla     r14, r10, r14, r9       /* r14 = z4 = z5 - z4 * 3196 */
    add     r9, r4, r7              /* r9 = tmp0 + tmp3 */
    mla     r8, r11, r9, r12        /* r8 = z1 + z3 */
    mla     r9, r11, r9, r14        /* r9 = z1 + z4 */
    mla     r7, r10, r7, r8         /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
    mla     r4, r11, r4, r9         /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
    add     r9, r5, r6              /* r9 = tmp1 + tmp2 */
    mla     r12, r10, r9, r12       /* r12 = z2 + z3 */
    mla     r14, r10, r9, r14       /* r14 = z2 + z4 */
    mla     r5, r11, r5, r12        /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
    mla     r6, r10, r6, r14        /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
    add     r12, r8, r4             /* o0 */
    sub     r14, r8, r4             /* o7 */
    add     r8, r9, r7              /* o3 */
    sub     r9, r9, r7              /* o4 */
    add     r4, r10, r5             /* o1 */
    sub     r5, r10, r5             /* o6 */
    add     r10, r11, r6            /* o2 */
    sub     r11, r11, r6            /* o5 */
    /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
    mov     r12, r12, asr #18
    mvnhi   r12, r12, asr #31
    mvnhi   r4, r4, asr #31
    mov     r10, r10, asr #18
    mvnhi   r10, r10, asr #31
    mvnhi   r8, r8, asr #31
    mvnhi   r9, r9, asr #31
    mov     r11, r11, asr #18
    mvnhi   r11, r11, asr #31
    mvnhi   r5, r5, asr #31
    mov     r14, r14, asr #18
    mvnhi   r14, r14, asr #31
    strb    r4, [r1, #pix8_size]
    strb    r10, [r1, #2*pix8_size]
    strb    r8, [r1, #3*pix8_size]
    strb    r9, [r1, #4*pix8_size]
    strb    r11, [r1, #5*pix8_size]
    strb    r5, [r1, #6*pix8_size]
    strb    r14, [r1, #7*pix8_size]
    add     r12, r14, r4, lsl #16
    orreqs  r9, r5, r4, lsr #16
    mov     r12, r12, asr #21
    mvnhi   r12, r12, asr #31
    strb    r12, [r1, #pix8_size]
    strb    r12, [r1, #2*pix8_size]
    strb    r12, [r1, #3*pix8_size]
    strb    r12, [r1, #4*pix8_size]
    strb    r12, [r1, #5*pix8_size]
    strb    r12, [r1, #6*pix8_size]
    strb    r12, [r1, #7*pix8_size]
    ldmia   sp!, { r4-r11, pc }
    add     r10, r5, r7             /* r10[15:0] = d2 + d6 */
    sub     r14, r12, r6, lsl #16   /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
    smulbb  r10, r8, r10            /* r10 = z1 = (d2 + d6) * 4433 */
    add     r12, r12, r6, lsl #16   /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
    smlatb  r11, r8, r7, r10        /* r11 = tmp2 = z1 - d6 * 15137 */
    smlabb  r10, r9, r5, r10        /* r10 = tmp3 = z1 + d2 * 6270 */
    add     r8, r11, r14, asr #3    /* r8 = tmp11 */
    rsb     r11, r11, r14, asr #3   /* r11 = tmp12 */
    add     r14, r10, r12, asr #3   /* r14 = tmp10 */
    rsb     r12, r10, r12, asr #3   /* r12 = tmp13 */
    stmdb   sp, { r8, r11, r12, r14 } /* tmp11 tmp12 tmp13 tmp10 */
    mov     r6, r6, asr #16         /* r6 = tmp1 = d5 */
    mov     r7, r7, asr #16         /* r7 = tmp0 = d7 */
    add     r12, r6, r4, asr #16    /* r12 = z4 = tmp1 + tmp3 */
    add     r14, r7, r5, asr #16    /* r14 = z3 = tmp0 + tmp2 */
    add     r8, r12, r14            /* r8 = z3 + z4 */
    smultb  r8, r9, r8              /* r8 = z5 = (z3 + z4) * 9633 */
    add     r9, r7, r4, asr #16     /* r9 = z1 = tmp0 + tmp3 */
    smlabb  r14, r10, r14, r8       /* r14 = z3 = z5 - z3 * 16069 */
    smlatb  r12, r10, r12, r8       /* r12 = z4 = z5 - z4 * 3196 */
    smlabb  r8, r11, r9, r14        /* r8 = z3 - z1 * 7373 */
    smlabb  r9, r11, r9, r12        /* r9 = z4 - z1 * 7373 */
    add     r10, r6, r5, asr #16    /* r10 = z2 = tmp1 + tmp2 */
    smlatb  r12, r11, r10, r12      /* r12 = z4 - z2 * 20995 */
    smlatb  r14, r11, r10, r14      /* r14 = z3 - z2 * 20995 */
    smlabb  r7, r10, r7, r8         /* r7 = tmp0 */
    smlatt  r4, r10, r4, r9         /* r4 = tmp3 */
    smlabb  r6, r11, r6, r12        /* r6 = tmp1 */
    smlatt  r5, r11, r5, r14        /* r5 = tmp2 */
    ldmdb   sp, { r8-r11 }          /* tmp11 tmp12 tmp13 tmp10 */
    add     r12, r8, r5             /* o1 */
    sub     r14, r8, r5             /* o6 */
    add     r8, r9, r6              /* o2 */
    sub     r9, r9, r6              /* o5 */
    add     r6, r10, r7             /* o3 */
    sub     r7, r10, r7             /* o4 */
    add     r10, r11, r4            /* o0 */
    sub     r11, r11, r4            /* o7 */
    /* output in order: r10 r12 r8 r6 r7 r9 r14 r11 */
    mov     r10, r10, asr #18
    mvnhi   r10, r10, asr #31
    mov     r12, r12, asr #18
    mvnhi   r12, r12, asr #31
    mvnhi   r8, r8, asr #31
    mvnhi   r6, r6, asr #31
    mvnhi   r7, r7, asr #31
    mvnhi   r9, r9, asr #31
    mov     r14, r14, asr #18
    mvnhi   r14, r14, asr #31
    mov     r11, r11, asr #18
    mvnhi   r11, r11, asr #31
    strb    r12, [r1, #pix8_size]
    strb    r8, [r1, #2*pix8_size]
    strb    r6, [r1, #3*pix8_size]
    strb    r7, [r1, #4*pix8_size]
    strb    r9, [r1, #5*pix8_size]
    strb    r14, [r1, #6*pix8_size]
    strb    r11, [r1, #7*pix8_size]
    ldmia   sp!, { r4-r11, pc }
.size jpeg_idct8h, .-jpeg_idct8h
    stmdb   sp!, { r4-r11, lr }
    orreqs  r9, r5, r4, lsr #16
    ldmia   sp!, { r4-r11, pc }

    add     r10, r5, r7             /* r10 = d2 + d6 */
    add     r3, r12, r6, lsl #16    /* tmp0 */
    sub     r12, r12, r6, lsl #16   /* tmp1 */
    pkhtb   r4, r5, r4, asr #16     /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
    smulbb  r14, r8, r10            /* r14 = z1[e] = (d2 + d6) * 4433 */
    pkhtb   r6, r6, r7, asr #16     /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
    smlatb  r7, r8, r7, r14         /* r7 = tmp2[e] = z1 - d6 * 15137 */
    smlabb  r5, r9, r5, r14         /* r5 = tmp3[e] = z1 + d2 * 6270 */
    pkhtb   r9, r9, r9, asr #16     /* r9 = (9633, 9633) */
    add     r10, r5, r3, asr #3     /* r10 = tmp10 */
    rsb     r11, r5, r3, asr #3     /* r11 = tmp13 */
    rsb     r14, r7, r12, asr #3    /* r14 = tmp12 */
    add     r12, r7, r12, asr #3    /* r12 = tmp11 */
    sadd16  r8, r3, r6              /* z3, z4 */
    stmdb   sp, { r10-r12, r14 }    /* tmp10 tmp13 tmp11 tmp12 */
    smuad   r5, r9, r8              /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
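/* smuad is a dual 16-bit multiply-add: rd = lo(rn)*lo(rm) + hi(rn)*hi(rm).
   With both halves of r9 holding 9633 (set up by the pkhtb above), this
   evaluates (z3 + z4) * 9633 in a single instruction. */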
    sadd16  r7, r4, r6              /* r7 = (z1, z2) */
    smlatt  r9, r10, r8, r5         /* r9 = z4 = z5 - z4 * 16069 */
    smlabb  r8, r10, r8, r5         /* r8 = z3 = z5 - z3 * 3196 */
    smlabb  r14, r11, r7, r9        /* r14 = z1 + z4 */
    smlabb  r12, r11, r7, r8        /* r12 = z1 + z3 */
    smlatt  r5, r11, r7, r9         /* r5 = z2 + z4 */
    smlatt  r7, r11, r7, r8         /* r7 = z2 + z3 */
    smlabt  r7, r9, r4, r7          /* r7 = tmp2 */
    smlatb  r14, r9, r4, r14        /* r14 = tmp3 */
    ldmdb   sp, { r4, r9-r11 }      /* tmp10 tmp13 tmp11 tmp12 */
    smlabb  r12, r8, r6, r12        /* r12 = tmp0 */
    smlatt  r5, r8, r6, r5          /* r5 = tmp1 */
    /* used: r4, r5, r7, r9-r12, r14 */
    add     r6, r4, r14             /* o0 */
    sub     r8, r4, r14             /* o7 */
    add     r14, r9, r12            /* o3 */
    sub     r12, r9, r12            /* o4 */
    add     r4, r10, r7             /* o1 */
    sub     r7, r10, r7             /* o6 */
    add     r9, r11, r5             /* o2 */
    sub     r10, r11, r5            /* o5 */
    mov     r14, r14, asr #11
    mov     r12, r12, asr #11
    mov     r10, r10, asr #11
    ldmia   sp!, { r4-r11, pc }
.size jpeg_idct8v, .-jpeg_idct8v
    stmdb   sp!, { r4-r11, lr }
    orreqs  r9, r5, r4, lsr #16
    usat    r4, #8, r4, asr #5
    strb    r4, [r1, #pix8_size]
    strb    r4, [r1, #2*pix8_size]
    strb    r4, [r1, #3*pix8_size]
    strb    r4, [r1, #4*pix8_size]
    strb    r4, [r1, #5*pix8_size]
    strb    r4, [r1, #6*pix8_size]
    strb    r4, [r1, #7*pix8_size]
    ldmia   sp!, { r4-r11, pc }
    sadd16  r10, r5, r7             /* r10 = (d2 + d6, d3 + d7) */
    ssub16  r12, r4, r6             /* r12 = (d0 - d4, d1 - d5) */
    sadd16  r11, r4, r6             /* r11 = (d0 + d4, d1 + d5) */
    pkhtb   r4, r5, r4, asr #16     /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
    smulbb  r14, r8, r10            /* r14 = z1[e] = (d2 + d6) * 4433 */
    pkhtb   r6, r6, r7, asr #16     /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
    smlatb  r7, r8, r7, r14         /* r7 = tmp2[e] = z1 - d6 * 15137 */
    smlabb  r5, r9, r5, r14         /* r5 = tmp3[e] = z1 + d2 * 6270 */
    sxth    r12, r12                /* r12 = tmp1[e] = d0 - d4 */
    pkhtb   r8, r11, r10, asr #16   /* r8 = (z3[o], z4[o]) */
    sxth    r14, r11                /* r14 = tmp0[e] */
    pkhtb   r9, r9, r9, asr #16     /* r9 = (9633, 9633) */
    add     r10, r5, r14, lsl #13   /* r10 = tmp10 */
    rsb     r11, r5, r14, lsl #13   /* r11 = tmp13 */
    rsb     r14, r7, r12, lsl #13   /* r14 = tmp12 */
    add     r12, r7, r12, lsl #13   /* r12 = tmp11 */
    stmdb   sp, { r10-r12, r14 }    /* tmp10 tmp13 tmp11 tmp12 */
    smuad   r5, r9, r8              /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
    sadd16  r7, r4, r6              /* r7 = (z1, z2) */
    smlatt  r9, r10, r8, r5         /* r9 = z4 = z5 - z4 * 16069 */
    smlabb  r8, r10, r8, r5         /* r8 = z3 = z5 - z3 * 3196 */
    smlabb  r14, r11, r7, r9        /* r14 = z1 + z4 */
    smlabb  r12, r11, r7, r8        /* r12 = z1 + z3 */
    smlatt  r5, r11, r7, r9         /* r5 = z2 + z4 */
    smlatt  r7, r11, r7, r8         /* r7 = z2 + z3 */
    smlabt  r7, r9, r4, r7          /* r7 = tmp2 */
    smlatb  r14, r9, r4, r14        /* r14 = tmp3 */
    ldmdb   sp, { r4, r9-r11 }      /* tmp10 tmp13 tmp11 tmp12 */
    smlabb  r12, r8, r6, r12        /* r12 = tmp0 */
    smlatt  r5, r8, r6, r5          /* r5 = tmp1 */
    /* used: r4, r5, r7, r9-r12, r14 */
    add     r6, r4, r14             /* o0 */
    sub     r8, r4, r14             /* o7 */
    add     r14, r9, r12            /* o3 */
    sub     r12, r9, r12            /* o4 */
    add     r4, r10, r7             /* o1 */
    sub     r7, r10, r7             /* o6 */
    add     r9, r11, r5             /* o2 */
    sub     r10, r11, r5            /* o5 */
    usat    r6, #8, r6, asr #18
    usat    r4, #8, r4, asr #18
    usat    r9, #8, r9, asr #18
    usat    r14, #8, r14, asr #18
    usat    r12, #8, r12, asr #18
    usat    r10, #8, r10, asr #18
    usat    r7, #8, r7, asr #18
    usat    r8, #8, r8, asr #18
    strb    r4, [r1, #pix8_size]
    strb    r9, [r1, #2*pix8_size]
    strb    r14, [r1, #3*pix8_size]
    strb    r12, [r1, #4*pix8_size]
    strb    r10, [r1, #5*pix8_size]
    strb    r7, [r1, #6*pix8_size]
    strb    r8, [r1, #7*pix8_size]
    ldmia   sp!, { r4-r11, pc }
.size jpeg_idct8h, .-jpeg_idct8h