libavcodec/arm/simple_idct_arm.S

   1 /*
   2  * simple_idct_arm.S
   3  * Copyright (C) 2002 Frederic 'dilb' Boulay
   4  *
   5  * Author: Frederic Boulay <dilb@handhelds.org>
   6  *
   7  * The function defined in this file is derived from the simple_idct function
   8  * from the libavcodec library part of the FFmpeg project.
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "asm.S"
  28
  29 /* Multiplier constants W1..W7 used by the IDCT (presumably scaled cosine values -- TODO confirm against the C simple_idct). */
  30 /* They are stored in the __constant_ptr__ table at the end of this file and loaded from there at run time. */
  31 #define W1  22725
  32 #define W2  21407
  33 #define W3  19266
  34 #define W4  16383
  35 #define W5  12873
  36 #define W6  8867
  37 #define W7  4520
  38 #define MASK_MSHW 0xFFFF0000 /* keeps the most significant half-word of a 32-bit word */
  39
  40 /* byte offsets of the constants inside the __constant_ptr__ table (one 32-bit word each, same order as the table) */
  41 #define offW1  0
  42 #define offW2  4
  43 #define offW3  8
  44 #define offW4  12
  45 #define offW5  16
  46 #define offW6  20
  47 #define offW7  24
  48 #define offMASK_MSHW 28
  49
  50 #define ROW_SHIFT 11 /* right shift applied to the results of the row pass */
  51 #define ROW_SHIFT2MSHW (16-11) /* 16-ROW_SHIFT: left shift that lands (x>>ROW_SHIFT) in the upper half-word */
  52 #define COL_SHIFT 20 /* right shift applied to the results of the column pass */
  53 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), added to a0 in the row pass before shifting */
  54 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), added to a0 in the column pass before shifting */
  55
  56
  57         .text
  58
  59 function ff_simple_idct_arm, export=1
  60         @@ void ff_simple_idct_arm(int16_t *block)
  61         @@ In-place two-pass IDCT on an 8x8 block of 16-bit coefficients: a row pass over the 8 rows (16 bytes each), then a column pass over the 8 columns.
  62         @@ In: R0=block. R0-R3 are scratch registers, so they are not saved; R0 is overwritten inside the loops,
  63         @@ which is why the block pointer is spilled to sp[0] below and reloaded at the end of every iteration.
  64         @@ R12 is another scratch register, so it is not saved either; it holds the constant table pointer for the whole routine.
  65         @@ save R4-R11 and LR, all of which are used as work registers
  66         stmfd sp!, {r4-r11, r14} @ R14 is also called LR; once saved it is reused as the row/column pointer
  67         @@ at this point, R0=block, other registers are free.
  68         add r14, r0, #112        @ R14=&block[8*7] (last row): rows are processed from the last one down to row 0, i.e. until R14=block.
  69         adr r12, __constant_ptr__ @ R12=__constant_ptr__, the table containing the constants (the original author notes that reserving a register for it is probably not necessary)
  70         @@ reserve 2 words on the stack; only the first one is used (for R0)
  71         sub sp, sp, #8          @ room for 2 local variables
  72         str r0, [sp, #0]        @ save block in sp[0]
  73         @@ stack status
  74         @@ sp+4   free
  75         @@ sp+0   R0  (block)
  76
  77
  78         @@ at this point, R0=block, R14=&block[56], R12=__constant_ptr__, R1-R11 free
  79
  80
  81 __row_loop:
  82         @@ Row pass, one iteration per row. The row is read as four 32-bit words and classified: all zero (skipped), only ROWr16[0] non-zero (shortcut), or the general case. According to the original author the StrongARM specs make it unnecessary to split the 32-bit loads into 16-bit ones, and word loads leave more registers usable :)
  83         ldr r1, [r14, #0]        @ R1=ROWr32[0] (the row viewed as 32-bit words): ROWr16[0] in the low half, ROWr16[1] in the high half
  84         ldr r2, [r14, #4]        @ R2=ROWr32[1] (ROWr16[2] and ROWr16[3])
  85         ldr r3, [r14, #8]        @ R3=ROWr32[2] (ROWr16[4] and ROWr16[5])
  86         ldr r4, [r14, #12]       @ R4=ROWr32[3] (ROWr16[6] and ROWr16[7])
  87         @@ If all four words are zero, go straight to the next row (branch __end_row_loop).
  88         @@ If ROWr16[0] is the only non-zero coefficient, take the shortcut (branch __almost_empty_row).
  89         @@ Otherwise run the complete algorithm.
  90         @@ at this point, R0=block, R14=&block[8*n], R12=__constant_ptr__, R1=ROWr32[0], R2=ROWr32[1],
  91         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  92         orr r5, r4, r3           @ R5=R4 | R3
  93         orr r5, r5, r2           @ R5=R4 | R3 | R2
  94         orrs r6, r5, r1          @ R6=R5 | R1, only the flags matter: Z is set when the whole row is zero
  95         beq __end_row_loop
  96         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1], sign-extended (computed now because the general case needs it)
  97         ldrsh r6, [r14, #0]      @ R6=ROWr16[0], sign-extended
  98         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7, zero only when ROWr16[1]..ROWr16[7] are all zero
  99         beq __almost_empty_row
 100
 101 __b_evaluation:
 102         @@ at this point, R0=block (about to be overwritten),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 103         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 104         @@     R12=__constant_ptr__, R14=&block[8*n]
 105         @@ b0-b3 (built from the odd coefficients) are computed first, followed by a0-a3 (even coefficients); per the original author this saves registers
 106
 107         @@ MUL16(b0, W1, row[1]);
 108         @@ MUL16(b1, W3, row[1]);
 109         @@ MUL16(b2, W5, row[1]);
 110         @@ MUL16(b3, W7, row[1]);
 111         @@ MAC16(b0, W3, row[3]);
 112         @@ MAC16(b1, -W7, row[3]);
 113         @@ MAC16(b2, -W1, row[3]);
 114         @@ MAC16(b3, -W5, row[3]);
 115         ldr r8, [r12, #offW1]    @ R8=W1
 116         mov r2, r2, asr #16      @ R2=ROWr16[3] (high half of ROWr32[1], sign-extended)
 117         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (the row value is the last operand; per the original author this may save 1 cycle)
 118         ldr r9, [r12, #offW3]    @ R9=W3
 119         ldr r10, [r12, #offW5]   @ R10=W5
 120         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1
 121         ldr r11, [r12, #offW7]   @ R11=W7
 122         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2
 123         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (R7 no longer holds ROWr16[1])
 124                 teq r2, #0               @ Z is set when ROWr16[3] is zero; the NE-conditional instructions below are then skipped
 125                 mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0
 126         rsbne r2, r2, #0         @ R2=-ROWr16[3], so that the following multiply-accumulates subtract
 127         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1
 128         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2
 129         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3
 130
 131         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 132         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 133         @@     R12=__constant_ptr__, R14=&block[8*n]
 134         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 135         @@ if (temp != 0) {}
 136         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]; R2 is kept and tested again in __a_evaluation
 137         beq __end_b_evaluation   @ ROWr16[4]..ROWr16[7] are all zero: b0-b3 are complete
 138
 139         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3], R3=ROWr32[2], R4=ROWr32[3],
 140         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 141         @@     R12=__constant_ptr__, R14=&block[8*n]
 142         @@ MAC16(b0, W5, row[5]);
 143         @@ MAC16(b2, W7, row[5]);
 144         @@ MAC16(b3, W3, row[5]);
 145         @@ MAC16(b1, -W1, row[5]);
 146         @@ MAC16(b0, W7, row[7]);
 147         @@ MAC16(b2, W3, row[7]);
 148         @@ MAC16(b3, -W1, row[7]);
 149         @@ MAC16(b1, -W5, row[7]);
 150         mov r3, r3, asr #16      @ R3=ROWr16[5]
 151                 teq r3, #0               @ if ROWr16[5] is zero, the conditional multiply-accumulates are skipped
 152         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 153         mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without the S suffix leaves the flags of the teq above untouched)
 154         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 155         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 156         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 157         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 158         @@ R3 is free now
 159                 teq r4, #0               @ if ROWr16[7] is zero, the conditional multiply-accumulates are skipped
 160         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 161         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 162         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 163         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 164         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 165         @@ R4 is free now
 166 __end_b_evaluation:
 167         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 168         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 169         @@     R12=__constant_ptr__, R14=&block[8*n]
 170
 171 __a_evaluation:
 172         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 173         @@ a1 = a0 + W6 * row[2];
 174         @@ a2 = a0 - W6 * row[2];
 175         @@ a3 = a0 - W2 * row[2];
 176         @@ a0 = a0 + W2 * row[2];
 177         ldr r9, [r12, #offW4]    @ R9=W4
 178         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 179         ldr r10, [r12, #offW6]   @ R10=W6
 180         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (R4 is overwritten with a3 later)
 181         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + (1<<(ROW_SHIFT-1)), the common base of a0..a3
 182
 183         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 184         ldr r8, [r12, #offW2]    @ R8=W2
 185         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 186         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 187         @@ if (temp != 0) {}
 188         teq r2, #0               @ R2 is still ROWr32[2] | ROWr32[3] from __b_evaluation
 189         beq __end_bef_a_evaluation @ ROWr16[4]..ROWr16[7] all zero: a1, a3, a0 are finished out of line, without the row[4] and row[6] terms
 190
 191         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 192         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 193         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 194         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 195
 196
 197         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3 (a0-a3 still lack the row[4] and row[6] terms),
 198         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 199         @@     R12=__constant_ptr__, R14=&block[8*n]
 200
 201
 202         @@ a0 += W4*row[4]
 203         @@ a1 -= W4*row[4]
 204         @@ a2 -= W4*row[4]
 205         @@ a3 += W4*row[4]
 206         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 207                 teq r11, #0              @ if ROWr16[4] is zero, the NE-conditional instructions up to the next teq are skipped
 208         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 209         @@ R9 is free now
 210         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load does not change the flags set by the teq above)
 211         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 212         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 213         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 214         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 215         @@ once W6*ROWr16[6] has been formed, W6 alone is no longer needed: R10 is reused for W2*ROWr16[6]
 216                 teq r9, #0               @ if ROWr16[6] is zero, the NE-conditional instructions below are skipped
 217         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 218         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 219         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 220         @@ a0 += W6*row[6];
 221         @@ a3 -= W6*row[6];
 222         @@ a1 -= W2*row[6];
 223         @@ a2 += W2*row[6];
 224         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 225         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 226         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 227
 228 __end_a_evaluation:
 229         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 230         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 231         @@     R12=__constant_ptr__, R14=&block[8*n]
 232         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 233         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 234         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 235         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 236         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 237         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 238         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 239         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 240         add r8, r6, r0           @ R8=a0+b0
 241         add r9, r2, r1           @ R9=a1+b1
 242         @@ pack two 16-bit results into one 32-bit word, so that each pair needs a single store
 243         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (this layout is only valid on little endian!)
 244         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 245         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. (a1+b1)>>ROW_SHIFT placed in the upper half-word
 246         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 247         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 248         orr r8, r8, r9           @ R8=new ROWr16[0] | (new ROWr16[1]<<16)
 249         str r8, [r14, #0]        @ ROWr32[0]=R8
 250
 251         add r8, r3, r5           @ R8=a2+b2
 252         add r9, r4, r7           @ R9=a3+b3
 253         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 254         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 255         orr r8, r8, r9           @ R8=new ROWr16[2] | (new ROWr16[3]<<16)
 256         str r8, [r14, #4]        @ ROWr32[1]=R8
 257
 258         sub r8, r4, r7           @ R8=a3-b3
 259         sub r9, r3, r5           @ R9=a2-b2
 260         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 261         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 262         orr r8, r8, r9           @ R8=new ROWr16[4] | (new ROWr16[5]<<16)
 263         str r8, [r14, #8]        @ ROWr32[2]=R8
 264
 265         sub r8, r2, r1           @ R8=a1-b1
 266         sub r9, r6, r0           @ R9=a0-b0
 267         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 268         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 269         orr r8, r8, r9           @ R8=new ROWr16[6] | (new ROWr16[7]<<16)
 270         str r8, [r14, #12]       @ ROWr32[3]=R8
 271
 272         bal __end_row_loop
 273
 274 __almost_empty_row:
 275         @@ special case: every coefficient of the row is zero except ROWr16[0]; all eight outputs are set to ROWr16[0]<<3 (truncated to 16 bits)
 276         @@ at this point, R0=block, R14=&block[8*n], R12=__constant_ptr__, R1=ROWr32[0], R2=ROWr32[1],
 277         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 278         @@                R8-R11 free (R8 is loaded with 0xFFFF just below)
 279         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; per the original author this avoids an ldr and its load delay
 280         sub r8, r8, #1           @ R8=0xFFFF, now ready.
 281         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 282         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16): the same 16-bit value in both half-words
 283         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 284         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 285         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 286         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 287
 288 __end_row_loop:
 289         @@ at this point, R0-R11 (free)
 290         @@     R12=__constant_ptr__, R14=&block[8*n]
 291         ldr r0, [sp, #0]         @ R0=block
 292         teq r0, r14              @ compare the current row address with block: when they are equal, row 0 has just been processed and the loop is finished.
 293         sub r14, r14, #16        @ step back one row (16 bytes); sub without the S suffix keeps the flags of the teq for the branch below
 294         bne __row_loop
 295
 296
 297
 298         @@ Column pass. At this point, R0=block, R1-R11 (free),
 299         @@     R12=__constant_ptr__, R14=block-16 (left over from the row loop, reloaded below)
 300         add r14, r0, #14        @ R14=&block[7]: start from the last column and step back one column per iteration until R14=block.
 301 __col_loop:
 302
 303 __b_evaluation2:
 304         @@ at this point, R0=block (about to be overwritten),  R1-R11 (free)
 305         @@     R12=__constant_ptr__, R14=&block[n] (top element of column n)
 306         @@ proceed with b0-b3 first, followed by a0-a3
 307         @@ MUL16(b0, W1, col[8x1]);
 308         @@ MUL16(b1, W3, col[8x1]);
 309         @@ MUL16(b2, W5, col[8x1]);
 310         @@ MUL16(b3, W7, col[8x1]);
 311         @@ MAC16(b0, W3, col[8x3]);
 312         @@ MAC16(b1, -W7, col[8x3]);
 313         @@ MAC16(b2, -W1, col[8x3]);
 314         @@ MAC16(b3, -W5, col[8x3]);
 315         ldr r8, [r12, #offW1]    @ R8=W1
 316         ldrsh r7, [r14, #16]     @ R7=COLr16[8x1] (consecutive elements of a column are 16 bytes apart)
 317         mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (the coefficient is the last operand; per the original author this may save 1 cycle)
 318         ldr r9, [r12, #offW3]    @ R9=W3
 319         ldr r10, [r12, #offW5]   @ R10=W5
 320         mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1
 321         ldr r11, [r12, #offW7]   @ R11=W7
 322         mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2
 323         ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
 324         mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (R7 no longer holds COLr16[8x1])
 325         teq r2, #0               @ if COLr16[8x3] is 0, the NE-conditional instructions below are skipped
 326         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0
 327         rsbne r2, r2, #0         @ R2=-COLr16[8x3], so that the following multiply-accumulates subtract
 328         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1
 329         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2
 330         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3
 331
 332         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 333         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 334         @@     R12=__constant_ptr__, R14=&block[n]
 335         @@ MAC16(b0, W5, col[5x8]);
 336         @@ MAC16(b2, W7, col[5x8]);
 337         @@ MAC16(b3, W3, col[5x8]);
 338         @@ MAC16(b1, -W1, col[5x8]);
 339         @@ MAC16(b0, W7, col[7x8]);
 340         @@ MAC16(b2, W3, col[7x8]);
 341         @@ MAC16(b3, -W1, col[7x8]);
 342         @@ MAC16(b1, -W5, col[7x8]);
 343         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 344         teq r3, #0               @ if 0, the NE-conditional instructions up to the next teq are skipped
 345         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 346         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 347         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 348         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 349         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load does not change the flags set by the teq above)
 350         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 351         @@ R3 is free now
 352         teq r4, #0               @ if 0, the NE-conditional instructions below are skipped
 353         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 354         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 355         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 356         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 357         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 358         @@ R4 is free now
 359 __end_b_evaluation2:
 360         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 361         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 362         @@     R12=__constant_ptr__, R14=&block[n]
 363
 364 __a_evaluation2:
 365         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 366         @@ a1 = a0 + W6 * col[8x2];
 367         @@ a2 = a0 - W6 * col[8x2];
 368         @@ a3 = a0 - W2 * col[8x2];
 369         @@ a0 = a0 + W2 * col[8x2];
 370         ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
 371         ldr r9, [r12, #offW4]    @ R9=W4
 372         mul r6, r9, r6           @ R6=W4*COLr16[8x0]
 373         ldr r10, [r12, #offW6]   @ R10=W6
 374         ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (R4 is overwritten with a3 later)
 375         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + (1<<(COL_SHIFT-1)), the common base of a0..a3
 376         mul r11, r10, r4         @ R11=W6*COLr16[8x2]
 377         ldr r8, [r12, #offW2]    @ R8=W2
 378         add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
 379         sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
 380         mul r11, r8, r4          @ R11=W2*COLr16[8x2]
 381         sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
 382         add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
 383
 384         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3 (a0-a3 still lack the col[8x4] and col[8x6] terms),
 385         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 386         @@     R12=__constant_ptr__, R14=&block[n]
 387         @@ a0 += W4*col[8x4]
 388         @@ a1 -= W4*col[8x4]
 389         @@ a2 -= W4*col[8x4]
 390         @@ a3 += W4*col[8x4]
 391         ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
 392         teq r11, #0              @ if COLr16[8x4] is zero, the NE-conditional instructions up to the next teq are skipped
 393         mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
 394         @@ R9 is free now
 395         addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
 396         subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
 397         subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
 398         ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (a load does not change the flags set by the teq above)
 399         addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
 400         @@ once W6*COLr16[8x6] has been formed, W6 alone is no longer needed: R10 is reused for W2*COLr16[8x6]
 401         teq r9, #0               @ if COLr16[8x6] is zero, the NE-conditional instructions below are skipped
 402         mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
 403         addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
 404         mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
 405         @@ a0 += W6*col[8x6];
 406         @@ a3 -= W6*col[8x6];
 407         @@ a1 -= W2*col[8x6];
 408         @@ a2 += W2*col[8x6];
 409         subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
 410         subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
 411         addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
 412 __end_a_evaluation2:
 413         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 414         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 415         @@     R12=__constant_ptr__, R14=&block[n]
 416         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 417         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 418         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 419         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 420         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 421         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 422         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 423         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 424         @@ results are shifted and stored one half-word at a time (the original author marks this part as not optimized)
 425         add r8, r6, r0           @ R8=a0+b0
 426         add r9, r2, r1           @ R9=a1+b1
 427         mov r8, r8, asr #COL_SHIFT @ R8=(a0+b0)>>COL_SHIFT
 428         mov r9, r9, asr #COL_SHIFT @ R9=(a1+b1)>>COL_SHIFT
 429         strh r8, [r14, #0]       @ col[0 ]=R8 (strh stores the low 16 bits)
 430         strh r9, [r14, #16]      @ col[8 ]=R9
 431         add r8, r3, r5           @ R8=a2+b2
 432         add r9, r4, r7           @ R9=a3+b3
 433         mov r8, r8, asr #COL_SHIFT @ R8=(a2+b2)>>COL_SHIFT
 434         mov r9, r9, asr #COL_SHIFT @ R9=(a3+b3)>>COL_SHIFT
 435         strh r8, [r14, #32]      @ col[16]=R8
 436         strh r9, [r14, #48]      @ col[24]=R9
 437         sub r8, r4, r7           @ R8=a3-b3
 438         sub r9, r3, r5           @ R9=a2-b2
 439         mov r8, r8, asr #COL_SHIFT @ R8=(a3-b3)>>COL_SHIFT
 440         mov r9, r9, asr #COL_SHIFT @ R9=(a2-b2)>>COL_SHIFT
 441         strh r8, [r14, #64]      @ col[32]=R8
 442         strh r9, [r14, #80]      @ col[40]=R9
 443         sub r8, r2, r1           @ R8=a1-b1
 444         sub r9, r6, r0           @ R9=a0-b0
 445         mov r8, r8, asr #COL_SHIFT @ R8=(a1-b1)>>COL_SHIFT
 446         mov r9, r9, asr #COL_SHIFT @ R9=(a0-b0)>>COL_SHIFT
 447         strh r8, [r14, #96]      @ col[48]=R8
 448         strh r9, [r14, #112]     @ col[56]=R9
 449
 450 __end_col_loop:
 451         @@ at this point, R0-R11 (free)
 452         @@     R12=__constant_ptr__, R14=&block[n]
 453         ldr r0, [sp, #0]         @ R0=block
 454         teq r0, r14              @ compare the current column address with block: when they are equal, column 0 has just been processed and the loop is finished.
 455         sub r14, r14, #2         @ step back one column (2 bytes); sub without the S suffix keeps the flags of the teq for the branch below
 456         bne __col_loop
 457
 458
 459
 460
 461 __end_simple_idct_arm:
 462         @@ epilogue: undo the prologue and return to the caller
 463         add sp, sp, #8 @@ release the 2 local variable slots
 464         ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and load the saved LR value directly into PC (R15), which performs the return
 465
 466
 467
 468 @@ Kind of sub-function, kept out of line so that the common case is not burdened. It is reached from __a_evaluation when ROWr32[2] | ROWr32[3] is zero, and finishes a1, a3 and a0 without the row[4] and row[6] terms.
 469 __end_bef_a_evaluation:
 470         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1); on entry R6=a0 base, R11=W6*ROWr16[2], R3 already holds a2
 471         mul r11, r8, r4          @ R11=W2*ROWr16[2] (R8=W2, R4=ROWr16[2])
 472         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 473         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 474         bal __end_a_evaluation   @ rejoin the row pass at the output stage
 475
 476
 477 __constant_ptr__:  @@ constant table addressed through R12; see #defines at the beginning of the source code for values and offsets.
 478         .align                  @ NOTE(review): the label is defined before this directive, so any padding inserted here would sit between the label and the first word - confirm this location is always word-aligned
 479         .word   W1              @ at offW1
 480         .word   W2              @ at offW2
 481         .word   W3              @ at offW3
 482         .word   W4              @ at offW4
 483         .word   W5              @ at offW5
 484         .word   W6              @ at offW6
 485         .word   W7              @ at offW7
 486         .word   MASK_MSHW       @ at offMASK_MSHW