apps/plugins/mpegplayer/idct_coldfire.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2007 Jens Arnold
  11  * Based on the work of Karim Boucher and Rani Hod
  12  *
  13  * This program is free software; you can redistribute it and/or
  14  * modify it under the terms of the GNU General Public License
  15  * as published by the Free Software Foundation; either version 2
  16  * of the License, or (at your option) any later version.
  17  *
  18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  19  * KIND, either express or implied.
  20  *
  21  ****************************************************************************/
  22
  23     .global     mpeg2_idct_copy            /* exported: IDCT, then clamped store to dest */
  24     .type       mpeg2_idct_copy, @function
  25     .global     mpeg2_idct_add             /* exported: IDCT or DC shortcut, then clamped add to dest */
  26     .type       mpeg2_idct_add, @function
  27
  28     /* The IDCT itself.
  29      * Input: %a0: block pointer
  30      * Caller must save all registers.
          *
          * Clobbers %d0-%d7, %a0-%a6, %macsr and %acc0-%acc3.
          *
          * Works in two passes of 8 iterations each:
          *  - .row_loop reads 16 bytes (8 halfwords) per iteration from the
          *    block and stores the 8 results, shifted right by 12, into a temp
          *    buffer that starts 128 bytes after the block pointer. Results are
          *    stored 16 bytes apart and the store pointer advances by 2 bytes
          *    per iteration, so the temp buffer holds the transposed output.
          *  - .col_loop runs the same arithmetic over the temp buffer with a
          *    shift of 17 and stores, transposed again, back into the block.
          * The memory at the block pointer must therefore be writable for
          * 256 bytes, not just the 128 bytes of coefficients.
          *
          * Each group of eight input halfwords is consumed in the order
          * (f0, f2, f4, f6, f1, f3, f5, f7), as the movem.l comments state.
          *
          * NOTE(review): the first mac.w of .row_loop accumulates onto whatever
          * %acc0-%acc3 hold on entry; nothing here clears them beforehand.
          * Presumably callers guarantee zeroed accumulators - confirm. On
          * return all four accumulators have been emptied by movclr.l.
          */
  31     .align  2
  32 .idct:
  33     move.l  %a0, %a6
  34
  35     move.l  #0, %macsr              | signed integer mode
  36
         /* Weights are packed two per address register: the even-indexed
          * weight sits in the upper halfword (%aNu), the odd-indexed one in
          * the lower halfword (%aNl). %a0 is free for this because the block
          * pointer was moved to %a6 above. */
  37     move.l  #((2048<<16)+2841), %a0 | W0,  W1
  38     move.l  #((2676<<16)+2408), %a1 | W2,  W3
  39     move.l  #((2048<<16)+1609), %a2 | W4,  W5
  40     move.l  #((1108<<16)+ 565), %a3 | W6,  W7
  41
  42     lea.l   (128,%a6), %a4      | secondary, transposed temp buffer
  43     moveq.l #8, %d3             | loop counter
  44
  45 .row_loop:
  46     movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
  47
         /* Odd part: b0..b3 are built from f1, f3, f5, f7 (%d2 and %a5). */
  48     mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
  49     mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
  50     mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
  51     mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
  52
  53     mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
  54     msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
  55     msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
  56     msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
  57
  58     mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
  59     msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
  60     mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
  61     mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
  62
  63     mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
  64     msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
  65     mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
  66     msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
  67
  68     lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
  69     add.l   #(1<<16), %d0       | f0 += 1;
         /* f0 lives in the upper halfword of %d0. Since W0 is 2048, the extra 1
          * adds 2048 to every a-term below, which is half of 1 << 12: this is
          * the rounding offset for the >> 12 that follows. */
  70
  71     movclr.l %acc0, %d4         | b0
  72     movclr.l %acc1, %d5         | b1
  73     movclr.l %acc2, %d6         | b2
  74     movclr.l %acc3, %d7         | b3
  75
         /* Even part: a0..a3 are built from f0, f2, f4, f6 (%d0 and %d1). The
          * partial sums in %acc0 and %acc1 are copied to %acc3 and %acc2 so the
          * W0 and W4 products are computed only once per sign combination. */
  76     mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
  77     mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
  78     move.l  %acc0, %acc3
  79     mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
  80     mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
  81
  82     mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
  83     msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
  84     move.l  %acc1, %acc2
  85     mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
  86     msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
  87
  88     | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
  89     msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
  90     mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
  91
  92     | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
  93     msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
  94     msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
  95
         /* The inputs in %d0 and %d1 are dead from here on: %d1 becomes the
          * shift count and %d0 the scratch register for each output. */
  96     moveq.l #12, %d1            | shift amount
  97
         /* Difference outputs first, read with a plain move.l so that the
          * accumulators keep their a-terms for the sum outputs below. */
  98     move.l  %acc0, %d0          | block[7] = (a0
  99     sub.l   %d4,%d0             |     - b0)
 100     asr.l   %d1, %d0            |     >> 12
 101     move.w  %d0, (7*16,%a4)
 102
 103     move.l  %acc1, %d0          | block[6] = (a1
 104     sub.l   %d5,%d0             |     - b1)
 105     asr.l   %d1, %d0            |     >> 12
 106     move.w  %d0, (6*16,%a4)
 107
 108     move.l  %acc2, %d0          | block[5] = (a2
 109     sub.l   %d6,%d0             |     - b2)
 110     asr.l   %d1, %d0            |     >> 12
 111     move.w  %d0, (5*16,%a4)
 112
 113     move.l  %acc3, %d0          | block[4] = (a3
 114     sub.l   %d7,%d0             |     - b3)
 115     asr.l   %d1, %d0            |     >> 12
 116     move.w  %d0, (4*16,%a4)
 117
         /* Sum outputs: movclr.l reads and empties each accumulator, leaving
          * all four at zero for the next iteration. */
 118     movclr.l %acc3, %d0         | block[3] = (a3
 119     add.l   %d7, %d0            |     + b3)
 120     asr.l   %d1, %d0            |     >> 12
 121     move.w  %d0, (3*16,%a4)
 122
 123     movclr.l %acc2, %d0         | block[2] = (a2
 124     add.l   %d6, %d0            |     + b2)
 125     asr.l   %d1, %d0            |     >> 12
 126     move.w  %d0, (2*16,%a4)
 127
 128     movclr.l %acc1, %d0         | block[1] = (a1
 129     add.l   %d5, %d0            |     + b1)
 130     asr.l   %d1, %d0            |     >> 12
 131     move.w  %d0, (1*16,%a4)
 132
 133     movclr.l %acc0, %d0         | block[0] = (a0
 134     add.l   %d4, %d0            |     + b0)
 135     asr.l   %d1, %d0            |     >> 12
 136     move.w  %d0, (%a4)+         | advance to next temp column
 137
 138     subq.l  #1, %d3             | loop 8 times
 139     bne.w   .row_loop
 140
         /* After 8 iterations %a6 has advanced by 8 * 16 = 128 bytes and %a4 by
          * 8 * 2 = 16 bytes past the start of the temp buffer. */
 141     | %a6 now points to the temp buffer, where we need it.
 142     lea.l   (-16-128,%a4), %a4  | point %a4 back to the input block
 143     moveq.l #8, %d3             | loop counter
 144
         /* Second pass: same structure as .row_loop, reading from the temp
          * buffer and writing into the block. Only the rounding offset and the
          * shift count differ. */
 145 .col_loop:
 146     movem.l (%a6), %d0-%d2/%a5  | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
 147
 148     mac.w   %a0l, %d2u, %acc0   | %acc0 = W1 * f1
 149     mac.w   %a1l, %d2l, %acc0   |       + W3 * f3
 150     mac.w   %a2l, %a5u, %acc0   |       + W5 * f5
 151     mac.w   %a3l, %a5l, %acc0   |       + W7 * f7
 152
 153     mac.w   %a1l, %d2u, %acc1   | %acc1 = W3 * f1
 154     msac.w  %a3l, %d2l, %acc1   |       - W7 * f3
 155     msac.w  %a0l, %a5u, %acc1   |       - W1 * f5
 156     msac.w  %a2l, %a5l, %acc1   |       - W5 * f7
 157
 158     mac.w   %a2l, %d2u, %acc2   | %acc2 = W5 * f1
 159     msac.w  %a0l, %d2l, %acc2   |       - W1 * f3
 160     mac.w   %a3l, %a5u, %acc2   |       + W7 * f5
 161     mac.w   %a1l, %a5l, %acc2   |       + W3 * f7
 162
 163     mac.w   %a3l, %d2u, %acc3   | %acc3 = W7 * f1
 164     msac.w  %a2l, %d2l, %acc3   |       - W5 * f3
 165     mac.w   %a1l, %a5u, %acc3   |       + W3 * f5
 166     msac.w  %a0l, %a5l, %acc3   |       - W1 * f7
 167
 168     lea.l   (16,%a6), %a6       | Advance to next row; put here to fill EMAC latency
 169     add.l   #(32<<16), %d0      | DC offset: 0.5
         /* 32 * W0 = 32 * 2048 = 65536, which is half of 1 << 17: the rounding
          * offset for the >> 17 applied to every output of this pass. */
 170
 171     movclr.l %acc0, %d4         | b0
 172     movclr.l %acc1, %d5         | b1
 173     movclr.l %acc2, %d6         | b2
 174     movclr.l %acc3, %d7         | b3
 175
 176     mac.w   %a0u, %d0u, %acc0   | %acc0 = W0 * f0
 177     mac.w   %a2u, %d1u, %acc0   |       + W4 * f4
 178     move.l  %acc0, %acc3
 179     mac.w   %a1u, %d0l, %acc0   |       + W2 * f2
 180     mac.w   %a3u, %d1l, %acc0   |       + W6 * f6
 181
 182     mac.w   %a0u, %d0u, %acc1   | %acc1 = W0 * f0
 183     msac.w  %a2u, %d1u, %acc1   |       - W4 * f4
 184     move.l  %acc1, %acc2
 185     mac.w   %a3u, %d0l, %acc1   |       + W6 * f2
 186     msac.w  %a1u, %d1l, %acc1   |       - W2 * f6
 187
 188     | ^ move.l  %acc1, %acc2      %acc2 = W0 * f0 - W4 * f4
 189     msac.w  %a3u, %d0l, %acc2   |       - W6 * f2
 190     mac.w   %a1u, %d1l, %acc2   |       + W2 * f6
 191
 192     | ^ move.l  %acc0, %acc3      %acc3 = W0 * f0 + W4 * f4
 193     msac.w  %a1u, %d0l, %acc3   |       - W2 * f2
 194     msac.w  %a3u, %d1l, %acc3   |       - W6 * f6
 195
 196     moveq.l #17, %d1            | shift amount
 197
 198     move.l  %acc0, %d0          | block[7] = (a0
 199     sub.l   %d4,%d0             |     - b0)
 200     asr.l   %d1, %d0            |     >> 17
 201     move.w  %d0, (7*16,%a4)
 202
 203     move.l  %acc1, %d0          | block[6] = (a1
 204     sub.l   %d5,%d0             |     - b1)
 205     asr.l   %d1, %d0            |     >> 17
 206     move.w  %d0, (6*16,%a4)
 207
 208     move.l  %acc2, %d0          | block[5] = (a2
 209     sub.l   %d6,%d0             |     - b2)
 210     asr.l   %d1, %d0            |     >> 17
 211     move.w  %d0, (5*16,%a4)
 212
 213     move.l  %acc3, %d0          | block[4] = (a3
 214     sub.l   %d7,%d0             |     - b3)
 215     asr.l   %d1, %d0            |     >> 17
 216     move.w  %d0, (4*16,%a4)
 217
 218     movclr.l %acc3, %d0         | block[3] = (a3
 219     add.l   %d7, %d0            |     + b3)
 220     asr.l   %d1, %d0            |     >> 17
 221     move.w  %d0, (3*16,%a4)
 222
 223     movclr.l %acc2, %d0         | block[2] = (a2
 224     add.l   %d6, %d0            |     + b2)
 225     asr.l   %d1, %d0            |     >> 17
 226     move.w  %d0, (2*16,%a4)
 227
 228     movclr.l %acc1, %d0         | block[1] = (a1
 229     add.l   %d5, %d0            |     + b1)
 230     asr.l   %d1, %d0            |     >> 17
 231     move.w  %d0, (1*16,%a4)
 232
 233     movclr.l %acc0, %d0         | block[0] = (a0
 234     add.l   %d4, %d0            |     + b0)
 235     asr.l   %d1, %d0            |     >> 17
 236     move.w  %d0, (%a4)+         | advance to next column
 237
 238     subq.l  #1, %d3             | loop 8 times
 239     bne.w   .col_loop
 240
 241     rts
 242
 243     .align  2
 244
         /* mpeg2_idct_copy
          *
          * Stack arguments, in order: block pointer, destination pointer,
          * stride (loaded together by the movem.l after the .idct call).
          *
          * Saves %d2-%d7/%a2-%a6 (11 registers), runs .idct on the block, then
          * for each of 8 rows: clamps the 8 halfword results to 0..255, packs
          * them into two longwords (%d2 = bytes 0..3, %d3 = bytes 4..7) and
          * writes those 8 bytes to the destination, which is then advanced by
          * stride. Every block halfword is cleared right after it has been
          * read, so the 128-byte block is all zero on return.
          *
          * Clamp idiom: cmp.l against 255 followed by bls (unsigned lower or
          * same) lets values 0..255 through untouched. Anything else reaches
          * spl.b, which sets the low byte to 0xff when value - 255 is not
          * negative (value too large) and to 0x00 when it is negative (value
          * below zero). Only the low byte is used afterwards.
          */
 245 mpeg2_idct_copy:
 246     lea.l   (-11*4,%sp), %sp
 247     movem.l %d2-%d7/%a2-%a6, (%sp)  | save some registers
 248     move.l  (11*4+4,%sp), %a0       | %a0 - block pointer for idct
 249
         /* .idct clobbers %a0-%a2, hence the reload of all three arguments. */
 250     bsr.w   .idct                   | apply idct to block
 251     movem.l (11*4+4,%sp), %a0-%a2   | %a0 - block pointer
 252                                     | %a1 - destination pointer
 253                                     | %a2 - stride
 254
 255     move.l  #255, %d1           | preload constant for clipping
 256     moveq.l #8, %d4             | loop counter
 257
 258 .copy_clip_loop:
 259     move.w  (%a0), %d0          | load block[0]
 260     ext.l   %d0                 | sign extend
 261     cmp.l   %d1, %d0            | overflow?
 262     bls.b   1f
 263     spl.b   %d0                 |   yes: set appropriate limit value in low byte
 264 1:
 265     move.b  %d0, %d2            | collect output bytes 0..3 in %d2
 266     lsl.l   #8, %d2
 267
 268     move.w  (2,%a0), %d0        | load block[1]
 269     ext.l   %d0                 | sign extend
 270     cmp.l   %d1, %d0            | overflow?
 271     bls.b   1f
 272     spl.b   %d0                 | yes: set appropriate limit value in low byte
 273 1:
 274     move.b  %d0, %d2            | collect output bytes 0..3 in %d2
 275     lsl.l   #8, %d2
 276     clr.l   (%a0)+              | clear block[0] and block[1],
 277                                 | %a0 now pointing to block[2]
 278     move.w  (%a0), %d0          | do b2 and b3
 279     ext.l   %d0
 280     cmp.l   %d1, %d0
 281     bls.b   1f
 282     spl.b   %d0
 283 1:
 284     move.b  %d0, %d2
 285     lsl.l   #8, %d2
 286
 287     move.w  (2,%a0), %d0
 288     ext.l   %d0
 289     cmp.l   %d1, %d0
 290     bls.b   1f
 291     spl.b   %d0
 292 1:
 293     move.b  %d0, %d2
 294     clr.l   (%a0)+
 295
         /* Second half of the row: same steps, collected in %d3. */
 296     move.w  (%a0), %d0          | do b4 and b5
 297     ext.l   %d0
 298     cmp.l   %d1, %d0
 299     bls.b   1f
 300     spl.b   %d0
 301 1:
 302     move.b  %d0, %d3
 303     lsl.l   #8, %d3
 304
 305     move.w  (2,%a0), %d0
 306     ext.l   %d0
 307     cmp.l   %d1, %d0
 308     bls.b   1f
 309     spl.b   %d0
 310 1:
 311     move.b  %d0, %d3
 312     lsl.l   #8, %d3
 313     clr.l   (%a0)+
 314
 315     move.w  (%a0), %d0          | do b6 and b7
 316     ext.l   %d0
 317     cmp.l   %d1, %d0
 318     bls.b   1f
 319     spl.b   %d0
 320 1:
 321     move.b  %d0, %d3
 322     lsl.l   #8, %d3
 323
 324     move.w  (2,%a0), %d0
 325     ext.l   %d0
 326     cmp.l   %d1, %d0
 327     bls.b   1f
 328     spl.b   %d0
 329 1:
 330     move.b  %d0, %d3
 331     clr.l   (%a0)+
 332
 333     movem.l %d2-%d3, (%a1)      | write all 8 output bytes at once
 334     add.l   %a2, %a1            | advance output pointer
 335     subq.l  #1, %d4             | loop 8 times
 336     bne.w   .copy_clip_loop
 337
 338     movem.l (%sp), %d2-%d7/%a2-%a6
 339     lea.l   (11*4,%sp), %sp
 340     rts
 341
 342     .align  2
 343
         /* mpeg2_idct_add
          *
          * Stack arguments, in order: last, block pointer, destination
          * pointer, stride.
          *
          * Path selection:
          *   last != 129                                -> .idct_add
          *   last == 129 and ((block[0] >> 4) & 7) == 4 -> .idct_add
          *   otherwise                                  -> .dc_add
          *
          * .idct_add runs .idct, then for each of 8 rows adds the 8 block
          * halfwords to the 8 destination bytes, clamps each sum to 0..255,
          * writes the 8 bytes back and clears the block halfwords, so the
          * 128-byte block is all zero afterwards.
          *
          * .dc_add computes (block[0] + 64) >> 7 once, clears block[0] and
          * block[63], and adds that single value to all 64 destination bytes
          * with the same clamping. .idct is not called on this path.
          *
          * Clamp idiom: cmp.l against 255 followed by bls (unsigned lower or
          * same) lets 0..255 through untouched. Otherwise spl.b sets the low
          * byte to 0xff when value - 255 is not negative and to 0x00 when it
          * is negative. Only the low byte is used afterwards.
          *
          * Byte peeling: after swap the low halfword of %d6 or %d7 holds the
          * first two pixels of the group, so repeated move.b plus lsr.l by 8
          * yields the pixels in the order 1, 0, 3, 2. %d3 is cleared once and
          * afterwards only written with move.b, so it always holds the pixel
          * zero-extended to 32 bits. The last pixel of each group is added
          * straight from %d6 or %d7, whose upper bytes are zero by then.
          *
          * Both paths leave through .idct_add_end, which restores
          * %d2-%d7/%a2-%a6.
          */
 344 mpeg2_idct_add:
 345     lea.l   (-11*4,%sp), %sp
 346     movem.l %d2-%d7/%a2-%a6, (%sp)
 347     movem.l (11*4+4,%sp), %d0/%a0-%a2   | %d0 - last value
 348                                         | %a0 - block pointer
 349                                         | %a1 - destination pointer
 350                                         | %a2 - stride
 351
 352     cmp.l   #129, %d0           | last == 129 ?
 353     bne.b   .idct_add           |   no: perform idct + addition
 354     move.w  (%a0), %d0
 355     ext.l   %d0                 | ((block[0]
 356     asr.l   #4, %d0             |      >> 4)
 357     and.l   #7, %d0             |      & 7)
 358     subq.l  #4, %d0             |      - 4 == 0 ?
 359     bne.w   .dc_add             |   no: just perform addition
 360
 361 .idct_add:
         /* %a0 still holds the block pointer here. .idct clobbers %a0-%a2, so
          * the three pointer arguments are reloaded, skipping last. */
 362     bsr.w   .idct                   | apply idct
 363     movem.l (11*4+8,%sp), %a0-%a2   | reload arguments %a0..%a2
 364
 365     move.l  #255, %d2           | preload constant for clipping
 366     clr.l   %d3                 | used for splitting input words into bytes
 367     moveq.l #8, %d4             | loop counter
 368
 369 .add_clip_loop:
 370     movem.l (%a1), %d6-%d7      | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
 371     swap    %d6                 | (b2 b3 b0 b1)
 372     swap    %d7                 | (b6 b7 b4 b5)
 373
 374     move.w  (2,%a0), %d0        | load block[1]
 375     ext.l   %d0                 | sign extend
 376     move.b  %d6, %d3            | copy b1
 377     lsr.l   #8, %d6             | prepare 1st buffer for next byte
 378     add.l   %d3, %d0            | add b1
 379     cmp.l   %d2, %d0            | overflow ?
 380     bls.b   1f
 381     spl.b   %d0                 |   yes: set appropriate limit value in low byte
 382 1:
 383     move.w  (%a0), %d1          | load block[0]
 384     ext.l   %d1                 | sign extend
 385     move.b  %d6, %d3            | copy b0
 386     lsr.l   #8, %d6             | prepare 1st buffer for next byte
 387     add.l   %d3, %d1            | add b0
 388     cmp.l   %d2, %d1            | overflow ?
 389     bls.b   1f
 390     spl.b   %d1                 |   yes: set appropriate limit value in low byte
 391 1:
 392     move.b  %d1, %d5            | collect output bytes 0..3 in %d5
 393     lsl.l   #8, %d5
 394     move.b  %d0, %d5
 395     lsl.l   #8, %d5
 396     clr.l   (%a0)+              | clear block[0] and block[1]
 397                                 |   %a0 now pointing to block[2]
 398     move.w  (2,%a0), %d0        | do b3 and b2
 399     ext.l   %d0
 400     move.b  %d6, %d3
 401     lsr.l   #8, %d6
 402     add.l   %d3, %d0
 403     cmp.l   %d2, %d0
 404     bls.b   1f
 405     spl.b   %d0
 406 1:
 407     move.w  (%a0), %d1
 408     ext.l   %d1
 409     add.l   %d6, %d1            /* only b2 is left in %d6, upper bytes are zero */
 410     cmp.l   %d2, %d1
 411     bls.b   1f
 412     spl.b   %d1
 413 1:
 414     move.b  %d1, %d5
 415     lsl.l   #8, %d5
 416     move.b  %d0, %d5
 417     clr.l   (%a0)+
 418
 419     move.w  (2,%a0), %d0        | do b5 and b4
 420     ext.l   %d0
 421     move.b  %d7, %d3
 422     lsr.l   #8, %d7
 423     add.l   %d3, %d0
 424     cmp.l   %d2, %d0
 425     bls.b   1f
 426     spl.b   %d0
 427 1:
 428     move.w  (%a0), %d1
 429     ext.l   %d1
 430     move.b  %d7, %d3
 431     lsr.l   #8, %d7
 432     add.l   %d3, %d1
 433     cmp.l   %d2, %d1
 434     bls.b   1f
 435     spl.b   %d1
 436 1:
         /* %d6 has been fully consumed as input and is reused to collect
          * output bytes 4..7. */
 437     move.b  %d1, %d6
 438     lsl.l   #8, %d6
 439     move.b  %d0, %d6
 440     lsl.l   #8, %d6
 441     clr.l   (%a0)+
 442
 443     move.w  (2,%a0), %d0        | do b7 and b6
 444     ext.l   %d0
 445     move.b  %d7, %d3
 446     lsr.l   #8, %d7
 447     add.l   %d3, %d0
 448     cmp.l   %d2, %d0
 449     bls.b   1f
 450     spl.b   %d0
 451 1:
 452     move.w  (%a0), %d1
 453     ext.l   %d1
 454     add.l   %d7, %d1            /* only b6 is left in %d7, upper bytes are zero */
 455     cmp.l   %d2, %d1
 456     bls.b   1f
 457     spl.b   %d1
 458 1:
 459     move.b  %d1, %d6
 460     lsl.l   #8, %d6
 461     move.b  %d0, %d6
 462     clr.l   (%a0)+
 463
 464     movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
 465     add.l   %a2, %a1            | advance output pointer
 466     subq.l  #1, %d4             | loop 8 times
 467     bne.w   .add_clip_loop
 468
 469     bra.w   .idct_add_end
 470
 471 .dc_add:
 472     move.w  (%a0), %d0
 473     ext.l   %d0                 | %d0 = (block[0]
 474     add.l   #64, %d0            |       + 64)
 475     asr.l   #7, %d0             |       >> 7
 476     clr.w   (%a0)               | clear block[0]
 477     clr.w   (63*2,%a0)          |   and block[63]
         /* The block pointer is not needed any more on this path, so %a0 is
          * reused to hold the DC value across the loop. */
 478     move.l  %d0, %a0            | DC value in %a0
 479
 480     move.l  #255, %d2           | preload constant for clipping
 481     clr.l   %d3                 | for splitting input words into bytes
 482     moveq.l #8, %d4             | loop counter
 483
         /* Same structure as .add_clip_loop, with the per-pixel block value
          * replaced by the constant in %a0 and no block clearing. */
 484 .dc_clip_loop:
 485     movem.l (%a1), %d6-%d7      | (b0 b1 b2 b3) (b4 b5 b6 b7)
 486     swap    %d6                 | (b2 b3 b0 b1)
 487     swap    %d7                 | (b6 b7 b4 b5)
 488
 489     move.l  %a0, %d0            | copy DC
 490     move.b  %d6, %d3            | copy b1
 491     lsr.l   #8, %d6             | prepare 1st buffer for next byte
 492     add.l   %d3, %d0            | add b1
 493     cmp.l   %d2, %d0            | overflow ?
 494     bls.b   1f
 495     spl.b   %d0                 |   yes: set appropriate limit value in low byte
 496 1:
 497     move.l  %a0, %d1            | copy DC
 498     move.b  %d6, %d3            | copy b0
 499     lsr.l   #8, %d6             | prepare 1st buffer for next byte
 500     add.l   %d3, %d1            | add b0
 501     cmp.l   %d2, %d1            | overflow ?
 502     bls.b   1f
 503     spl.b   %d1                 |   yes: set appropriate limit value in low byte
 504 1:
 505     move.b  %d1, %d5            | collect output bytes 0..3 in %d5
 506     lsl.l   #8, %d5
 507     move.b  %d0, %d5
 508     lsl.l   #8, %d5
 509
 510     move.l  %a0, %d0            | do b3 and b2
 511     move.b  %d6, %d3
 512     lsr.l   #8, %d6
 513     add.l   %d3, %d0
 514     cmp.l   %d2, %d0
 515     bls.b   1f
 516     spl.b   %d0
 517 1:
 518     move.l  %a0, %d1
 519     add.l   %d6, %d1
 520     cmp.l   %d2, %d1
 521     bls.b   1f
 522     spl.b   %d1
 523 1:
 524     move.b  %d1, %d5
 525     lsl.l   #8, %d5
 526     move.b  %d0, %d5
 527
 528     move.l  %a0, %d0            | do b5 and b4
 529     move.b  %d7, %d3
 530     lsr.l   #8, %d7
 531     add.l   %d3, %d0
 532     cmp.l   %d2, %d0
 533     bls.b   1f
 534     spl.b   %d0
 535 1:
 536     move.l  %a0, %d1
 537     move.b  %d7, %d3
 538     lsr.l   #8, %d7
 539     add.l   %d3, %d1
 540     cmp.l   %d2, %d1
 541     bls.b   1f
 542     spl.b   %d1
 543 1:
         /* The next four instructions store the clamped results for b4 and b5
          * into %d6; the work on b7 and b6 starts at the move.l from %a0
          * that follows them. */
 544     move.b  %d1, %d6            | do b7 and b6
 545     lsl.l   #8, %d6
 546     move.b  %d0, %d6
 547     lsl.l   #8, %d6
 548
 549     move.l  %a0, %d0
 550     move.b  %d7, %d3
 551     lsr.l   #8, %d7
 552     add.l   %d3, %d0
 553     cmp.l   %d2, %d0
 554     bls.b   1f
 555     spl.b   %d0
 556 1:
 557     move.l  %a0, %d1
 558     add.l   %d7, %d1
 559     cmp.l   %d2, %d1
 560     bls.b   1f
 561     spl.b   %d1
 562 1:
 563     move.b  %d1, %d6
 564     lsl.l   #8, %d6
 565     move.b  %d0, %d6
 566
 567     movem.l %d5-%d6, (%a1)      | write all 8 output bytes at once
 568     add.l   %a2, %a1            | advance output pointer
 569     subq.l  #1, %d4             | loop 8 times
 570     bne.w   .dc_clip_loop
 571
 572 .idct_add_end:
 573     movem.l (%sp), %d2-%d7/%a2-%a6
 574     lea.l   (11*4,%sp), %sp
 575     rts