apps/dsp_cf.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2006 Thom Johansen
  11  * Portions Copyright (C) 2007 Michael Sevakis
  12  *
  13  * All files in this archive are subject to the GNU General Public License.
  14  * See the file COPYING in the source tree root for full license agreement.
  15  *
  16  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  17  * KIND, either express or implied.
  18  *
  19  ****************************************************************************/
  20
  21 /****************************************************************************
  22  * void apply_crossfeed(int count, int32_t *src[])
      *
      * Applies crossfeed in place to src[0] (left) and src[1] (right). For each
      * sample, a channel's output is its own input scaled by the direct gain
      * plus a filtered copy of the opposite channel's delayed input:
      *   acc = b1*d[n - 1] + b0*d[n] + a1*y[n - 1]  (d = delayed opposite input)
      *   out = acc + gain*x[n]
      * The filter output y[n] (acc before the gain term) is kept as history.
      *
      * State is read from and written back to crossfeed_data, addressed here
      * by byte offset:
      *   +0        direct gain
      *   +4..+12   filter coefs b0, b1, a1
      *   +16..+28  four history values (kept in %d0-%d3 during the loop)
      *   +136      current delay line pointer
      * The delay pointer advances 8 bytes (one L/R pair) per sample and is
      * moved back by 104 bytes once it reaches crossfeed_data+136.
      *
      * The loop tests count at the bottom, so the body always runs at least
      * once; count is presumably > 0 - confirm against callers.
      * %macsr is not written here; the EMAC mode is assumed to have been set
      * up by the caller - TODO confirm.
  23  */
  24     .section    .text
  25     .global     apply_crossfeed
  26 apply_crossfeed:
  27     lea.l       -44(%sp), %sp
  28     movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
                                             | args sit above the 44 saved bytes
                                             | and the return address
  29     movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
  30     movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
  31     lea.l       crossfeed_data, %a1
  32     move.l      (%a1)+, %a6             | a6 = direct gain
                                             | %a1 now = crossfeed_data+4
  33     movem.l     12(%a1), %d0-%d3        | fetch filter history samples
  34     move.l      132(%a1), %a0           | fetch delay line address
  35     movem.l     (%a1), %a1-%a3          | load filter coefs
  36     /* Register usage in loop:
  37      * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
  38      * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
  39      * %d0..%d3 = history
  40      * %d4..%d6 = temp.
  41      * %d7 = count
  42      */
  43 .cfloop:
                                             | left output: uses the delayed
                                             | right input stored at 4(%a0)
  44     mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
  45     mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
  46     mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
  47     move.l      %acc0, %d1              | get filtered delayed sample
  48     mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
  49     movclr.l    %acc0, %d6              | fetch result and clear acc
  50     move.l      %d6, (%a4)+             | write result
  51
                                             | right output: uses the delayed
                                             | left input stored at (%a0)
  52     mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
  53     mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
  54     mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
  55     movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
  56     move.l      %acc0, %d3              | get filtered delayed sample
  57     mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
  58     lea.l       8(%a0), %a0             | increment delay pointer
  59     movclr.l    %acc0, %d6              | fetch result and clear acc
  60     move.l      %d6, (%a5)+             | write result
  61
  62     cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
  63     bge.b       .cfwrap                 |  reached end? go do the wrap
                                             | otherwise tpf.l takes the 4-byte
                                             | lea below as its operand, so the
                                             | wrap is skipped
  64     .word       0x51fb                  | tpf.l - trap the buffer wrap
  65 .cfwrap:
  66     lea.l       -104(%a0), %a0          | wrap
  67     subq.l      #1, %d7                 | --count > 0 ?
  68     bgt.b       .cfloop                 |  yes? process next sample
  69     lea.l       crossfeed_data+16, %a1  | save data back to struct
  70     movem.l     %d0-%d3, (%a1)          | ...history
  71     move.l      %a0, 120(%a1)           | ...delay_p
  72     movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
  73     lea.l       44(%sp), %sp
  74     rts
  75 .cfend:
  76     .size       apply_crossfeed,.cfend-apply_crossfeed
  77
  78
  79 /****************************************************************************
  80  * int dsp_downsample(int count, struct dsp_data *data,
  81  *                    int32_t *src[], int32_t *dst[])
      *
      * Resamples count input samples per channel by linear interpolation.
      * phase and delta are 16.16 fixed point: the upper 16 bits are the source
      * position (pos), the lower 16 bits the interpolation fraction. Each
      * output is
      *   s[pos - 1] + frac * (s[pos] - s[pos - 1])
      * and for pos == 0 the "previous" sample is last_sample[ch-1], saved from
      * the end of the preceding call.
      *
      * Fields of *data used, by byte offset: +4 num_channels, +8 delta,
      * +12 phase, +16.. last_sample[]. Every channel restarts from the stored
      * phase; the phase is written back once, after the last channel, reduced
      * by count << 16.
      *
      * Returns the number of samples written to dst[0].
      *
      * %macsr is not written here; the frac computation suggests the EMAC is
      * expected to be in fractional mode - TODO confirm with the caller.
      * s[count-1] is read unconditionally, so count is presumably > 0.
  82  */
  83     .section    .text
  84     .global     dsp_downsample
  85 dsp_downsample:
  86     lea.l       -40(%sp), %sp           | save non-clobberables
  87     movem.l     %d2-%d7/%a2-%a5, (%sp)  |
  88     movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
  89                                         | %a0 = data
  90                                         | %a1 = src
  91                                         | %a2 = dst
  92     movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
  93                                         | %d4 = delta = data->resample_data.delta
  94     moveq.l     #16, %d7                | %d7 = shift
  95 .dschannel_loop:
  96     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
  97     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
  98     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
  99     lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.last_sample[ch-1]
 100     move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
 101     move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
 102     move.l      %d5, %d6                | %d6 = pos = phase >> 16
 103     lsr.l       %d7, %d6                |
 104     cmp.l       %d2, %d6                | past end of samples?
 105     bge.b       .dsloop_skip            | yes? skip loop
 106     tst.l       %d6                     | need last sample of prev. frame?
 107     bne.b       .dsloop                 | no? start main loop
 108     move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
 109     bra.b       .dsuse_last_start       | start with last (last in %d0)
 110 .dsloop:
 111     lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
 112     movem.l     (%a5), %d0-%d1          |
 113 .dsuse_last_start:
 114     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
 115     move.l      %d0, %acc0              | %acc0 = previous sample
 116     move.l      %d5, %d0                | frac = (phase << 16) >> 1
 117     lsl.l       %d7, %d0                |  (drop pos, then clear the sign
 118     lsr.l       #1, %d0                 |   bit so frac is non-negative)
 119     mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
 120     add.l       %d4, %d5                | phase += delta
 121     move.l      %d5, %d6                | pos = phase >> 16
 122     lsr.l       %d7, %d6                |
 123     movclr.l    %acc0, %d0              | fetch result and clear acc
 124     move.l      %d0, (%a4)+             | *d++ = %d0
 125     cmp.l       %d2, %d6                | pos < count?
 126     blt.b       .dsloop                 | yes? continue resampling
 127 .dsloop_skip:
 128     subq.l      #1, %d3                 | ch > 0?
 129     bgt.b       .dschannel_loop         | yes? process next channel
 130     asl.l       %d7, %d2                | wrap phase to start of next frame
 131     sub.l       %d2, %d5                | data->resample_data.phase =
 132     move.l      %d5, 12(%a0)            | ... phase - (count << 16)
 133     move.l      %a4, %d0                | return d - d[0]
 134     sub.l       (%a2), %d0              |  (%a4 is channel 0's d here)
 135     asr.l       #2, %d0                 | convert bytes->samples
 136     movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
 137     lea.l       40(%sp), %sp            | cleanup stack
 138     rts                                 | buh-bye
 139 .dsend:
 140     .size       dsp_downsample,.dsend-dsp_downsample
 141
 142 /****************************************************************************
 143  * int dsp_upsample(int count, struct dsp_data *dsp,
 144  *                  int32_t *src[], int32_t *dst[])
      *
      * Resamples count input samples per channel by linear interpolation,
      * producing one or more outputs per input step. Each output is
      *   prev + frac * (next - prev)
      * where prev is the previous source sample (last_sample[ch-1] from the
      * preceding call when starting at position 0).
      *
      * phase and delta are 16.16 fixed point. Both are word-swapped while in
      * use so the fraction sits in the upper 16 bits: "phase += delta" then
      * sets the carry flag exactly when the fraction overflows, which is the
      * signal to step to the next source sample.
      *
      * Fields of *dsp used, by byte offset: +4 channel count, +8 delta,
      * +12 phase, +16.. last_sample[]. Every channel restarts from the stored
      * phase; the phase is written back once, after the last channel.
      *
      * Returns the number of samples written to dst[0].
      *
      * %macsr is not written here; the EMAC is presumably expected to be in
      * fractional mode - TODO confirm with the caller.
      * s[count-1] is read unconditionally, so count is presumably > 0.
 145  */
 146     .section    .text
 147     .global     dsp_upsample
 148 dsp_upsample:
 149     lea.l       -40(%sp), %sp           | save non-clobberables
 150     movem.l     %d2-%d7/%a2-%a5, (%sp)  |
 151     movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
 152                                         | %a0 = data
 153                                         | %a1 = src
 154                                         | %a2 = dst
 155     movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
 156                                         | %d4 = delta = data->resample_data.delta
 157     swap        %d4                     | swap delta to high word to use
 158                                         | carries to increment position
 159 .uschannel_loop:
 160     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
 161     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
 162     lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
 163     lea.l       (%a3, %d2.l*4), %a5     | %a5 = &s[count]
 164     move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
 165     move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
                                             | %a5 = src_end = &s[count-1] from here
 166     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
 167     swap        %d5                     | swap phase to high word to use
 168                                         | carries to increment position
 169     move.l      %d5, %d6                | %d6 = pos = phase >> 16
 170     clr.w       %d5                     | %d5 = frac in high word, low word 0
 171     eor.l       %d5, %d6                | pos == 0?
 172     beq.b       .usstart_0              | yes? start from last (in %d0)
                                             | no? transition from downsampling
                                             | Compare pos with count (%d2), not
                                             | with the channel counter (%d3).
 173     cmp.l       %d2, %d6                | past end of samples?
 174     bge.b       .usloop_skip            | yes? skip loop
 175     lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
 176     move.l      (%a3)+, %d0             | %d0 = *s++
                                             | tpf.w takes the 2-byte move at
                                             | .usloop_1 as its operand, so the
                                             | first pass skips it
 177     .word       0x51fa                  | tpf.w - trap next instruction
 178 .usloop_1:
 179     move.l      %d6, %d0                | move previous sample to %d0
 180 .usstart_0:
 181     move.l      (%a3)+, %d1             | fetch next sample
 182     move.l      %d1, %d6                | save sample value
 183     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
 184 .usloop_0:
 185     lsr.l       #1, %d5                 | make phase into frac
 186     mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
 187     lsl.l       #1, %d5                 | restore frac to phase
 188     movclr.l    %acc0, %d7              | %d7 = product
 189     add.l       %d0, %d7                | %d7 = last + product
 190     move.l      %d7, (%a4)+             | *d++ = %d7
 191     add.l       %d4, %d5                | phase += delta
 192     bcc.b       .usloop_0               | no carry? same source samples
 193     cmp.l       %a5, %a3                | src <= src_end?
 194     ble.b       .usloop_1               | yes? continue resampling
 195 .usloop_skip:
 196     subq.l      #1, %d3                 | ch > 0?
 197     bgt.b       .uschannel_loop         | yes? process next channel
 198     swap        %d5                     | wrap phase to start of next frame
 199     move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
 200     move.l      %a4, %d0                | return d - d[0]
 201     sub.l       (%a2), %d0              |  (%a4 is channel 0's d here)
 202     movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
 203     asr.l       #2, %d0                 | convert bytes->samples
 204     lea.l       40(%sp), %sp            | cleanup stack
 205     rts                                 | buh-bye
 206 .usend:
 207     .size       dsp_upsample,.usend-dsp_upsample
 208
 209 /* These routines might benefit from burst transfers but we'll keep them
 210  * small for now since they're rather lightweight.
 211  */
 212
 213 /****************************************************************************
 214  * void channels_process_sound_chan_mono(int count, int32_t *buf[])
 215  *
 216  * Mix left and right channels 50/50 into a center channel.
      *
      * Works in place: for each of count samples, (buf[0][i] + buf[1][i]) / 2
      * is computed with the EMAC and stored to both buf[0][i] and buf[1][i].
      * %macsr is saved on entry, set to 0xb0 for the loop and restored on
      * exit; %d2-%d3 are preserved as well.
      *
      * The loop tests count at the bottom, so the body always runs at least
      * once; count is presumably > 0 - confirm against callers.
 217  */
 218     .section    .text
 219     .global     channels_process_sound_chan_mono
 220 channels_process_sound_chan_mono:
 221     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
 222     lea.l       -12(%sp), %sp           | save registers
 223     move.l      %macsr, %d1             | %d1 = caller's %macsr, saved
 224     movem.l     %d1-%d3, (%sp)          |  together with %d2-%d3
 225     move.l      #0xb0, %macsr           | put emac in rounding fractional mode
 226     movem.l     (%a0), %a0-%a1          | get channel pointers
 227     move.l      #0x40000000, %d3        | %d3 = 0.5
 228 1:
 229     move.l     (%a0), %d1               | L = R = l/2 + r/2
 230     mac.l      %d1, %d3, (%a1), %d2, %acc0 | acc = l*0.5, load r
 231     mac.l      %d2, %d3, %acc0          | acc += r*0.5
 232     movclr.l   %acc0, %d1               | fetch result and clear acc
 233     move.l     %d1, (%a0)+              | output to original buffer
 234     move.l     %d1, (%a1)+              |
 235     subq.l     #1, %d0                  | --count > 0 ?
 236     bgt.s      1b                       |  yes? next sample
 237     movem.l    (%sp), %d1-%d3           | restore registers
 238     move.l     %d1, %macsr              |
 239     lea.l      12(%sp), %sp             | cleanup
 240     rts
 241 .cpmono_end:
 242     .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
 243
 244
 245 /****************************************************************************
 246  * void channels_process_sound_chan_custom(int count, int32_t *buf[])
 247  *
 248  * Apply stereo width (narrowing/expanding) effect.
      *
      * Works in place on buf[0] (l) and buf[1] (r). For each of count samples:
      *   L = l*dsp_sw_gain  + r*dsp_sw_cross
      *   R = r*dsp_sw_gain  + l*dsp_sw_cross
      * dsp_sw_gain and dsp_sw_cross are globals defined outside this file and
      * are read once before the loop. %macsr is saved on entry, set to 0xb0
      * for the loop and restored on exit; %d2-%d4 are preserved as well.
      *
      * The loop tests count at the bottom, so the body always runs at least
      * once; count is presumably > 0 - confirm against callers.
 249  */
 250     .section    .text
 251     .global     channels_process_sound_chan_custom
 252 channels_process_sound_chan_custom:
 253     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
 254     lea.l       -16(%sp), %sp           | save registers
 255     move.l      %macsr, %d1             | %d1 = caller's %macsr, saved
 256     movem.l     %d1-%d4, (%sp)          |  together with %d2-%d4
 257     move.l      #0xb0, %macsr           | put emac in rounding fractional mode
 258     movem.l     (%a0), %a0-%a1          | get channel pointers
 259     move.l      dsp_sw_gain, %d3        | load straight (mid) gain
 260     move.l      dsp_sw_cross, %d4       | load cross (side) gain
 261 1:
 262     move.l      (%a0), %d1              | %d1 = l
 263     mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
 264     mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
 265     mac.l       %d2, %d4            , %acc0 |
 266     mac.l       %d2, %d3            , %acc1 |
 267     movclr.l    %acc0, %d1              | %d1 = L, clear acc0
 268     movclr.l    %acc1, %d2              | %d2 = R, clear acc1
 269     move.l      %d1, (%a0)+             | store L
 270     move.l      %d2, (%a1)+             | store R
 271     subq.l      #1, %d0                 | --count > 0 ?
 272     bgt.s       1b                      |  yes? next sample
 273     movem.l     (%sp), %d1-%d4          | restore registers
 274     move.l      %d1, %macsr             |
 275     lea.l       16(%sp), %sp            | cleanup
 276     rts
 277 .cpcustom_end:
 278     .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
 279
 280 /****************************************************************************
 281  *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
 282  *
 283  *  Separate channels into side channels.
      *
      *  Works in place on buf[0] (l) and buf[1] (r). For each of count samples:
      *    R = r/2 - l/2
      *    L = -R
      *  %macsr is saved on entry, set to 0xb0 for the loop and restored on
      *  exit; %d2-%d4 are preserved as well (%d3 is saved but not used).
      *
      *  The loop tests count at the bottom, so the body always runs at least
      *  once; count is presumably > 0 - confirm against callers.
 284  */
 285     .section    .text
 286     .global     channels_process_sound_chan_karaoke
 287 channels_process_sound_chan_karaoke:
 288     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
 289     lea.l       -16(%sp), %sp           | save registers
 290     move.l      %macsr, %d1             | %d1 = caller's %macsr, saved
 291     movem.l     %d1-%d4, (%sp)          |  together with %d2-%d4
 292     move.l      #0xb0, %macsr           | put emac in rounding fractional mode
 293     movem.l     (%a0), %a0-%a1          | get channel pointers
 294     move.l      #0x40000000, %d4        | %d4 = 0.5
 295 1:
 296     move.l     (%a0), %d1               | %d1 = l
 297     msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
 298     mac.l      %d2, %d4            , %acc0 |
 299     movclr.l   %acc0, %d1               | %d1 = R, clear acc
 300     move.l     %d1, (%a1)+              | store R
 301     neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
 302     move.l     %d1, (%a0)+              | store L
 303     subq.l     #1, %d0                  | --count > 0 ?
 304     bgt.s      1b                       |  yes? next sample
 305     movem.l    (%sp), %d1-%d4           | restore registers
 306     move.l     %d1, %macsr              |
 307     lea.l      16(%sp), %sp             | cleanup
 308     rts
 309 .cpkaraoke_end:
 310     .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
 311 /****************************************************************************
 312  * void sample_output_stereo(int count, struct dsp_data *data,
 313  *                               int32_t *src[], int16_t *dst)
 314  *
 315  * Framework based on the ubiquitous Rockbox line transfer logic for
 316  * Coldfire CPUs.
 317  *
 318  * Does emac clamping and scaling (which proved faster than the usual
 319  * checks and branches - even single test clamping) and writes using
 320  * line burst transfers. Also better than writing a single L-R pair per
 321  * loop but a good deal more code.
 322  *
 323  * Attempting bursting during reads is rather futile since the source and
 324  * destination alignments rarely agree and too much complication will
 325  * slow us up. The parallel loads seem to do a bit better at least until
 326  * a pcm buffer can always give line aligned chunk and then aligning the
 327  * dest can then imply the source is aligned if the source buffers are.
 328  * For now longword alignment is assumed of both the source and dest.
 329  *
      * Converts count samples from src[0] (L) and src[1] (R) into interleaved
      * 16-bit L/R pairs at dst, one longword (4 bytes) per stereo frame. Each
      * sample is multiplied by 1 << (16 - scale), where scale is the first
      * longword of *data, and the upper 16 bits of the EMAC result are stored.
      *
      * The destination is processed in three phases:
      *   1. single longwords up to the first 16-byte line bound,
      *   2. whole 16-byte lines (four frames each),
      *   3. single longwords for whatever remains.
      * Phases 1 and 2 are only entered when at least one whole aligned line
      * fits before the end address; otherwise everything goes through phase
      * 3, so nothing is written past dst + count*4.
      *
      * %macsr is saved on entry, set to 0x80 and restored on exit.
 330  */
 331     .section   .text
 332     .global    sample_output_stereo
 333 sample_output_stereo:
 334     lea.l       -44(%sp), %sp             | save registers
 335     move.l      %macsr, %d1               | do it now as at many lines will
 336     movem.l     %d1-%d7/%a2-%a5, (%sp)    | be the far more common condition
 337     move.l      #0x80, %macsr             | put emac unit in signed int mode
 338     movem.l     48(%sp), %a0-%a2/%a4      | %a0 = count, %a1 = data,
                                               | %a2 = src, %a4 = dst
 339     lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address
 340     move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
 341     sub.l       #16, %d1                  |
 342     neg.l       %d1                       |
 343     moveq.l     #1, %d0                   |
 344     asl.l       %d1, %d0                  |
 345     move.l      %d0, %a1                  |
 346     movem.l     (%a2), %a2-%a3            | get L/R channel pointers
 347     moveq.l     #28, %d0                  | %d0 = second line bound
 348     add.l       %a4, %d0                  |
 349     and.l       #0xfffffff0, %d0          |
                                               | The bound is always above dst, so
                                               | it must be tested against the END
                                               | address to detect a short buffer.
 350     cmp.l       %a0, %d0                  | at least a full line?
 351     bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
 352     sub.l       #16, %d0                  | %d0 = first line bound
 353     cmp.l       %a4, %d0                  | any leading longwords?
 354     bls.b       .sos_lineloop_start       | no? jump to line loop
 355 .sos_longloop_0:
 356     move.l      (%a2)+, %d1               | read longword from L and R
 357     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
 358     mac.l       %d2, %a1, %acc1           | shift R to high word
 359     movclr.l    %acc0, %d1                | get possibly saturated results
 360     movclr.l    %acc1, %d2                |
 361     swap        %d2                       | move R to low word
 362     move.w      %d2, %d1                  | interleave MS 16 bits of each
 363     move.l      %d1, (%a4)+               | ...and write both
 364     cmp.l       %a4, %d0                  | reached first line bound?
 365     bhi.b       .sos_longloop_0           | no? next leading longword
 366 .sos_lineloop_start:
 367     lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
 368 .sos_lineloop:
 369     move.l      (%a3)+, %d4               | get next 4 R samples and scale
 370     mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
 371     mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
 372     mac.l       %d6, %a1, (%a3)+, %d7, %acc2 |
 373     mac.l       %d7, %a1, (%a2)+, %d0, %acc3 |
 374     lea.l       16(%a4), %a4              | increment dest here, mitigate stalls
 375     movclr.l    %acc0, %d4                | obtain R results
 376     movclr.l    %acc1, %d5                |
 377     movclr.l    %acc2, %d6                |
 378     movclr.l    %acc3, %d7                |
 379     mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | get next 4 L samples and scale
 380     mac.l       %d1, %a1, (%a2)+, %d2, %acc1 | with saturation
 381     mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
 382     mac.l       %d3, %a1             , %acc3 |
 383     swap        %d4                       | a) interleave most significant...
 384     swap        %d5                       |
 385     swap        %d6                       |
 386     swap        %d7                       |
 387     movclr.l    %acc0, %d0                | obtain L results
 388     movclr.l    %acc1, %d1                |
 389     movclr.l    %acc2, %d2                |
 390     movclr.l    %acc3, %d3                |
 391     move.w      %d4, %d0                  | a) ... 16 bits of L and R
 392     move.w      %d5, %d1                  |
 393     move.w      %d6, %d2                  |
 394     move.w      %d7, %d3                  |
 395     movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
 396     cmp.l       %a4, %a5                  | room for another whole line?
 397     bhi.b       .sos_lineloop             | yes? next line
 398 .sos_longloop_1_start:
 399     cmp.l       %a4, %a0                  | any longwords left?
 400     bls.b       .sos_done                 | no? finished.
 401 .sos_longloop_1:
 402     move.l      (%a2)+, %d1               | handle trailing longwords
 403     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
 404     mac.l       %d2, %a1, %acc1           |
 405     movclr.l    %acc0, %d1                |
 406     movclr.l    %acc1, %d2                |
 407     swap        %d2                       |
 408     move.w      %d2, %d1                  |
 409     move.l      %d1, (%a4)+               |
 410     cmp.l       %a4, %a0                  | reached end address?
 411     bhi.b       .sos_longloop_1           | no? next trailing longword
 412 .sos_done:
 413     movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
 414     move.l      %d1, %macsr               |
 415     lea.l       44(%sp), %sp              | cleanup
 416     rts                                   |
 417 .sos_end:
 418     .size      sample_output_stereo, .sos_end-sample_output_stereo
 419
 420 /****************************************************************************
 421  * void sample_output_mono(int count, struct dsp_data *data,
 422  *                         int32_t *src[], int16_t *dst)
 423  *
 424  * Same treatment as sample_output_stereo but for one channel.
      *
      * Converts count samples from src[0] into 16-bit values and writes each
      * one twice (as L and R) to dst, one longword (4 bytes) per frame. Each
      * sample is multiplied by 1 << (16 - scale), where scale is the first
      * longword of *data, and the upper 16 bits of the EMAC result are stored.
      *
      * The destination is processed in three phases:
      *   1. single longwords up to the first 16-byte line bound,
      *   2. whole 16-byte lines (four frames each),
      *   3. single longwords for whatever remains.
      * Phases 1 and 2 are only entered when at least one whole aligned line
      * fits before the end address; otherwise everything goes through phase
      * 3, so nothing is written past dst + count*4.
      *
      * %macsr is saved on entry, set to 0x80 and restored on exit.
 425  */
 426     .section   .text
 427     .global    sample_output_mono
 428 sample_output_mono:
 429     lea.l       -28(%sp), %sp             | save registers
 430     move.l      %macsr, %d1               | do it now as at many lines will
 431     movem.l     %d1-%d5/%a2-%a3, (%sp)    | be the far more common condition
 432     move.l      #0x80, %macsr             | put emac unit in signed int mode
 433     movem.l     32(%sp), %a0-%a3          | %a0 = count, %a1 = data,
                                               | %a2 = src, %a3 = dst
 434     lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address
 435     move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
 436     sub.l       #16, %d1                  |
 437     neg.l       %d1                       |
 438     moveq.l     #1, %d5                   |
 439     asl.l       %d1, %d5                  |
 440     movem.l     (%a2), %a2                | get source channel pointer
 441     moveq.l     #28, %d0                  | %d0 = second line bound
 442     add.l       %a3, %d0                  |
 443     and.l       #0xfffffff0, %d0          |
                                               | The bound is always above dst, so
                                               | it must be tested against the END
                                               | address to detect a short buffer.
 444     cmp.l       %a0, %d0                  | at least a full line?
 445     bhi.w       .som_longloop_1_start     | no? jump to trailing longword
 446     sub.l       #16, %d0                  | %d0 = first line bound
 447     cmp.l       %a3, %d0                  | any leading longwords?
 448     bls.b       .som_lineloop_start       | no? jump to line loop
 449 .som_longloop_0:
 450     move.l      (%a2)+, %d1               | read one source sample
 451     mac.l       %d1, %d5, %acc0           | shift sample to high word
 452     movclr.l    %acc0, %d1                | get possibly saturated results
 453     move.l      %d1, %d2                  |
 454     swap        %d2                       | copy high word to low word
 455     move.w      %d2, %d1                  | duplicate single channel into
 456     move.l      %d1, (%a3)+               | L and R
 457     cmp.l       %a3, %d0                  | reached first line bound?
 458     bhi.b       .som_longloop_0           | no? next leading longword
 459 .som_lineloop_start:
 460     lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
 461 .som_lineloop:
 462     move.l      (%a2)+, %d0               | get next 4 samples and scale
 463     mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
 464     mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
 465     mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
 466     mac.l       %d3, %d5             , %acc3 |
 467     lea.l       16(%a3), %a3              | increment dest here, mitigate stalls
 468     movclr.l    %acc0, %d0                | obtain results
 469     movclr.l    %acc1, %d1                |
 470     movclr.l    %acc2, %d2                |
 471     movclr.l    %acc3, %d3                |
 472     move.l      %d0, %d4                  | duplicate single channel
 473     swap        %d4                       | into L and R
 474     move.w      %d4, %d0                  |
 475     move.l      %d1, %d4                  |
 476     swap        %d4                       |
 477     move.w      %d4, %d1                  |
 478     move.l      %d2, %d4                  |
 479     swap        %d4                       |
 480     move.w      %d4, %d2                  |
 481     move.l      %d3, %d4                  |
 482     swap        %d4                       |
 483     move.w      %d4, %d3                  |
 484     movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
 485     cmp.l       %a3, %a1                  | room for another whole line?
 486     bhi.b       .som_lineloop             | yes? next line
 487 .som_longloop_1_start:
 488     cmp.l       %a3, %a0                  | any longwords left?
 489     bls.b       .som_done                 | no? finished.
 490 .som_longloop_1:
 491     move.l      (%a2)+, %d1               | handle trailing longwords
 492     mac.l       %d1, %d5, %acc0           | the same way as leading ones
 493     movclr.l    %acc0, %d1                |
 494     move.l      %d1, %d2                  |
 495     swap        %d2                       |
 496     move.w      %d2, %d1                  |
 497     move.l      %d1, (%a3)+               |
 498     cmp.l       %a3, %a0                  | reached end address?
 499     bhi.b       .som_longloop_1           | no? next trailing longword
 500 .som_done:
 501     movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
 502     move.l      %d1, %macsr               |
 503     lea.l       28(%sp), %sp              | cleanup
 504     rts                                   |
 505 .som_end:
 506     .size      sample_output_mono, .som_end-sample_output_mono