apps/codecs/libspeex/filters_cf.S

   1 /* Copyright (C) 2007 Thom Johansen */
   2 /**
   3    @file filters_cf.S
   4    @brief Various analysis/synthesis filters (Coldfire version)
   5 */
   6 /*
   7    Redistribution and use in source and binary forms, with or without
   8    modification, are permitted provided that the following conditions
   9    are met:
  10
  11    - Redistributions of source code must retain the above copyright
  12    notice, this list of conditions and the following disclaimer.
  13
  14    - Redistributions in binary form must reproduce the above copyright
  15    notice, this list of conditions and the following disclaimer in the
  16    documentation and/or other materials provided with the distribution.
  17
  18    - Neither the name of the Xiph.org Foundation nor the names of its
  19    contributors may be used to endorse or promote products derived from
  20    this software without specific prior written permission.
  21
  22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
  26    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  27    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  28    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  29    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  30    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  31    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  32    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33 */
  34
  35     .text
  36 /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
     /*
        IIR filter producing clipped 16-bit output, with unrolled routines
        for filter orders 8 and 10 only.

        Stack arguments as read below (after the 44-byte register save):
          x   -> a3   16-bit input samples, read with post-increment
          den -> a4   16-bit coefficients, fetched two at a time as longwords
          y   -> a5   16-bit output samples, written with post-increment
          N           sample count; it stays in its stack slot and is
                      decremented in place as the loop counter
          ord -> d0   selects the unrolled routine
          mem -> a6   32-bit filter state, ord entries; held in registers
                      during the loop and stored back when it finishes
          stack       never read by this routine

        For each sample:
          y[i]       = clip(x[i] + ((mem[0] + 4096) >> 13)) to [-32767..32767]
          mem[j]     = mem[j+1] - den[j]*y[i]      for j = 0 .. ord-2
          mem[ord-1] = -den[ord-1]*y[i]

        Notes:
          - Any ord other than 8 or 10 branches straight to .exit, leaving
            y[] and mem[] untouched.
          - The sample loop tests the count only after decrementing it, so N
            must be non-zero on entry.
          - MACSR is not written here.  NOTE(review): presumably the caller
            is expected to leave the MAC unit in integer mode - confirm.
          - d2-d7/a2-a6 are saved on entry and restored at .exit.
     */
  37     .global iir_mem16
  38 iir_mem16:
  39     lea.l    (-44, %sp), %sp
  40     movem.l  %d2-%d7/%a2-%a6, (%sp)
  41     movem.l  (44+4, %sp), %a3-%a5   | a3 = x, a4 = den, a5 = y
  42     movem.l  (44+20, %sp), %d0/%a6  | d0 = ord, a6 = mem
  43     moveq.l  #8, %d1                | Jump to correct routine based on 'ord'
  44     cmp.l    %d1, %d0
  45     jeq      .order_8
  46     moveq.l  #10, %d1
  47     cmp.l    %d1, %d0
  48     jeq      .order_10
  49     jra      .exit
  50
  51     | TODO: try using direct form 1 filtering
  52     | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7]
  53     | a3 = x, a4 = den, a5 = y, a6 = temp
  54 .order_8:
  55     movem.l  (%a6), %d1-%d7/%a0 | Fetch mem[] array
  56 0:
  57     moveq.l  #13, %d0
  58     add.l    #4096, %d1
  59     asr.l    %d0, %d1           | mem[0] >> 13 with rounding
  60     move.w   (%a3)+, %d0
  61     ext.l    %d0
  62     add.l    %d1, %d0           | Add with x[i]
         /* Clip with a single compare: after adding the 32767 bias, every
            in-range value satisfies 0 <= d0 <= 65534 as an unsigned number,
            so jls skips the fix-up.  Otherwise the sign of (d0 - 65534)
            picks the side: jpl takes the clip-high path, fall-through clips
            low.  The .word after the clip-low clr.l is a trapf.w whose
            extension word is the following clip-high move, so that move is
            not executed on the clip-low path. */
  63     move.l   #32767, %d1
  64     move.l   #65534, %a6
  65     add.l    %d1, %d0           | Bias result to [-1..65534]
  66     cmp.l    %a6, %d0           | Now do clip to [0..65534] range
  67     jls      2f
  68     jpl      1f
  69     clr.l    %d0                | Clip low
  70     .word    0x51fa             | trapf.w, shadow next insn
  71 1:
  72     move.l   %a6, %d0           | Clip high
  73 2:
  74     sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
  75     move.w   %d0, (%a5)+        | Write result to y[i]
         /* From here d0 holds -y[i], so the plain mac.w products below are
            -den[j]*y[i].  A mac.w carrying the extra "(%a4)+, %a6" operands
            also fetches the next coefficient pair into a6; the multiply
            itself uses the pair that was in a6 beforehand.  Each accumulator
            is read and zeroed with movclr.l, then the old mem[j+1] is added
            to form the new mem[j]. */
  76     neg.l    %d0                | msac.w is bugged in gas, do this for now
  77     move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
  78     mac.w    %a6u, %d0l, %acc0
  79     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc1
  80     mac.w    %a6u, %d0l, %acc2
  81     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc3
  82     movclr.l %acc0, %d1
  83     add.l    %d2, %d1           | mem[0] = mem[1] - den[0]*y[i]
  84     movclr.l %acc1, %d2
  85     add.l    %d3, %d2           | mem[1] = mem[2] - den[1]*y[i]
  86     movclr.l %acc2, %d3
  87     add.l    %d4, %d3           | mem[2] = mem[3] - den[2]*y[i]
  88     movclr.l %acc3, %d4
  89     add.l    %d5, %d4           | mem[3] = mem[4] - den[3]*y[i]
  90     mac.w    %a6u, %d0l, %acc0
  91     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc1
  92     mac.w    %a6u, %d0l, %acc2
  93     mac.w    %a6l, %d0l, %acc3
         /* Four coefficient longwords (16 bytes) were consumed this pass. */
  94     lea.l    (-16, %a4), %a4    | wrap den pointer back to den[0]
  95     movclr.l %acc0, %d5
  96     add.l    %d6, %d5           | mem[4] = mem[5] - den[4]*y[i]
  97     movclr.l %acc1, %d6
  98     add.l    %d7, %d6           | mem[5] = mem[6] - den[5]*y[i]
  99     movclr.l %acc2, %d7
 100     add.l    %a0, %d7           | mem[6] = mem[7] - den[6]*y[i]
 101     movclr.l %acc3, %a0         | mem[7] = -den[7]*y[i]
         /* N is counted down in its stack slot.  a6 was used as a temporary
            inside the loop, so the mem pointer is re-read from the stack
            before the state is written back. */
 102     subq.l   #1, (44+16, %sp)   | Have we done all samples?
 103     jne      0b
 104     move.l   (44+24, %sp), %a6  | Fetch mem pointer
 105     movem.l  %d1-%d7/%a0, (%a6) | Save back mem[]
 106     jra     .exit
 107
         /* Order-10 variant: same structure as .order_8, with two more state
            words kept in a1/a2 and five coefficient longwords consumed per
            sample (hence the 20-byte rewind of the den pointer).  The clip
            sequence and the negated-y mac.w chain work exactly as in
            .order_8. */
 108     | d0 = y[i], d1-d7, a0-a2 = mem[0] .. mem[9]
 109     | a3 = x, a4 = den, a5 = y, a6 = temp
 110 .order_10:
 111     movem.l  (%a6), %d1-%d7/%a0-%a2 | Fetch mem[] array
 112 0:
 113     moveq.l  #13, %d0
 114     add.l    #4096, %d1
 115     asr.l    %d0, %d1           | mem[0] >> 13 with rounding
 116     move.w   (%a3)+, %d0
 117     ext.l    %d0
 118     add.l    %d1, %d0           | Add with x[i]
 119     move.l   #32767, %d1
 120     move.l   #65534, %a6
 121     add.l    %d1, %d0           | Bias result to [-1..65534]
 122     cmp.l    %a6, %d0           | Now do clip to [0..65534] range
 123     jls      2f
 124     jpl      1f
 125     clr.l    %d0                | Clip low
 126     .word    0x51fa             | trapf.w, shadow next insn
 127 1:
 128     move.l   %a6, %d0           | Clip high
 129 2:
 130     sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
 131     move.w   %d0, (%a5)+        | Write result to y[i]
 132     neg.l    %d0                | msac.w is bugged in gas, do this for now
 133     move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
 134     mac.w    %a6u, %d0l, %acc0
 135     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc1
 136     mac.w    %a6u, %d0l, %acc2
 137     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc3
 138     movclr.l %acc0, %d1
 139     add.l    %d2, %d1           | mem[0] = mem[1] - den[0]*y[i]
 140     movclr.l %acc1, %d2
 141     add.l    %d3, %d2           | mem[1] = mem[2] - den[1]*y[i]
 142     movclr.l %acc2, %d3
 143     add.l    %d4, %d3           | mem[2] = mem[3] - den[2]*y[i]
 144     movclr.l %acc3, %d4
 145     add.l    %d5, %d4           | mem[3] = mem[4] - den[3]*y[i]
 146     mac.w    %a6u, %d0l, %acc0
 147     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc1
 148     mac.w    %a6u, %d0l, %acc2
 149     mac.w    %a6l, %d0l, (%a4)+, %a6, %acc3
 150     lea.l    (-20, %a4), %a4    | wrap den pointer back to den[0]
 151     movclr.l %acc0, %d5
 152     add.l    %d6, %d5           | mem[4] = mem[5] - den[4]*y[i]
 153     movclr.l %acc1, %d6
 154     add.l    %d7, %d6           | mem[5] = mem[6] - den[5]*y[i]
 155     movclr.l %acc2, %d7
 156     add.l    %a0, %d7           | mem[6] = mem[7] - den[6]*y[i]
 157     movclr.l %acc3, %a0
 158     add.l    %a1, %a0           | mem[7] = mem[8] - den[7]*y[i]
 159     mac.w    %a6u, %d0l, %acc0
 160     mac.w    %a6l, %d0l, %acc1
 161     movclr.l %acc0, %a1
 162     add.l    %a2, %a1           | mem[8] = mem[9] - den[8]*y[i]
 163     movclr.l %acc1, %a2         | mem[9] = -den[9]*y[i]
 164
 165     subq.l   #1, (44+16, %sp)   | Have we done all samples?
 166     jne      0b
 167     move.l   (44+24, %sp), %a6  | Fetch mem pointer
 168     movem.l  %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
 169
         /* Common epilogue, also reached directly for unsupported orders. */
 170 .exit:
 171     movem.l  (%sp), %d2-%d7/%a2-%a6
 172     lea.l    (44, %sp), %sp
 173     rts
 174
 175
 176 /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
     /*
        QMF synthesis filter using the MAC unit with saturation enabled.

        Stack arguments as read below (after the 44-byte register save):
          x1, x2     -> a0, a1  16-bit inputs; N2 = N >> 1 entries of each
                                are read
          a          -> a2      16-bit coefficients, fetched in pairs as
                                longwords
          y          -> a3      16-bit output; four values are written per
                                outer-loop pass
          N, M       -> d0, d1  immediately halved to N2 and M2
          mem1, mem2 -> a4, a5  filter memory, M2 16-bit values of each are
                                read on entry and rewritten on exit
          stack                 never read; scratch space is carved out of
                                the CPU stack instead

        Scratch layout: two buffers xx1/xx2 of N2 + M2 16-bit entries each.
        Entries [0 .. N2-1] hold x1/x2 in reversed order, entries
        [N2 .. N2+M2-1] hold the values taken from mem1/mem2.  When the
        filter is done, entries [0 .. M2-1] are copied back into mem1/mem2.

        Notes:
          - The x copy loop, the inner loop and the outer loop all count
            down by 2 and test for exactly zero after the decrement, so N2
            and M2 must be non-zero and even.
          - MACSR is set to 0x80 on entry and to 0 on exit; the value it had
            on entry is not preserved.
          - NOTE(review): the prototype comment declares mem1/mem2 as
            spx_word32_t *, but the code accesses 16-bit values starting at
            byte offset 2 with a 4-byte stride.  Confirm against the C
            prototype whether these are the odd entries of a 16-bit array
            (as the "&mem1[1]" comments suggest) or halves of 32-bit
            entries.
          - d2-d7/a2-a6 are saved on entry and restored before returning.
     */
 177     .global qmf_synth
 178 qmf_synth:
 179     lea.l    (-44, %sp), %sp
 180     movem.l  %d2-%d7/%a2-%a6, (%sp)
 181     movem.l  (44+4, %sp), %a0-%a3          | a0 = x1, a1 = x2, a2 = a, a3 = y
 182     movem.l  (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2
 183     move.l   #0x80, %macsr                 | Enable saturation
 184
 185     | Comments make more sense when compared to the reference C version
 186     move.l   %a2, %d6                   | Backup a
 187     lsr.l    #1, %d0                    | N2 = N >> 1
 188     lsr.l    #1, %d1                    | M2 = M >> 1
 189     move.l   %d1, %d7                   | Backup M2
         /* d2 = -(N2 + M2) entries; scaled by 2 it is the negative byte size
            of one buffer.  xx1 is placed directly below the saved
            registers and xx2 directly below xx1.  sp is then moved below
            xx2 and the previous sp is pushed there, so the epilogue can
            recover it with a single load from (%sp). */
 190     clr.l    %d2
 191     sub.l    %d0, %d2
 192     sub.l    %d1, %d2                   | d2 = -(N2 + M2)
 193     lea.l    (%sp, %d2.l*2), %a2        | Alloc two buffers of N2 + M2 shorts
 194     lea.l    (%a2, %d2.l*2), %a6        | a2 = xx1, a6 = xx2
 195     move.l   %sp, %d3
 196     move.l   %a6, %sp                   | Update sp
 197     move.l   %d3, -(%sp)                | Stack old %sp
 198
 199     | Backwards copy x1 and x2 arrays to xx1 and xx2, assume N2 is power of two
 200     | TODO: these copying loops probably have more potential for optimization
         /* Each longword read takes two samples at once, walking x1/x2 from
            the end; swap.w exchanges the two halves so the pair is stored
            in reversed order.  Two entries are handled per pass, hence the
            counter step of 2. */
 201     lea.l    (%a0, %d0.l*2), %a0        | x1 += N2
 202     lea.l    (%a1, %d0.l*2), %a1        | x2 += N2
 203     move.l   %d0, %d2                   | Loop counter is N2
 204 0:
 205     move.l   -(%a0), %d3
 206     swap.w   %d3
 207     move.l   %d3, (%a2)+
 208     move.l   -(%a1), %d3
 209     swap.w   %d3
 210     move.l   %d3, (%a6)+
 211     subq.l   #2, %d2
 212     jne      0b
 213
 214     | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
         /* One 16-bit value is taken every 4 bytes, starting 2 bytes into
            mem1/mem2.  The offset pointers are kept in d3/d4 and restored
            after the loop for the copy-back at the end of the routine. */
 215     move.l   %d1, %d2                           | Loop counter is M2
 216     addq.l   #2, %a4                            | a4 = &mem1[1]
 217     addq.l   #2, %a5                            | a5 = &mem2[1]
 218     move.l   %a4, %d3                           | Backup mem1 and mem2
 219     move.l   %a5, %d4
 220 0:
 221     move.w   (%a4), (%a2)+
 222     move.w   (%a5), (%a6)+
 223     addq.l   #4, %a4
 224     addq.l   #4, %a5
 225     subq.l   #1, %d2
 226     jne      0b
 227     move.l   %d3, %a4                           | a4 = &mem1[1]
 228     move.l   %d4, %a5                           | a5 = &mem2[1]
 229
         /* a2/a6 now point just past the end of xx1/xx2; stepping back M2
            entries plus 4 bytes gives the starting read position. */
 230     clr.l    %d2
 231     sub.l    %d1, %d2                           | d2 = -M2
 232     lea.l    (-4, %a2, %d2.l*2), %a0            | a0 = &xx1[N2 - 2]
 233     lea.l    (-4, %a6, %d2.l*2), %a1            | a1 = &xx2[N2 - 2]
 234     move.l   %d6, %a2                           | a2 = a
 235
 236     | Main loop, register usage:
 237     | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
 238     | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
 239     | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
         /* Each outer pass builds four results in acc0..acc3.  The
            accumulators are preloaded with 32768 so that taking the upper
            16 bits afterwards rounds instead of truncating.  The inner
            loop is unrolled twice: the first half uses the coefficient
            pair in d6 with the fresh samples x11/x21, the second half uses
            the next pair with fresh x10/x20.  The last mac.w of each half
            also loads the following coefficient pair into d6. */
 240 0:  | Outerloop
 241     move.l   #32768, %d2                        | Rounding constant
 242     move.l   %d2, %acc0
 243     move.l   %d2, %acc1
 244     move.l   %d2, %acc2
 245     move.l   %d2, %acc3
 246     move.w   (%a0)+, %d2                        | d2 = x10
 247     move.w   (%a1)+, %d4                        | d4 = x20
 248     move.l   (%a2)+, %d6                        | d6 = [a0, a1]
 249 1:  | Innerloop
 250     move.w   (%a0)+, %d3                        | d3 = x11
 251     move.w   (%a1)+, %d5                        | d5 = x21
 252     mac.w    %d6u, %d3l, <<, %acc0              | acc0 += a0*x11
 253     msac.w   %d6u, %d5l, <<, %acc0              | acc0 -= a0*x21
 254     mac.w    %d6l, %d3l, <<, %acc1              | acc1 += a1*x11
 255     mac.w    %d6l, %d5l, <<, %acc1              | acc1 += a1*x21
 256     mac.w    %d6u, %d2l, <<, %acc2              | acc2 += a0*x10
 257     msac.w   %d6u, %d4l, <<, %acc2              | acc2 -= a0*x20
 258     mac.w    %d6l, %d2l, <<, %acc3              | acc3 += a1*x10
 259     mac.w    %d6l, %d4l, <<, (%a2)+, %d6, %acc3 | acc3 += a1*x20
 260
 261     move.w   (%a0)+, %d2                        | d2 = x10
 262     move.w   (%a1)+, %d4                        | d4 = x20
 263     mac.w    %d6u, %d2l, <<, %acc0              | acc0 += a0*x10
 264     msac.w   %d6u, %d4l, <<, %acc0              | acc0 -= a0*x20
 265     mac.w    %d6l, %d2l, <<, %acc1              | acc1 += a1*x10
 266     mac.w    %d6l, %d4l, <<, %acc1              | acc1 += a1*x20
 267     mac.w    %d6u, %d3l, <<, %acc2              | acc2 += a0*x11
 268     msac.w   %d6u, %d5l, <<, %acc2              | acc2 -= a0*x21
 269     mac.w    %d6l, %d3l, <<, %acc3              | acc3 += a1*x11
 270     mac.w    %d6l, %d5l, <<, (%a2)+, %d6, %acc3 | acc3 += a1*x21
 271     subq.l   #2, %d1
 272     jne      1b
 273
         /* d1 is 0 here.  During this pass a2 advanced 4 + 4*M2 bytes and
            a0/a1 advanced 2 + 2*M2 bytes; the lea instructions rewind a2 to
            the start of a[] and leave a0/a1 two entries (4 bytes) before
            the position this pass started from. */
 274     sub.l    %d7, %d1                           | d1 = -M2
 275     lea.l    (-4, %a2, %d1.l*4), %a2            | a2 = &a[0]
 276     lea.l    (-6, %a0, %d1.l*2), %a0            | a0 = &xx1[N2 - 2 - i]
 277     lea.l    (-6, %a1, %d1.l*2), %a1            | a1 = &xx2[N2 - 2 - i]
 278     neg.l    %d1                                | d1 = M2
 279     movclr.l %acc0, %d2
 280     movclr.l %acc1, %d3
 281     movclr.l %acc2, %d4
 282     movclr.l %acc3, %d5
 283     swap.w   %d2                                | Shift 16 right
 284     swap.w   %d3
 285     swap.w   %d4
 286     swap.w   %d5
 287     | Thanks to the extra shift in the mac chain, we get clipping for free.
 288     | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
 289     | but since qmf_synth() is called so late in the signal chain, it should
 290     | work fine.
 291     move.w   %d2, (%a3)+                        | Write results to y[]
 292     move.w   %d3, (%a3)+
 293     move.w   %d4, (%a3)+
 294     move.w   %d5, (%a3)+
 295     subq.l   #2, %d0
 296     jne      0b
 297
 298     | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
         /* After the last outer pass a0/a1 sit two entries before the start
            of xx1/xx2, and d1 holds M2 again (from the neg.l above), so it
            serves directly as the loop counter. */
 299     addq.l   #4, %a0                            | a0 = &xx1[0]
 300     addq.l   #4, %a1                            | a1 = &xx2[0]
 301 0:
 302     move.w   (%a0)+, (%a4)
 303     move.w   (%a1)+, (%a5)
 304     addq.l   #4, %a4
 305     addq.l   #4, %a5
 306     subq.l   #1, %d1
 307     jne      0b
 308
         /* Epilogue: MACSR back to 0, then reload the sp value that was
            pushed below the scratch buffers, which releases them and points
            sp at the saved registers again. */
 309     move.l   #0, %macsr
 310     move.l   (%sp), %sp
 311     movem.l  (%sp), %d2-%d7/%a2-%a6
 312     lea.l    (44, %sp), %sp
 313     rts
 314
 315
 316 /* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) */
     /*
        Scales len 32-bit values from x by 'scale' and writes them to y,
        using the MAC unit in the mode selected by MACSR = 0x20.

        Stack arguments as read below (after the 20-byte register save):
          x     -> a0   32-bit input values
          y     -> a1   32-bit output values
          scale -> d0   shifted left by 3 once, before the loop
          len   -> d1   element count, used as the loop counter

        Per element: the input is shifted left by 9 (its top 9 bits are
        discarded), multiplied by the pre-scaled 'scale' with mac.l, read
        back from the accumulator with movclr.l and shifted left by 5 (so
        the low 5 bits of every output are zero).
        NOTE(review): assuming a fractional-mode mac.l keeps the product
        shifted right by 31, the net effect is (x * scale) >> 14 - confirm
        against the reference C version.

        Notes:
          - Four elements are processed per pass and len is tested for
            exactly zero only after subtracting 4, so len must be a
            non-zero multiple of 4.
          - MACSR is set to 0 on exit; the value it had on entry is not
            preserved.
          - d2-d6 are saved on entry and restored before returning.
     */
 317     .global signal_mul
 318 signal_mul:
 319     lea.l    (-20, %sp), %sp
 320     movem.l  %d2-%d6, (%sp)
 321     movem.l  (20+4, %sp), %a0-%a1           | a0 = x, a1 = y
 322     movem.l  (20+12, %sp), %d0-%d1          | d0 = scale, d1 = len
 323     moveq.l  #0x20, %d6
 324     move.l   %d6, %macsr                    | Set MAC unit to fractional mode
 325     asl.l    #3, %d0                        | Pre-scale 'scale'
         /* d6 is reused from here on as the shift count for the inputs. */
 326     moveq.l  #9, %d6
 327 0:
 328     movem.l  (%a0), %d2-%d5                 | Fetch input
 329     asl.l    %d6, %d2                       | Shift each value 9 to the left
 330     asl.l    %d6, %d3
 331     asl.l    %d6, %d4
 332     asl.l    %d6, %d5
         /* Four independent products, one per accumulator. */
 333     mac.l    %d2, %d0, %acc0                | Do multiplies
 334     mac.l    %d3, %d0, %acc1
 335     mac.l    %d4, %d0, %acc2
 336     mac.l    %d5, %d0, %acc3
 337     lea.l    (16, %a0), %a0
 338     movclr.l %acc0, %d2
 339     movclr.l %acc1, %d3
 340     movclr.l %acc2, %d4
 341     movclr.l %acc3, %d5
 342     asl.l    #5, %d2                        | Adjust to proper format
 343     asl.l    #5, %d3
 344     asl.l    #5, %d4
 345     asl.l    #5, %d5
 346     movem.l  %d2-%d5, (%a1)                 | Save output
 347     lea.l    (16, %a1), %a1
 348     subq.l   #4, %d1
 349     jne      0b
 350
 351     clr.l    %d0
 352     move.l   %d0, %macsr                    | Set MAC unit back to integer mode
 353     movem.l  (%sp), %d2-%d6
 354     lea.l    (20, %sp), %sp
 355     rts
 356