1 /* Copyright (C) 2007 Thom Johansen */
4 @brief Various analysis/synthesis filters (Coldfire version)
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
11 - Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 - Neither the name of the Xiph.org Foundation nor the names of its
19 contributors may be used to endorse or promote products derived from
20 this software without specific prior written permission.
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
26 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
| All-pole (IIR) filter, direct form II, fixed point.  EMAC accumulators
| hold the -den[j]*y[i] products; the mem[] delay line lives in registers
| for the whole sample loop and is only written back at the end.
| NOTE(review): this chunk is an excerpt -- the entry label, the stack
| frame setup, the ord-based dispatch branches and the loop labels are not
| visible here.  Offsets of the form (44+k, %sp) imply a 44-byte register
| save area below the arguments -- confirm against the full file.
40 movem.l %d2-%d7/%a2-%a6, (%sp)
41 movem.l (44+4, %sp), %a3-%a5 | a3 = x, a4 = den, a5 = y
42 movem.l (44+20, %sp), %d0/%a6 | d0 = ord, a6 = mem
43 moveq.l #8, %d1 | Jump to correct routine based on 'ord'
| ---- 8th-order variant: mem[0..7] kept in d1-d7/a0 ----
51 | TODO: try using direct form 1 filtering
52 | d0 = y[i], d1-d7, a0 = mem[0] .. mem[7]
53 | a3 = x, a4 = den, a5 = y, a6 = temp
55 movem.l (%a6), %d1-%d7/%a0 | Fetch mem[] array
| Per-sample head: y[i] = saturate(x[i] + (mem[0] >> SIG_SHIFT)), done as
| a bias-to-unsigned, clamp, un-bias sequence (branchless via trapf shadow).
59 asr.l %d0, %d1 | mem[0] >> 13 with rounding
62 add.l %d1, %d0 | Add with x[i]
65 add.l %d1, %d0 | Bias result to [-1..65534]
66 cmp.l %a6, %d0 | Now do clip to [0..65534] range
70 .word 0x51fa | trapf.w, shadow next insn
72 move.l %a6, %d0 | Clip high
74 sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
75 move.w %d0, (%a5)+ | Write result to y[i]
76 neg.l %d0 | msac.w is bugged in gas, do this for now
| Accumulate -den[j]*y[i] into acc0..acc3, two den coefficients per
| 32-bit fetch (den[] is packed 16-bit), while shifting the delay line.
77 move.l (%a4)+, %a6 | Fetch den[0] and den[1]
78 mac.w %a6u, %d0l, %acc0
79 mac.w %a6l, %d0l, (%a4)+, %a6, %acc1
80 mac.w %a6u, %d0l, %acc2
81 mac.w %a6l, %d0l, (%a4)+, %a6, %acc3
83 add.l %d2, %d1 | mem[0] = mem[1] - den[0]*y[i]
85 add.l %d3, %d2 | mem[1] = mem[2] - den[1]*y[i]
87 add.l %d4, %d3 | mem[2] = mem[3] - den[2]*y[i]
89 add.l %d5, %d4 | mem[3] = mem[4] - den[3]*y[i]
90 mac.w %a6u, %d0l, %acc0
91 mac.w %a6l, %d0l, (%a4)+, %a6, %acc1
92 mac.w %a6u, %d0l, %acc2
93 mac.w %a6l, %d0l, %acc3
94 lea.l (-16, %a4), %a4 | wrap den pointer back to den[0]
96 add.l %d6, %d5 | mem[4] = mem[5] - den[4]*y[i]
98 add.l %d7, %d6 | mem[5] = mem[6] - den[5]*y[i]
100 add.l %a0, %d7 | mem[6] = mem[7] - den[6]*y[i]
101 movclr.l %acc3, %a0 | mem[7] = -den[7]*y[i]
102 subq.l #1, (44+16, %sp) | Have we done all samples?
104 move.l (44+24, %sp), %a6 | Fetch mem pointer
105 movem.l %d1-%d7/%a0, (%a6) | Save back mem[]
| ---- 10th-order variant: same scheme, mem[0..9] in d1-d7/a0-a2 ----
108 | d0 = y[i], d1-d7, a0-a2 = mem[0] .. mem[9]
109 | a3 = x, a4 = den, a5 = y, a6 = temp
111 movem.l (%a6), %d1-%d7/%a0-%a2 | Fetch mem[] array
115 asr.l %d0, %d1 | mem[0] >> 13 with rounding
118 add.l %d1, %d0 | Add with x[i]
121 add.l %d1, %d0 | Bias result to [-1..65534]
122 cmp.l %a6, %d0 | Now do clip to [0..65534] range
126 .word 0x51fa | trapf.w, shadow next insn
128 move.l %a6, %d0 | Clip high
130 sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
131 move.w %d0, (%a5)+ | Write result to y[i]
132 neg.l %d0 | msac.w is bugged in gas, do this for now
133 move.l (%a4)+, %a6 | Fetch den[0] and den[1]
134 mac.w %a6u, %d0l, %acc0
135 mac.w %a6l, %d0l, (%a4)+, %a6, %acc1
136 mac.w %a6u, %d0l, %acc2
137 mac.w %a6l, %d0l, (%a4)+, %a6, %acc3
139 add.l %d2, %d1 | mem[0] = mem[1] - den[0]*y[i]
141 add.l %d3, %d2 | mem[1] = mem[2] - den[1]*y[i]
143 add.l %d4, %d3 | mem[2] = mem[3] - den[2]*y[i]
145 add.l %d5, %d4 | mem[3] = mem[4] - den[3]*y[i]
146 mac.w %a6u, %d0l, %acc0
147 mac.w %a6l, %d0l, (%a4)+, %a6, %acc1
148 mac.w %a6u, %d0l, %acc2
149 mac.w %a6l, %d0l, (%a4)+, %a6, %acc3
150 lea.l (-20, %a4), %a4 | wrap den pointer back to den[0]
152 add.l %d6, %d5 | mem[4] = mem[5] - den[4]*y[i]
154 add.l %d7, %d6 | mem[5] = mem[6] - den[5]*y[i]
156 add.l %a0, %d7 | mem[6] = mem[7] - den[6]*y[i]
158 add.l %a1, %a0 | mem[7] = mem[8] - den[7]*y[i]
159 mac.w %a6u, %d0l, %acc0
160 mac.w %a6l, %d0l, %acc1
162 add.l %a2, %a1 | mem[8] = mem[9] - den[8]*y[i]
163 movclr.l %acc1, %a2 | mem[9] = -den[9]*y[i]
165 subq.l #1, (44+16, %sp) | Have we done all samples?
167 move.l (44+24, %sp), %a6 | Fetch mem pointer
168 movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
| Shared exit: restore callee-saved registers (frame teardown and rts are
| not visible in this excerpt).
171 movem.l (%sp), %d2-%d7/%a2-%a6
176 /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
| QMF synthesis filterbank: merges two decimated subbands (x1, x2) through
| the M-tap filter a[] into N output samples y[].  Two scratch buffers
| (xx1, xx2) of N2 + M2 shorts are carved out of the stack; the filter
| history is carried in mem1/mem2 between calls.
| NOTE(review): excerpt -- entry label, loop labels, the copy-loop bodies
| and several branches are elided here.
179 lea.l (-44, %sp), %sp
180 movem.l %d2-%d7/%a2-%a6, (%sp)
181 movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y
182 movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2
183 move.l #0x80, %macsr | Enable saturation
185 | Comments make more sense when compared to the reference C version
186 move.l %a2, %d6 | Backup a
187 lsr.l #1, %d0 | N2 = N >> 1
188 lsr.l #1, %d1 | M2 = M >> 1
189 move.l %d1, %d7 | Backup M2
| Carve the two scratch buffers out of the stack (alloca-style).
192 sub.l %d1, %d2 | d2 = -(N2 + M2)
193 lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts
194 lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2
196 move.l %a6, %sp | Update sp
197 move.l %d3, -(%sp) | Stack old %sp
199 | Backwards copy x1 and x2 arrays to xx1 and xx2, assume N2 is power of two
200 | TODO: these copying loops probably have more potential for optimization
201 lea.l (%a0, %d0.l*2), %a0 | x1 += N2
202 lea.l (%a1, %d0.l*2), %a1 | x2 += N2
203 move.l %d0, %d2 | Loop counter is N2
214 | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
215 move.l %d1, %d2 | Loop counter is M2
216 addq.l #2, %a4 | a4 = &mem1[1]
217 addq.l #2, %a5 | a5 = &mem2[1]
218 move.l %a4, %d3 | Backup mem1 and mem2
227 move.l %d3, %a4 | a4 = &mem1[1]
228 move.l %d4, %a5 | a5 = &mem2[1]
| Point a0/a1 at the newest scratch samples; the filter walks backwards.
231 sub.l %d1, %d2 | d2 = -M2
232 lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
233 lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
234 move.l %d6, %a2 | a2 = a
236 | Main loop, register usage:
237 | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
238 | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
239 | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
241 move.l #32768, %d2 | Rounding constant
246 move.w (%a0)+, %d2 | d2 = x10
247 move.w (%a1)+, %d4 | d4 = x20
248 move.l (%a2)+, %d6 | d6 = [a0, a1]
| Inner tap loop: four parallel accumulators produce four interleaved
| output samples per pass; '<<' extra shift gives free saturation later.
250 move.w (%a0)+, %d3 | d3 = x11
251 move.w (%a1)+, %d5 | d5 = x21
252 mac.w %d6u, %d3l, <<, %acc0 | acc0 += a0*x11
253 msac.w %d6u, %d5l, <<, %acc0 | acc0 -= a0*x21
254 mac.w %d6l, %d3l, <<, %acc1 | acc1 += a1*x11
255 mac.w %d6l, %d5l, <<, %acc1 | acc1 += a1*x21
256 mac.w %d6u, %d2l, <<, %acc2 | acc2 += a0*x10
257 msac.w %d6u, %d4l, <<, %acc2 | acc2 -= a0*x20
258 mac.w %d6l, %d2l, <<, %acc3 | acc3 += a1*x10
259 mac.w %d6l, %d4l, <<, (%a2)+, %d6, %acc3 | acc3 += a1*x20
261 move.w (%a0)+, %d2 | d2 = x10
262 move.w (%a1)+, %d4 | d4 = x20
263 mac.w %d6u, %d2l, <<, %acc0 | acc0 += a0*x10
264 msac.w %d6u, %d4l, <<, %acc0 | acc0 -= a0*x20
265 mac.w %d6l, %d2l, <<, %acc1 | acc1 += a1*x10
266 mac.w %d6l, %d4l, <<, %acc1 | acc1 += a1*x20
267 mac.w %d6u, %d3l, <<, %acc2 | acc2 += a0*x11
268 msac.w %d6u, %d5l, <<, %acc2 | acc2 -= a0*x21
269 mac.w %d6l, %d3l, <<, %acc3 | acc3 += a1*x11
270 mac.w %d6l, %d5l, <<, (%a2)+, %d6, %acc3 | acc3 += a1*x21
| Rewind coefficient and history pointers for the next output pair.
274 sub.l %d7, %d1 | d1 = -M2
275 lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0]
276 lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i]
277 lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i]
283 swap.w %d2 | Shift 16 right
287 | Thanks to the extra shift in the mac chain, we get clipping for free.
288 | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
289 | but since qmf_synth() is called so late in the signal chain, it should
291 move.w %d2, (%a3)+ | Write results to y[]
298 | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
299 addq.l #4, %a0 | a0 = &xx1[0]
300 addq.l #4, %a1 | a1 = &xx2[0]
| Exit: restore callee-saved registers (stack unwind of the alloca'd
| buffers and rts are not visible in this excerpt).
311 movem.l (%sp), %d2-%d7/%a2-%a6
316 /* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) */
| Multiply a 32-bit signal vector by a fixed-point scale factor, four
| samples per iteration, using the EMAC in fractional mode.
| NOTE(review): excerpt -- the entry label, the loop label/branch and the
| lines loading the MACSR mode constants into %d6/%d0 are not visible here.
319 lea.l (-20, %sp), %sp
320 movem.l %d2-%d6, (%sp)
321 movem.l (20+4, %sp), %a0-%a1 | a0 = x, a1 = y
322 movem.l (20+12, %sp), %d0-%d1 | d0 = scale, d1 = len
324 move.l %d6, %macsr | Set MAC unit to fractional mode
325 asl.l #3, %d0 | Pre-scale 'scale'
| Loop body: fetch 4 samples, pre-shift, fractional-multiply by scale,
| post-shift back to the signal format, store 4 results.
328 movem.l (%a0), %d2-%d5 | Fetch input
329 asl.l %d6, %d2 | Shift each value 9 to the left
333 mac.l %d2, %d0, %acc0 | Do multiplies
334 mac.l %d3, %d0, %acc1
335 mac.l %d4, %d0, %acc2
336 mac.l %d5, %d0, %acc3
342 asl.l #5, %d2 | Adjust to proper format
346 movem.l %d2-%d5, (%a1) | Save output
352 move.l %d0, %macsr | Set MAC unit back to integer mode
353 movem.l (%sp), %d2-%d6