1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2005 by Thom Johansen
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
22 /* The following are assembler-optimised versions of the LPC filtering
23 routines needed for FLAC decoding. They are optimised for use with the
24 MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
27 /* This routine deals with sample widths 16 and lower. All LPC filtering up to
28 order 10 is done in specially optimised unrolled loops, while every order
29 above this is handled by a slower default routine.
31 .section .icode,"ax",@progbits
| Each unrolled loop keeps all of its coefficients in registers. Every mac.l
| adds (history sample in a6) * (coefficient) to acc0 and, in the same
| instruction, loads the next history sample into a6. The finished sum is
| shifted right by qlevel and added to the residual stored in memory.
32 .global lpc_decode_emac
36 movem.l %d2-%d7/%a2-%a6, (%sp) | save the 11 registers used below (44 bytes)
37 movem.l (44+4, %sp), %d0-%d2/%a0-%a1 | load arguments from past the 44-byte save area plus 4 more bytes (presumably the return address)
38 /* d0 = blocksize, d1 = qlevel, d2 = pred_order
39 a0 = data, a1 = coeffs
42 /* the data pointer always lags behind history pointer by 'pred_order'
43 samples. since we have one loop for each order, we can hard code this
44 and free a register by not saving data pointer.
48 lea.l (%a0, %d3.l*4), %a0 | history
50 move.l %d3, %macsr | we'll need integer mode for this
52 jeq .exit | zero samples to process, exit
55 jgt .default | order is over 10, jump to default case
56 jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order
58 bra.w .exit | zero order filter isn't possible, exit function
69 | last jump table entry coincides with target, so leave it out
71 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
72 move.l (%a0)+, %a6 | load first history sample
74 mac.l %a6, %a5, (%a0)+, %a6, %acc0 | acc0 += a6 * coef, a6 = next history sample
75 mac.l %a6, %a4, (%a0)+, %a6, %acc0
76 mac.l %a6, %a3, (%a0)+, %a6, %acc0
77 mac.l %a6, %a2, (%a0)+, %a6, %acc0
78 mac.l %a6, %a1, (%a0)+, %a6, %acc0
79 mac.l %a6, %d7, (%a0)+, %a6, %acc0
80 mac.l %a6, %d6, (%a0)+, %a6, %acc0
81 mac.l %a6, %d5, (%a0)+, %a6, %acc0
82 mac.l %a6, %d4, (%a0)+, %a6, %acc0
83 mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
84 movclr.l %acc0, %d2 | get sum
85 asr.l %d1, %d2 | shift sum by qlevel bits
86 add.l %d2, (%a0) | add residual and save
87 lea.l (-8*4, %a0), %a0 | point history back at second element
88 subq.l #1, %d0 | decrement sample count
93 movem.l (%a1), %d4-%d7/%a1-%a5 | load nine lpc coefs
96 mac.l %a6, %a5, (%a0)+, %a6, %acc0
97 mac.l %a6, %a4, (%a0)+, %a6, %acc0
98 mac.l %a6, %a3, (%a0)+, %a6, %acc0
99 mac.l %a6, %a2, (%a0)+, %a6, %acc0
100 mac.l %a6, %a1, (%a0)+, %a6, %acc0
101 mac.l %a6, %d7, (%a0)+, %a6, %acc0
102 mac.l %a6, %d6, (%a0)+, %a6, %acc0
103 mac.l %a6, %d5, (%a0)+, %a6, %acc0
104 mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0 | load for the next iteration
108 lea.l (-7*4, %a0), %a0 | point history back at second element
114 movem.l (%a1), %d5-%d7/%a1-%a5 | load eight lpc coefs
117 mac.l %a6, %a5, (%a0)+, %a6, %acc0
118 mac.l %a6, %a4, (%a0)+, %a6, %acc0
119 mac.l %a6, %a3, (%a0)+, %a6, %acc0
120 mac.l %a6, %a2, (%a0)+, %a6, %acc0
121 mac.l %a6, %a1, (%a0)+, %a6, %acc0
122 mac.l %a6, %d7, (%a0)+, %a6, %acc0
123 mac.l %a6, %d6, (%a0)+, %a6, %acc0
124 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
128 lea.l (-6*4, %a0), %a0 | point history back at second element
134 movem.l (%a1), %d6-%d7/%a1-%a5 | load seven lpc coefs
137 mac.l %a6, %a5, (%a0)+, %a6, %acc0
138 mac.l %a6, %a4, (%a0)+, %a6, %acc0
139 mac.l %a6, %a3, (%a0)+, %a6, %acc0
140 mac.l %a6, %a2, (%a0)+, %a6, %acc0
141 mac.l %a6, %a1, (%a0)+, %a6, %acc0
142 mac.l %a6, %d7, (%a0)+, %a6, %acc0
143 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0 | load for the next iteration
147 lea.l (-5*4, %a0), %a0 | point history back at second element
153 movem.l (%a1), %d7/%a1-%a5 | load six lpc coefs
156 mac.l %a6, %a5, (%a0)+, %a6, %acc0
157 mac.l %a6, %a4, (%a0)+, %a6, %acc0
158 mac.l %a6, %a3, (%a0)+, %a6, %acc0
159 mac.l %a6, %a2, (%a0)+, %a6, %acc0
160 mac.l %a6, %a1, (%a0)+, %a6, %acc0
161 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0 | load for the next iteration
165 lea.l (-4*4, %a0), %a0 | point history back at second element
171 movem.l (%a1), %a1-%a5 | load five lpc coefs
174 mac.l %a6, %a5, (%a0)+, %a6, %acc0
175 mac.l %a6, %a4, (%a0)+, %a6, %acc0
176 mac.l %a6, %a3, (%a0)+, %a6, %acc0
177 mac.l %a6, %a2, (%a0)+, %a6, %acc0
178 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0 | load for the next iteration
182 lea.l (-3*4, %a0), %a0 | point history back at second element
188 movem.l (%a1), %a2-%a5 | load four lpc coefs
191 mac.l %a6, %a5, (%a0)+, %a6, %acc0
192 mac.l %a6, %a4, (%a0)+, %a6, %acc0
193 mac.l %a6, %a3, (%a0)+, %a6, %acc0
194 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0 | load for the next iteration
204 movem.l (%a1), %a3-%a5 | load three lpc coefs
207 mac.l %a6, %a5, (%a0)+, %a6, %acc0
208 mac.l %a6, %a4, (%a0)+, %a6, %acc0
209 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 | load for the next iteration
219 movem.l (%a1), %a4-%a5 | load two lpc coefs
222 mac.l %a6, %a5, (%a0)+, %a6, %acc0
223 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
232 | no point in using mac here
244 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
245 do the rest by jump table. */
246 lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
247 move.l %a0, %a3 | working copy of history pointer
249 lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop
250 move.l (%a3)+, %a5 | preload data for loop
252 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
253 movem.l (%a2), %d4-%d7 | load four coefs
254 mac.l %a5, %d7, (%a3)+, %a5, %acc0 | acc0 += a5 * coef, a5 = next history sample
255 mac.l %a5, %d6, (%a3)+, %a5, %acc0
256 mac.l %a5, %d5, (%a3)+, %a5, %acc0
257 mac.l %a5, %d4, (%a3)+, %a5, %acc0
258 subq.l #1, %d3 | any more unrolled loop operations left?
261 moveq.l #3, %d3 | mask 0x00000003
262 and.l %d2, %d3 | get the remaining samples to be filtered
263 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
270 mac.l %a5, %d4, (%a3)+, %a5, %acc0 | leftover taps (up to three) entered via the jump above
273 mac.l %a5, %d4, (%a3)+, %a5, %acc0
276 mac.l %a5, %d4, (%a3)+, %a5, %acc0
278 movclr.l %acc0, %d3 | get result
279 asr.l %d1, %d3 | shift qlevel bits right
280 add.l %a5, %d3 | add residual, which is in a5 by now
281 move.l %d3, -(%a3) | save, a3 is also one past save location
282 addq.l #4, %a0 | increment history pointer
283 subq.l #1, %d0 | decrement sample count
284 jne .default | are we done?
285 jra .exit | if so, jump to exit
288 /* This routine deals with sample widths 24 and lower. All LPC filtering up to
289 order 8 is done in specially optimised unrolled loops, while every order
290 above this is handled by a slower default routine.
292 .global lpc_decode_emac_wide
| Same mac.l-with-load scheme as lpc_decode_emac, but the sum is read from the
| accumulator in two parts: the extension bits (accext01) and the low 32 bits
| (acc0). The low part is shifted right by qlevel, the extension part is
| shifted left by the amount kept in d2, and the two are ORed together before
| the residual is added.
294 lpc_decode_emac_wide:
295 lea.l (-44, %sp), %sp | reserve 44 bytes of stack for the register save area
296 movem.l %d2-%d7/%a2-%a6, (%sp) | save the 11 registers used below
297 movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1 | load arguments from past the save area; pred_order lands in d3 here
298 /* d0 = blocksize, d1 = qlevel, d3 = pred_order
299 a0 = data, a1 = coeffs
302 /* the data pointer always lags behind history pointer by 'pred_order'
303 samples. since we have one loop for each order, we can hard code this
304 and free a register by not saving data pointer.
308 lea.l (%a0, %d2.l*4), %a0 | history
310 move.l %d2, %macsr | we'll need integer mode for this
312 jeq .exit | zero samples to process, exit
314 sub.l %d1, %d2 | calculate shift amount for extension byte
317 jgt .wdefault | order is over 8, jump to default case
318 jmp.l (2, %pc, %d3.l*4) | jump to loop corresponding to pred_order
320 bra.w .exit | zero order filter isn't possible, exit function
329 | last jump table entry coincides with target, so leave it out
331 movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs
332 move.l (%a0)+, %a6 | load first history sample
334 mac.l %a6, %a5, (%a0)+, %a6, %acc0 | acc0 += a6 * coef, a6 = next history sample
335 mac.l %a6, %a4, (%a0)+, %a6, %acc0
336 mac.l %a6, %a3, (%a0)+, %a6, %acc0
337 mac.l %a6, %a2, (%a0)+, %a6, %acc0
338 mac.l %a6, %a1, (%a0)+, %a6, %acc0
339 mac.l %a6, %d7, (%a0)+, %a6, %acc0
340 mac.l %a6, %d6, (%a0)+, %a6, %acc0
341 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
342 move.l %accext01, %d4 | get top 8 bits of sum
343 movclr.l %acc0, %d3 | then bottom 32 bits
344 lsr.l %d1, %d3 | shift bottom bits qlevel bits right
345 asl.l %d2, %d4 | shift top bits 32 - qlevel bits left
346 or.l %d4, %d3 | now combine results
347 add.l %d3, (%a0) | add residual and save
348 lea.l (-6*4, %a0), %a0 | point history back at second element
349 subq.l #1, %d0 | decrement sample count
350 jne 1b | are we done?
354 movem.l (%a1), %d6-%d7/%a1-%a5 | load seven lpc coefs
357 mac.l %a6, %a5, (%a0)+, %a6, %acc0
358 mac.l %a6, %a4, (%a0)+, %a6, %acc0
359 mac.l %a6, %a3, (%a0)+, %a6, %acc0
360 mac.l %a6, %a2, (%a0)+, %a6, %acc0
361 mac.l %a6, %a1, (%a0)+, %a6, %acc0
362 mac.l %a6, %d7, (%a0)+, %a6, %acc0
363 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0 | load for the next iteration
364 move.l %accext01, %d4 | get top bits of sum
370 lea.l (-5*4, %a0), %a0 | point history back at second element
376 movem.l (%a1), %d7/%a1-%a5 | load six lpc coefs
379 mac.l %a6, %a5, (%a0)+, %a6, %acc0
380 mac.l %a6, %a4, (%a0)+, %a6, %acc0
381 mac.l %a6, %a3, (%a0)+, %a6, %acc0
382 mac.l %a6, %a2, (%a0)+, %a6, %acc0
383 mac.l %a6, %a1, (%a0)+, %a6, %acc0
384 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0 | load for the next iteration
385 move.l %accext01, %d4 | get top bits of sum
391 lea.l (-4*4, %a0), %a0 | point history back at second element
397 movem.l (%a1), %a1-%a5 | load five lpc coefs
400 mac.l %a6, %a5, (%a0)+, %a6, %acc0
401 mac.l %a6, %a4, (%a0)+, %a6, %acc0
402 mac.l %a6, %a3, (%a0)+, %a6, %acc0
403 mac.l %a6, %a2, (%a0)+, %a6, %acc0
404 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0 | load for the next iteration
405 move.l %accext01, %d4 | get top bits of sum
411 lea.l (-3*4, %a0), %a0 | point history back at second element
417 movem.l (%a1), %a2-%a5 | load four lpc coefs
420 mac.l %a6, %a5, (%a0)+, %a6, %acc0
421 mac.l %a6, %a4, (%a0)+, %a6, %acc0
422 mac.l %a6, %a3, (%a0)+, %a6, %acc0
423 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0 | load for the next iteration
424 move.l %accext01, %d4 | get top bits of sum
436 movem.l (%a1), %a3-%a5 | load three lpc coefs
439 mac.l %a6, %a5, (%a0)+, %a6, %acc0
440 mac.l %a6, %a4, (%a0)+, %a6, %acc0
441 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0 | load for the next iteration
442 move.l %accext01, %d4 | get top bits of sum
454 movem.l (%a1), %a4-%a5 | load two lpc coefs
457 mac.l %a6, %a5, (%a0)+, %a6, %acc0
458 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
459 move.l %accext01, %d4 | get top bits of sum
473 mac.l %a6, %a5, (%a0), %a6, %acc0 | single tap; the parallel load puts the value at (a0) into a6
474 move.l %accext01, %d4 | get top bits of sum
479 add.l %a6, %d3 | residual is already in a6
486 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
487 do the rest by jump table. */
488 lea.l (%a1, %d3.l*4), %a2 | need to start in the other end of coefs
489 move.l %a0, %a3 | working copy of history pointer
491 lsr.l #2, %d4 | coefs/4, num of iterations needed in next loop
492 move.l (%a3)+, %a5 | preload data for loop
494 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
495 movem.l (%a2), %d5-%d7/%a4 | load four coefs
496 mac.l %a5, %a4, (%a3)+, %a5, %acc0 | acc0 += a5 * coef, a5 = next history sample
497 mac.l %a5, %d7, (%a3)+, %a5, %acc0
498 mac.l %a5, %d6, (%a3)+, %a5, %acc0
499 mac.l %a5, %d5, (%a3)+, %a5, %acc0
500 subq.l #1, %d4 | any more unrolled loop operations left?
503 moveq.l #3, %d4 | mask 0x00000003
504 and.l %d3, %d4 | get the remaining samples to be filtered
505 jmp.l (2, %pc, %d4*2) | then jump into mac.l chain
512 mac.l %a5, %d4, (%a3)+, %a5, %acc0 | leftover taps (up to three) entered via the jump above
515 mac.l %a5, %d4, (%a3)+, %a5, %acc0
518 mac.l %a5, %d4, (%a3)+, %a5, %acc0
520 move.l %accext01, %d5 | get extension bits of result (those above the low 32)
521 movclr.l %acc0, %d4 | get low 32 bits of result
522 lsr.l %d1, %d4 | shift qlevel bits right
523 asl.l %d2, %d5 | shift 32 - qlevel bits left
524 or.l %d5, %d4 | combine top and low bits after shift
525 add.l %a5, %d4 | add residual, which is in a5 by now
526 move.l %d4, -(%a3) | save, a3 is also one past save location
527 addq.l #4, %a0 | increment history pointer
528 subq.l #1, %d0 | decrement sample count
529 jne .wdefault | are we done?
530 | if so, fall through to exit
533 movem.l (%sp), %d2-%d7/%a2-%a6 | restore the registers saved on entry