1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2005 by Thom Johansen
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
18 ****************************************************************************/
20 /* The following are assembler optimised versions of the LPC filtering
21 routines needed for FLAC decoding. They are optimised for use with the
22 MCF5249 processor, or any other similar ColdFire core with the EMAC unit. */
25 /* This routine deals with sample widths 16 and lower. All LPC filtering up to
26 order 10 is done in specially optimised unrolled loops, while every order
27 above this is handled by a slower default routine. */
29 .section .icode,"ax",@progbits
30 .global lpc_decode_emac
34 movem.l %d2-%d7/%a2-%a6, (%sp)
35 movem.l (44+4, %sp), %d0-%d2/%a0-%a1
36 /* d0 = blocksize, d1 = qlevel, d2 = pred_order
37 a0 = data, a1 = coeffs
40 /* the data pointer always lags behind history pointer by 'pred_order'
41 samples. since we have one loop for each order, we can hard code this
42 and free a register by not saving data pointer.
46 lea.l (%a0, %d3.l*4), %a0 | history
48 move.l %d3, %macsr | we'll need integer mode for this
50 jeq .exit | zero samples to process, exit
53 jgt .default | order is over 10, jump to default case
54 jmp.l (2, %pc, %d2.l*4) | jump to loop corresponding to pred_order
56 bra.w .exit | zero order filter isn't possible, exit function
67 | last jump table entry coincides with target, so leave it out
69 movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
70 move.l (%a0)+, %a6 | load first history sample
72 mac.l %a6, %a5, (%a0)+, %a6, %acc0
73 mac.l %a6, %a4, (%a0)+, %a6, %acc0
74 mac.l %a6, %a3, (%a0)+, %a6, %acc0
75 mac.l %a6, %a2, (%a0)+, %a6, %acc0
76 mac.l %a6, %a1, (%a0)+, %a6, %acc0
77 mac.l %a6, %d7, (%a0)+, %a6, %acc0
78 mac.l %a6, %d6, (%a0)+, %a6, %acc0
79 mac.l %a6, %d5, (%a0)+, %a6, %acc0
80 mac.l %a6, %d4, (%a0)+, %a6, %acc0
81 mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
82 movclr.l %acc0, %d2 | get sum
83 asr.l %d1, %d2 | shift sum by qlevel bits
84 add.l %d2, (%a0) | add residual and save
85 lea.l (-8*4, %a0), %a0 | point history back at second element
86 subq.l #1, %d0 | decrement sample count
91 movem.l (%a1), %d4-%d7/%a1-%a5
94 mac.l %a6, %a5, (%a0)+, %a6, %acc0
95 mac.l %a6, %a4, (%a0)+, %a6, %acc0
96 mac.l %a6, %a3, (%a0)+, %a6, %acc0
97 mac.l %a6, %a2, (%a0)+, %a6, %acc0
98 mac.l %a6, %a1, (%a0)+, %a6, %acc0
99 mac.l %a6, %d7, (%a0)+, %a6, %acc0
100 mac.l %a6, %d6, (%a0)+, %a6, %acc0
101 mac.l %a6, %d5, (%a0)+, %a6, %acc0
102 mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0
106 lea.l (-7*4, %a0), %a0
112 movem.l (%a1), %d5-%d7/%a1-%a5
115 mac.l %a6, %a5, (%a0)+, %a6, %acc0
116 mac.l %a6, %a4, (%a0)+, %a6, %acc0
117 mac.l %a6, %a3, (%a0)+, %a6, %acc0
118 mac.l %a6, %a2, (%a0)+, %a6, %acc0
119 mac.l %a6, %a1, (%a0)+, %a6, %acc0
120 mac.l %a6, %d7, (%a0)+, %a6, %acc0
121 mac.l %a6, %d6, (%a0)+, %a6, %acc0
122 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0
126 lea.l (-6*4, %a0), %a0
132 movem.l (%a1), %d6-%d7/%a1-%a5
135 mac.l %a6, %a5, (%a0)+, %a6, %acc0
136 mac.l %a6, %a4, (%a0)+, %a6, %acc0
137 mac.l %a6, %a3, (%a0)+, %a6, %acc0
138 mac.l %a6, %a2, (%a0)+, %a6, %acc0
139 mac.l %a6, %a1, (%a0)+, %a6, %acc0
140 mac.l %a6, %d7, (%a0)+, %a6, %acc0
141 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
145 lea.l (-5*4, %a0), %a0
151 movem.l (%a1), %d7/%a1-%a5
154 mac.l %a6, %a5, (%a0)+, %a6, %acc0
155 mac.l %a6, %a4, (%a0)+, %a6, %acc0
156 mac.l %a6, %a3, (%a0)+, %a6, %acc0
157 mac.l %a6, %a2, (%a0)+, %a6, %acc0
158 mac.l %a6, %a1, (%a0)+, %a6, %acc0
159 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
163 lea.l (-4*4, %a0), %a0
169 movem.l (%a1), %a1-%a5
172 mac.l %a6, %a5, (%a0)+, %a6, %acc0
173 mac.l %a6, %a4, (%a0)+, %a6, %acc0
174 mac.l %a6, %a3, (%a0)+, %a6, %acc0
175 mac.l %a6, %a2, (%a0)+, %a6, %acc0
176 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
180 lea.l (-3*4, %a0), %a0
186 movem.l (%a1), %a2-%a5
189 mac.l %a6, %a5, (%a0)+, %a6, %acc0
190 mac.l %a6, %a4, (%a0)+, %a6, %acc0
191 mac.l %a6, %a3, (%a0)+, %a6, %acc0
192 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
202 movem.l (%a1), %a3-%a5
205 mac.l %a6, %a5, (%a0)+, %a6, %acc0
206 mac.l %a6, %a4, (%a0)+, %a6, %acc0
207 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
217 movem.l (%a1), %a4-%a5
220 mac.l %a6, %a5, (%a0)+, %a6, %acc0
221 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
230 | no point in using mac here
242 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
243 do the rest by jump table. */
244 lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
245 move.l %a0, %a3 | working copy of history pointer
247 lsr.l #2, %d3 | coefs/4, num of iterations needed in next loop
248 move.l (%a3)+, %a5 | preload data for loop
250 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
251 movem.l (%a2), %d4-%d7 | load four coefs
252 mac.l %a5, %d7, (%a3)+, %a5, %acc0
253 mac.l %a5, %d6, (%a3)+, %a5, %acc0
254 mac.l %a5, %d5, (%a3)+, %a5, %acc0
255 mac.l %a5, %d4, (%a3)+, %a5, %acc0
256 subq.l #1, %d3 | any more unrolled loop operations left?
259 moveq.l #3, %d3 | mask 0x00000003
260 and.l %d2, %d3 | get the remaining samples to be filtered
261 jmp.l (2, %pc, %d3*2) | then jump into mac.l chain
268 mac.l %a5, %d4, (%a3)+, %a5, %acc0
271 mac.l %a5, %d4, (%a3)+, %a5, %acc0
274 mac.l %a5, %d4, (%a3)+, %a5, %acc0
276 movclr.l %acc0, %d3 | get result
277 asr.l %d1, %d3 | shift qlevel bits right
278 add.l %a5, %d3 | add residual, which is in a5 by now
279 move.l %d3, -(%a3) | save, a3 is also one past save location
280 addq.l #4, %a0 | increment history pointer
281 subq.l #1, %d0 | decrement sample count
282 jne .default | are we done?
283 jra .exit | if so, fall through to exit
286 /* This routine deals with sample widths 24 and lower. All LPC filtering up to
287 order 8 is done in specially optimised unrolled loops, while every order
288 above this is handled by a slower default routine. */
290 .global lpc_decode_emac_wide
292 lpc_decode_emac_wide:
293 lea.l (-44, %sp), %sp
294 movem.l %d2-%d7/%a2-%a6, (%sp)
295 movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1
296 /* d0 = blocksize, d1 = qlevel, d3 = pred_order
297 a0 = data, a1 = coeffs
300 /* the data pointer always lags behind history pointer by 'pred_order'
301 samples. since we have one loop for each order, we can hard code this
302 and free a register by not saving data pointer.
306 lea.l (%a0, %d2.l*4), %a0 | history
308 move.l %d2, %macsr | we'll need integer mode for this
310 jeq .exit | zero samples to process, exit
312 sub.l %d1, %d2 | calculate shift amount for extension byte
315 jgt .wdefault | order is over 8, jump to default case
316 jmp.l (2, %pc, %d3.l*4) | jump to loop corresponding to pred_order
318 bra.w .exit | zero order filter isn't possible, exit function
327 | last jump table entry coincides with target, so leave it out
329 movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs
330 move.l (%a0)+, %a6 | load first history sample
332 mac.l %a6, %a5, (%a0)+, %a6, %acc0
333 mac.l %a6, %a4, (%a0)+, %a6, %acc0
334 mac.l %a6, %a3, (%a0)+, %a6, %acc0
335 mac.l %a6, %a2, (%a0)+, %a6, %acc0
336 mac.l %a6, %a1, (%a0)+, %a6, %acc0
337 mac.l %a6, %d7, (%a0)+, %a6, %acc0
338 mac.l %a6, %d6, (%a0)+, %a6, %acc0
339 mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
340 move.l %accext01, %d4 | get top 8 bits of sum
341 movclr.l %acc0, %d3 | then botten 32 bits
342 lsr.l %d1, %d3 | shift bottom bits qlevel bits right
343 asl.l %d2, %d4 | shift top bits 32 - qlevel bits left
344 or.l %d4, %d3 | now combine results
345 add.l %d3, (%a0) | add residual and save
346 lea.l (-6*4, %a0), %a0 | point history back at second element
347 subq.l #1, %d0 | decrement sample count
348 jne 1b | are we done?
352 movem.l (%a1), %d6-%d7/%a1-%a5
355 mac.l %a6, %a5, (%a0)+, %a6, %acc0
356 mac.l %a6, %a4, (%a0)+, %a6, %acc0
357 mac.l %a6, %a3, (%a0)+, %a6, %acc0
358 mac.l %a6, %a2, (%a0)+, %a6, %acc0
359 mac.l %a6, %a1, (%a0)+, %a6, %acc0
360 mac.l %a6, %d7, (%a0)+, %a6, %acc0
361 mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
362 move.l %accext01, %d4
368 lea.l (-5*4, %a0), %a0
374 movem.l (%a1), %d7/%a1-%a5
377 mac.l %a6, %a5, (%a0)+, %a6, %acc0
378 mac.l %a6, %a4, (%a0)+, %a6, %acc0
379 mac.l %a6, %a3, (%a0)+, %a6, %acc0
380 mac.l %a6, %a2, (%a0)+, %a6, %acc0
381 mac.l %a6, %a1, (%a0)+, %a6, %acc0
382 mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
383 move.l %accext01, %d4
389 lea.l (-4*4, %a0), %a0
395 movem.l (%a1), %a1-%a5
398 mac.l %a6, %a5, (%a0)+, %a6, %acc0
399 mac.l %a6, %a4, (%a0)+, %a6, %acc0
400 mac.l %a6, %a3, (%a0)+, %a6, %acc0
401 mac.l %a6, %a2, (%a0)+, %a6, %acc0
402 mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
403 move.l %accext01, %d4
409 lea.l (-3*4, %a0), %a0
415 movem.l (%a1), %a2-%a5
418 mac.l %a6, %a5, (%a0)+, %a6, %acc0
419 mac.l %a6, %a4, (%a0)+, %a6, %acc0
420 mac.l %a6, %a3, (%a0)+, %a6, %acc0
421 mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
422 move.l %accext01, %d4
434 movem.l (%a1), %a3-%a5
437 mac.l %a6, %a5, (%a0)+, %a6, %acc0
438 mac.l %a6, %a4, (%a0)+, %a6, %acc0
439 mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
440 move.l %accext01, %d4
452 movem.l (%a1), %a4-%a5
455 mac.l %a6, %a5, (%a0)+, %a6, %acc0
456 mac.l %a6, %a4, %acc0 | data for next iteration is already loaded
457 move.l %accext01, %d4
471 mac.l %a6, %a5, (%a0), %a6, %acc0
472 move.l %accext01, %d4
477 add.l %a6, %d3 | residual is already in a6
484 /* we do the filtering in an unrolled by 4 loop as far as we can, and then
485 do the rest by jump table. */
486 lea.l (%a1, %d3.l*4), %a2 | need to start in the other end of coefs
487 move.l %a0, %a3 | working copy of history pointer
489 lsr.l #2, %d4 | coefs/4, num of iterations needed in next loop
490 move.l (%a3)+, %a5 | preload data for loop
492 lea.l (-4*4, %a2), %a2 | move lpc coef pointer four samples backwards
493 movem.l (%a2), %d5-%d7/%a4 | load four coefs
494 mac.l %a5, %a4, (%a3)+, %a5, %acc0
495 mac.l %a5, %d7, (%a3)+, %a5, %acc0
496 mac.l %a5, %d6, (%a3)+, %a5, %acc0
497 mac.l %a5, %d5, (%a3)+, %a5, %acc0
498 subq.l #1, %d4 | any more unrolled loop operations left?
501 moveq.l #3, %d4 | mask 0x00000003
502 and.l %d3, %d4 | get the remaining samples to be filtered
503 jmp.l (2, %pc, %d4*2) | then jump into mac.l chain
510 mac.l %a5, %d4, (%a3)+, %a5, %acc0
513 mac.l %a5, %d4, (%a3)+, %a5, %acc0
516 mac.l %a5, %d4, (%a3)+, %a5, %acc0
518 move.l %accext01, %d5 | get high 32 bits of result
519 movclr.l %acc0, %d4 | get low 32 bits of result
520 lsr.l %d1, %d4 | shift qlevel bits right
521 asl.l %d2, %d5 | shift 32 - qlevel bits left
522 or.l %d5, %d4 | combine top and low bits after shift
523 add.l %a5, %d4 | add residual, which is in a5 by now
524 move.l %d4, -(%a3) | save, a3 is also one past save location
525 addq.l #4, %a0 | increment history pointer
526 subq.l #1, %d0 | decrement sample count
527 jne .wdefault | are we done?
528 | if so, fall through to exit
531 movem.l (%sp), %d2-%d7/%a2-%a6