/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 *
 * Copyright (C) 2005 by Pedro Vasconcelos
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
/* asm routines for wide math on the MCF5249 */

#if defined(CPU_COLDFIRE)

/* attribute for 16-byte alignment */
#define LINE_ATTR   __attribute__ ((aligned (16)))
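
/* Illustrative use (an assumption, not shown in this file): LINE_ATTR puts
   hot buffers on a 16-byte boundary so the movem.l loops below start out
   aligned, e.g.  static ogg_int32_t buf[1024] LINE_ATTR;  */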
static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {

  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply & shift */
                "movclr.l %%acc0, %[x];"     /* move & clear acc */
                "asr.l #1, %[x];"            /* no overflow test */
                : [x] "+&d" (x)
                : [y] "r" (y)
                : "cc");
  return x;
}
static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                "movclr.l %%acc0, %[x];"     /* move and clear */
                : [x] "+&r" (x)
                : [y] "r" (y)
                : "cc");
  return x;
}
static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
  ogg_int32_t r;

  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
                "movclr.l %%acc0, %[r];"     /* get higher half */
                "asl.l #8, %[r];"            /* hi<<16, plus one free */
                "asl.l #8, %[r];"
                "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
                "lsr.l #7, %[x];"
                "or.l %[x], %[r];"           /* logical-or results */
                : [r] "=&d" (r), [x] "+d" (x)
                : [y] "d" (y)
                : "cc");
  return r;
}
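
/* Reference sketch (not from the original file): MULT31_SHIFT15 returns
   (x*y) >> 15, splicing the high half (shifted left 16; fractional mode
   supplies one bit "for free") with the unsigned low half shifted right 15. */
#if 0
static inline ogg_int32_t MULT31_SHIFT15_ref(ogg_int32_t x, ogg_int32_t y) {
  return (ogg_int32_t)(((ogg_int64_t)x * y) >> 15);
}
#endif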
static inline
void XPROD31(ogg_int32_t a, ogg_int32_t b,
             ogg_int32_t t, ogg_int32_t v,
             ogg_int32_t *x, ogg_int32_t *y)
{
  asm volatile ("mac.l %[a], %[t], %%acc0;"
                "mac.l %[b], %[v], %%acc0;"
                "mac.l %[b], %[t], %%acc1;"
                "msac.l %[a], %[v], %%acc1;"
                "movclr.l %%acc0, %[a];"
                "move.l %[a], (%[x]);"
                "movclr.l %%acc1, %[a];"
                "move.l %[a], (%[y]);"
                : [a] "+&r" (a)
                : [x] "a" (x), [y] "a" (y),
                  [b] "r" (b), [t] "r" (t), [v] "r" (v)
                : "cc", "memory");
}
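
/* Reference sketch: XPROD31 computes the canonical Tremor cross product in
   a single EMAC pass over both accumulators. */
#if 0
static inline void XPROD31_ref(ogg_int32_t a, ogg_int32_t b,
                               ogg_int32_t t, ogg_int32_t v,
                               ogg_int32_t *x, ogg_int32_t *y) {
  *x = MULT31(a, t) + MULT31(b, v);
  *y = MULT31(b, t) - MULT31(a, v);
}
#endif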
static inline
void XNPROD31(ogg_int32_t a, ogg_int32_t b,
              ogg_int32_t t, ogg_int32_t v,
              ogg_int32_t *x, ogg_int32_t *y)
{
  asm volatile ("mac.l %[a], %[t], %%acc0;"
                "msac.l %[b], %[v], %%acc0;"
                "mac.l %[b], %[t], %%acc1;"
                "mac.l %[a], %[v], %%acc1;"
                "movclr.l %%acc0, %[a];"
                "move.l %[a], (%[x]);"
                "movclr.l %%acc1, %[a];"
                "move.l %[a], (%[y]);"
                : [a] "+&r" (a)
                : [x] "a" (x), [y] "a" (y),
                  [b] "r" (b), [t] "r" (t), [v] "r" (v)
                : "cc", "memory");
}
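
/* Reference sketch: XNPROD31 is the same butterfly with the signs swapped. */
#if 0
static inline void XNPROD31_ref(ogg_int32_t a, ogg_int32_t b,
                                ogg_int32_t t, ogg_int32_t v,
                                ogg_int32_t *x, ogg_int32_t *y) {
  *x = MULT31(a, t) - MULT31(b, v);
  *y = MULT31(b, t) + MULT31(a, v);
}
#endif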
#if 0 /* canonical Tremor definition */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
  { (_x)=MULT32(_a,_t)+MULT32(_b,_v); \
    (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif

/* this could lose the LSB by overflow, but i don't think it'll ever happen.
   if anyone thinks they can hear a bug caused by this, please try the
   canonical version above. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
  asm volatile ("mac.l %[a], %[t], %%acc0;" \
                "mac.l %[b], %[v], %%acc0;" \
                "mac.l %[b], %[t], %%acc1;" \
                "msac.l %[a], %[v], %%acc1;" \
                "movclr.l %%acc0, %[x];" \
                "asr.l #1, %[x];" /* scale to match MULT32 */ \
                "movclr.l %%acc1, %[y];" \
                "asr.l #1, %[y];" \
                : [x] "=&d" (_x), [y] "=&d" (_y) \
                : [a] "r" (_a), [b] "r" (_b), \
                  [t] "r" (_t), [v] "r" (_v) \
                : "cc");
/* asm versions of vector operations for block.c, window.c */
/* assumes MAC is initialized & accumulators cleared */
static inline
void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n)
{
  /* align to 16 bytes */
  while(n>0 && (int)x&15) {
    *x++ += *y++;
    n--;
  }
  asm volatile ("bra 1f;"
                "0:"                          /* loop start */
                "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
                "movem.l (%[y]), %%a0-%%a3;"
                /* add */
                "add.l %%a0, %%d0;"
                "add.l %%a1, %%d1;"
                "add.l %%a2, %%d2;"
                "add.l %%a3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[x]);"
                "lea.l (4*4, %[x]), %[x];"
                "lea.l (4*4, %[y]), %[y];"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
  /* add final elements */
  while (n>0) {
    *x++ += *y++;
    n--;
  }
}
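
/* Reference sketch: the movem.l loop above is a 4-way unrolled form of this
   portable version. */
#if 0
static inline void vect_add_right_left_ref(ogg_int32_t *x,
                                           const ogg_int32_t *y, int n) {
  while (n > 0) {
    *x++ += *y++;
    n--;
  }
}
#endif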
static inline
void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n)
{
  /* coldfire asm has symmetrical versions of vect_add_right_left
     and vect_add_left_right (since vect_mult_fw and vect_mult_bw
     are symmetrical as well, i.e. both use MULT31) */
  vect_add_right_left(x, y, n);
}
static inline
void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n)
{
  /* align to 16 bytes */
  while(n>0 && (int)x&15) {
    *x++ = *y++;
    n--;
  }
  asm volatile ("bra 1f;"
                "0:"                          /* loop start */
                "movem.l (%[y]), %%d0-%%d3;"  /* fetch values */
                "movem.l %%d0-%%d3, (%[x]);"  /* store */
                "lea.l (4*4, %[x]), %[x];"    /* advance */
                "lea.l (4*4, %[y]), %[y];"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
  /* copy final elements */
  while (n>0) {
    *x++ = *y++;
    n--;
  }
}
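
/* Reference note (not from the original file): for word-aligned buffers the
   loop above behaves like memcpy(x, y, n * sizeof(ogg_int32_t)), unrolled
   four words at a time. */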
static inline
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  /* ensure data is aligned to 16-bytes */
  while(n>0 && (int)data&15) {
    *data = MULT31(*data, *window);
    data++;
    window++;
    n--;
  }
  asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
                "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
                "lea.l (4*4, %[w]), %[w];"
                "bra 1f;"                     /* jump to loop condition */
                "0:"                          /* loop body */
                /* multiply and load next window values */
                "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
                "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
                "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
                "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
                "movclr.l %%acc0, %%d0;"      /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                /* multiply final elements */
                "tst.l %[n];"
                "beq 1f;"                     /* n = 0 */
                "mac.l %%d0, %%a0, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n = 1 */
                "mac.l %%d1, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n = 2 */
                /* otherwise n = 3 */
                "mac.l %%d2, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
                "1:"
                : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
}
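
/* Reference sketch: portable equivalent, window advancing forward in step
   with the data. */
#if 0
static inline void vect_mult_fw_ref(ogg_int32_t *data, LOOKUP_T *window, int n) {
  while (n > 0) {
    *data = MULT31(*data, *window);
    data++;
    window++;
    n--;
  }
}
#endif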
static inline
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
  /* ensure at least data is aligned to 16-bytes */
  while(n>0 && (int)data&15) {
    *data = MULT31(*data, *window);
    data++;
    window--;
    n--;
  }
  asm volatile ("lea.l (-3*4, %[w]), %[w];"   /* loop start */
                "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch registers */
                "movem.l (%[w]), %%a0-%%a3;"
                "bra 1f;"                     /* jump to loop condition */
                "0:"                          /* loop body */
                /* multiply and load next window value */
                "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
                "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
                "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
                "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
                "movclr.l %%acc0, %%d0;"      /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
                "subq.l #4, %[n];"            /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
                /* multiply final elements */
                "tst.l %[n];"
                "beq 1f;"                     /* n = 0 */
                "mac.l %%d0, %%a3, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n = 1 */
                "mac.l %%d1, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
                "beq 1f;"                     /* n = 2 */
                /* otherwise n = 3 */
                "mac.l %%d2, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
                "1:"
                : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                    "cc", "memory");
}
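
/* Reference sketch: portable equivalent, window walking backward while the
   data advances. */
#if 0
static inline void vect_mult_bw_ref(ogg_int32_t *data, LOOKUP_T *window, int n) {
  while (n > 0) {
    *data = MULT31(*data, *window);
    data++;
    window--;
    n--;
  }
}
#endif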
/* this is portable C and simple; why not use this as default? */
static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) {
  register ogg_int32_t hi=32767, lo=-32768;
  return (x>=hi ? hi : (x<=lo ? lo : x));
}

#endif /* CPU_COLDFIRE */