1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
9 * Copyright (C) 2005 by Pedro Vasconcelos
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version 2
14 * of the License, or (at your option) any later version.
16 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
17 * KIND, either express or implied.
19 ****************************************************************************/
20 /* asm routines for wide math on the MCF5249 */
#if defined(CPU_COLDFIRE)

/* attribute for 16-byte alignment */
/* 16 bytes = one movem.l burst of four longwords; the vector loops below
   align their data pointers to this boundary before entering the asm. */
#define LINE_ATTR __attribute__ ((aligned (16)))
30 static inline int32_t MULT32(int32_t x
, int32_t y
) {
32 asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply & shift */
33 "movclr.l %%acc0, %[x];" /* move & clear acc */
34 "asr.l #1, %[x];" /* no overflow test */
41 static inline int32_t MULT31(int32_t x
, int32_t y
) {
42 asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
43 "movclr.l %%acc0, %[x];" /* move and clear */
50 static inline int32_t MULT31_SHIFT15(int32_t x
, int32_t y
) {
53 asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
54 "mulu.l %[y], %[x];" /* get lower half, avoid emac stall */
55 "movclr.l %%acc0, %[r];" /* get higher half */
56 "asl.l #8, %[r];" /* hi<<16, plus one free */
58 "lsr.l #8, %[x];" /* (unsigned)lo >> 15 */
60 "or.l %[x], %[r];" /* logical-or results */
61 : [r
] "=&d" (r
), [x
] "+d" (x
)
68 void XPROD31(int32_t a
, int32_t b
,
70 int32_t *x
, int32_t *y
)
72 asm volatile ("mac.l %[a], %[t], %%acc0;"
73 "mac.l %[b], %[v], %%acc0;"
74 "mac.l %[b], %[t], %%acc1;"
75 "msac.l %[a], %[v], %%acc1;"
76 "movclr.l %%acc0, %[a];"
77 "move.l %[a], (%[x]);"
78 "movclr.l %%acc1, %[a];"
79 "move.l %[a], (%[y]);"
81 : [x
] "a" (x
), [y
] "a" (y
),
82 [b
] "r" (b
), [t
] "r" (t
), [v
] "r" (v
)
87 void XNPROD31(int32_t a
, int32_t b
,
89 int32_t *x
, int32_t *y
)
91 asm volatile ("mac.l %[a], %[t], %%acc0;"
92 "msac.l %[b], %[v], %%acc0;"
93 "mac.l %[b], %[t], %%acc1;"
94 "mac.l %[a], %[v], %%acc1;"
95 "movclr.l %%acc0, %[a];"
96 "move.l %[a], (%[x]);"
97 "movclr.l %%acc1, %[a];"
98 "move.l %[a], (%[y]);"
100 : [x
] "a" (x
), [y
] "a" (y
),
101 [b
] "r" (b
), [t
] "r" (t
), [v
] "r" (v
)
#if 0 /* canonical Tremor definition */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
  { (_x)=MULT32(_a,_t)+MULT32(_b,_v); \
    (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif

/* this could lose the LSB by overflow, but i don't think it'll ever happen.
   if anyone think they can hear a bug caused by this, please try the above
   canonical version instead. */
/* Computes:
 *   (_x) = (_a*_t + _b*_v) >> 32
 *   (_y) = (_b*_t - _a*_v) >> 32
 * The sums are formed at >>31 scale in the accumulators and shifted down
 * once afterwards -- that final asr is where the LSB could be lost. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "mac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "msac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "asr.l #1, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  "asr.l #1, %[y];" \
                  : [x] "=&d" (_x), [y] "=&d" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");
131 /* asm versions of vector operations for block.c, window.c */
132 /* assumes MAC is initialized & accumulators cleared */
134 void vect_add(int32_t *x
, const int32_t *y
, int n
)
136 /* align to 16 bytes */
137 while(n
>0 && (int)x
&15) {
141 asm volatile ("bra 1f;"
142 "0:" /* loop start */
143 "movem.l (%[x]), %%d0-%%d3;" /* fetch values */
144 "movem.l (%[y]), %%a0-%%a3;"
150 /* store and advance */
151 "movem.l %%d0-%%d3, (%[x]);"
152 "lea.l (4*4, %[x]), %[x];"
153 "lea.l (4*4, %[y]), %[y];"
154 "subq.l #4, %[n];" /* done 4 elements */
155 "1: cmpi.l #4, %[n];"
157 : [n
] "+d" (n
), [x
] "+a" (x
), [y
] "+a" (y
)
158 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
160 /* add final elements */
168 void vect_copy(int32_t *x
, int32_t *y
, int n
)
170 /* align to 16 bytes */
171 while(n
>0 && (int)x
&15) {
175 asm volatile ("bra 1f;"
176 "0:" /* loop start */
177 "movem.l (%[y]), %%d0-%%d3;" /* fetch values */
178 "movem.l %%d0-%%d3, (%[x]);" /* store */
179 "lea.l (4*4, %[x]), %[x];" /* advance */
180 "lea.l (4*4, %[y]), %[y];"
181 "subq.l #4, %[n];" /* done 4 elements */
182 "1: cmpi.l #4, %[n];"
184 : [n
] "+d" (n
), [x
] "+a" (x
), [y
] "+a" (y
)
185 : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
186 /* copy final elements */
194 void vect_mult_fw(int32_t *data
, int32_t *window
, int n
)
196 /* ensure data is aligned to 16-bytes */
197 while(n
>0 && (int)data
&15) {
198 *data
= MULT31(*data
, *window
);
203 asm volatile ("movem.l (%[d]), %%d0-%%d3;" /* loop start */
204 "movem.l (%[w]), %%a0-%%a3;" /* pre-fetch registers */
205 "lea.l (4*4, %[w]), %[w];"
206 "bra 1f;" /* jump to loop condition */
208 /* multiply and load next window values */
209 "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
210 "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
211 "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
212 "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
213 "movclr.l %%acc0, %%d0;" /* get the products */
214 "movclr.l %%acc1, %%d1;"
215 "movclr.l %%acc2, %%d2;"
216 "movclr.l %%acc3, %%d3;"
217 /* store and advance */
218 "movem.l %%d0-%%d3, (%[d]);"
219 "lea.l (4*4, %[d]), %[d];"
220 "movem.l (%[d]), %%d0-%%d3;"
221 "subq.l #4, %[n];" /* done 4 elements */
222 "1: cmpi.l #4, %[n];"
224 /* multiply final elements */
227 "mac.l %%d0, %%a0, %%acc0;"
228 "movclr.l %%acc0, %%d0;"
229 "move.l %%d0, (%[d])+;"
232 "mac.l %%d1, %%a1, %%acc0;"
233 "movclr.l %%acc0, %%d1;"
234 "move.l %%d1, (%[d])+;"
237 /* otherwise n = 3 */
238 "mac.l %%d2, %%a2, %%acc0;"
239 "movclr.l %%acc0, %%d2;"
240 "move.l %%d2, (%[d])+;"
242 : [n
] "+d" (n
), [d
] "+a" (data
), [w
] "+a" (window
)
243 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
248 void vect_mult_bw(int32_t *data
, int32_t *window
, int n
)
250 /* ensure at least data is aligned to 16-bytes */
251 while(n
>0 && (int)data
&15) {
252 *data
= MULT31(*data
, *window
);
257 asm volatile ("lea.l (-3*4, %[w]), %[w];" /* loop start */
258 "movem.l (%[d]), %%d0-%%d3;" /* pre-fetch registers */
259 "movem.l (%[w]), %%a0-%%a3;"
260 "bra 1f;" /* jump to loop condition */
262 /* multiply and load next window value */
263 "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
264 "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
265 "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
266 "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
267 "movclr.l %%acc0, %%d0;" /* get the products */
268 "movclr.l %%acc1, %%d1;"
269 "movclr.l %%acc2, %%d2;"
270 "movclr.l %%acc3, %%d3;"
271 /* store and advance */
272 "movem.l %%d0-%%d3, (%[d]);"
273 "lea.l (4*4, %[d]), %[d];"
274 "movem.l (%[d]), %%d0-%%d3;"
275 "subq.l #4, %[n];" /* done 4 elements */
276 "1: cmpi.l #4, %[n];"
278 /* multiply final elements */
281 "mac.l %%d0, %%a3, %%acc0;"
282 "movclr.l %%acc0, %%d0;"
283 "move.l %%d0, (%[d])+;"
286 "mac.l %%d1, %%a2, %%acc0;"
287 "movclr.l %%acc0, %%d1;"
288 "move.l %%d1, (%[d])+;"
291 /* otherwise n = 3 */
292 "mac.l %%d2, %%a1, %%acc0;"
293 "movclr.l %%acc0, %%d2;"
294 "move.l %%d2, (%[d])+;"
296 : [n
] "+d" (n
), [d
] "+a" (data
), [w
] "+a" (window
)
297 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
/* this is portable C and simple; why not use this as default? */
/* Saturates a 32-bit sample to the signed 16-bit range [-32768, 32767].
 * Values above 32767 clamp to 32767, values below -32768 clamp to
 * -32768, everything else passes through unchanged. */
static inline int32_t CLIP_TO_15(register int32_t x) {
    register int32_t hi=32767, lo=-32768;
    return (x>=hi ? hi : (x<=lo ? lo : x));