/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 *
 * Copyright (C) 2005 by Pedro Vasconcelos
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
/* asm routines for wide math on the MCF5249 */

#include <stdint.h> /* int32_t */

#if defined(CPU_COLDFIRE)

#ifndef _V_WIDE_MATH
#define _V_WIDE_MATH

static inline int32_t MULT32(int32_t x, int32_t y) {

    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply & shift */
                  "movclr.l %%acc0, %[x];"     /* move & clear acc */
                  "asr.l #1, %[x];"            /* no overflow test */
                  : [x] "+&d" (x)
                  : [y] "r" (y)
                  : "cc");
    return x;
}
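
/* Reference only: a portable C sketch of what the EMAC sequence above
   computes -- the high 32 bits of the signed 64-bit product, (x*y)>>32,
   matching the usual Tremor MULT32 definition. Illustrative and disabled;
   the asm version above is the one that is used. */
#if 0
static inline int32_t MULT32_ref(int32_t x, int32_t y) {
    return (int32_t)(((int64_t)x * (int64_t)y) >> 32);
}
#endif
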
static inline int32_t MULT31(int32_t x, int32_t y) {
    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                  "movclr.l %%acc0, %[x];"     /* move and clear */
                  : [x] "+&r" (x)
                  : [y] "r" (y)
                  : "cc");
    return x;
}

static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) {
    int32_t r;

    asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
                  "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
                  "movclr.l %%acc0, %[r];"     /* get higher half */
                  "asl.l #8, %[r];"            /* hi<<16, plus one free */
                  "asl.l #8, %[r];"
                  "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
                  "lsr.l #7, %[x];"
                  "or.l %[x], %[r];"           /* logical-or results */
                  : [r] "=&d" (r), [x] "+d" (x)
                  : [y] "d" (y)
                  : "cc");
    return r;
}
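
/* Reference only: portable C sketches of the two routines above, under the
   usual Tremor definitions -- MULT31 returns (x*y)>>31 and MULT31_SHIFT15
   returns (x*y)>>15 of the signed 64-bit product. Illustrative and disabled;
   the EMAC versions above are used. */
#if 0
static inline int32_t MULT31_ref(int32_t x, int32_t y) {
    return (int32_t)(((int64_t)x * (int64_t)y) >> 31);
}

static inline int32_t MULT31_SHIFT15_ref(int32_t x, int32_t y) {
    return (int32_t)(((int64_t)x * (int64_t)y) >> 15);
}
#endif
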
static inline
void XPROD31(int32_t  a, int32_t  b,
             int32_t  t, int32_t  v,
             int32_t *x, int32_t *y)
{
    asm volatile ("mac.l %[a], %[t], %%acc0;"
                  "mac.l %[b], %[v], %%acc0;"
                  "mac.l %[b], %[t], %%acc1;"
                  "msac.l %[a], %[v], %%acc1;"
                  "movclr.l %%acc0, %[a];"
                  "move.l %[a], (%[x]);"
                  "movclr.l %%acc1, %[a];"
                  "move.l %[a], (%[y]);"
                  : [a] "+&r" (a)
                  : [x] "a" (x), [y] "a" (y),
                    [b] "r" (b), [t] "r" (t), [v] "r" (v)
                  : "cc", "memory");
}

static inline
void XNPROD31(int32_t  a, int32_t  b,
              int32_t  t, int32_t  v,
              int32_t *x, int32_t *y)
{
    asm volatile ("mac.l %[a], %[t], %%acc0;"
                  "msac.l %[b], %[v], %%acc0;"
                  "mac.l %[b], %[t], %%acc1;"
                  "mac.l %[a], %[v], %%acc1;"
                  "movclr.l %%acc0, %[a];"
                  "move.l %[a], (%[x]);"
                  "movclr.l %%acc1, %[a];"
                  "move.l %[a], (%[y]);"
                  : [a] "+&r" (a)
                  : [x] "a" (x), [y] "a" (y),
                    [b] "r" (b), [t] "r" (t), [v] "r" (v)
                  : "cc", "memory");
}
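
/* Reference only: a C sketch of the two cross products above, written with
   MULT31 as in Tremor's generic code (results can differ in the LSB, since
   the asm accumulates before truncating). XPROD31 stores a*t+b*v and b*t-a*v,
   XNPROD31 stores a*t-b*v and b*t+a*v, each scaled by >>31. Illustrative and
   disabled. */
#if 0
static inline
void XPROD31_ref(int32_t a, int32_t b, int32_t t, int32_t v,
                 int32_t *x, int32_t *y)
{
    *x = MULT31(a, t) + MULT31(b, v);
    *y = MULT31(b, t) - MULT31(a, v);
}

static inline
void XNPROD31_ref(int32_t a, int32_t b, int32_t t, int32_t v,
                  int32_t *x, int32_t *y)
{
    *x = MULT31(a, t) - MULT31(b, v);
    *y = MULT31(b, t) + MULT31(a, v);
}
#endif
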
#if 0 /* canonical Tremor definition */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
    { (_x)=MULT32(_a,_t)+MULT32(_b,_v); \
      (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif

/* this could lose the LSB by overflow, but I don't think it'll ever happen.
   If anyone thinks they can hear a bug caused by this, please try the above
   version. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "mac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "msac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "asr.l #1, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  "asr.l #1, %[y];" \
                  : [x] "=d" (_x), [y] "=d" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");

#define XPROD31_R(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "mac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "msac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  : [x] "=r" (_x), [y] "=r" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");

#define XNPROD31_R(_a, _b, _t, _v, _x, _y) \
    asm volatile ("mac.l %[a], %[t], %%acc0;" \
                  "msac.l %[b], %[v], %%acc0;" \
                  "mac.l %[b], %[t], %%acc1;" \
                  "mac.l %[a], %[v], %%acc1;" \
                  "movclr.l %%acc0, %[x];" \
                  "movclr.l %%acc1, %[y];" \
                  : [x] "=r" (_x), [y] "=r" (_y) \
                  : [a] "r" (_a), [b] "r" (_b), \
                    [t] "r" (_t), [v] "r" (_v) \
                  : "cc");
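
/* Reference only: the _R variants above follow the same pattern but write to
   plain lvalues and omit the final asr, so they produce MULT31-scaled (>>31)
   results rather than MULT32-scaled ones (again up to LSB rounding, since the
   asm accumulates first). A disabled, illustrative sketch of the generic form: */
#if 0
#define XPROD31_R_ref(_a, _b, _t, _v, _x, _y) \
    { (_x) = MULT31(_a, _t) + MULT31(_b, _v); \
      (_y) = MULT31(_b, _t) - MULT31(_a, _v); }

#define XNPROD31_R_ref(_a, _b, _t, _v, _x, _y) \
    { (_x) = MULT31(_a, _t) - MULT31(_b, _v); \
      (_y) = MULT31(_b, _t) + MULT31(_a, _v); }
#endif
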
#ifndef _V_VECT_OPS
#define _V_VECT_OPS

/* asm versions of vector operations for block.c, window.c */
/* assumes MAC is initialized & accumulators cleared */
static inline
void vect_add(int32_t *x, const int32_t *y, int n)
{
    /* align to 16 bytes */
    while(n>0 && (int)x&15) {
        *x++ += *y++;
        n--;
    }
    asm volatile ("bra 1f;"
                  "0:"                          /* loop start */
                  "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
                  "movem.l (%[y]), %%a0-%%a3;"
                  /* add */
                  "add.l %%a0, %%d0;"
                  "add.l %%a1, %%d1;"
                  "add.l %%a2, %%d2;"
                  "add.l %%a3, %%d3;"
                  /* store and advance */
                  "movem.l %%d0-%%d3, (%[x]);"
                  "lea.l (4*4, %[x]), %[x];"
                  "lea.l (4*4, %[y]), %[y];"
                  "subq.l #4, %[n];"            /* done 4 elements */
                  "1: cmpi.l #4, %[n];"
                  "bge 0b;"
                  : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                  : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                      "cc", "memory");
    /* add final elements */
    while (n>0) {
        *x++ += *y++;
        n--;
    }
}
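
/* Reference only: the scalar loop that the movem block above unrolls four
   elements at a time. Illustrative and disabled. */
#if 0
static inline void vect_add_ref(int32_t *x, const int32_t *y, int n)
{
    int i;
    for (i = 0; i < n; i++)
        x[i] += y[i];
}
#endif
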
static inline
void vect_copy(int32_t *x, int32_t *y, int n)
{
    /* align to 16 bytes */
    while(n>0 && (int)x&15) {
        *x++ = *y++;
        n--;
    }
    asm volatile ("bra 1f;"
                  "0:"                          /* loop start */
                  "movem.l (%[y]), %%d0-%%d3;"  /* fetch values */
                  "movem.l %%d0-%%d3, (%[x]);"  /* store */
                  "lea.l (4*4, %[x]), %[x];"    /* advance */
                  "lea.l (4*4, %[y]), %[y];"
                  "subq.l #4, %[n];"            /* done 4 elements */
                  "1: cmpi.l #4, %[n];"
                  "bge 0b;"
                  : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
                  : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
    /* copy final elements */
    while (n>0) {
        *x++ = *y++;
        n--;
    }
}

static inline
void vect_mult_fw(int32_t *data, int32_t *window, int n)
{
    /* ensure data is aligned to 16 bytes */
    while(n>0 && (int)data&15) {
        *data = MULT31(*data, *window);
        data++;
        window++;
        n--;
    }
    asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
                  "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
                  "lea.l (4*4, %[w]), %[w];"
                  "bra 1f;"                     /* jump to loop condition */
                  "0:"                          /* loop body */
                  /* multiply and load next window values */
                  "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
                  "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
                  "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
                  "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
                  "movclr.l %%acc0, %%d0;"      /* get the products */
                  "movclr.l %%acc1, %%d1;"
                  "movclr.l %%acc2, %%d2;"
                  "movclr.l %%acc3, %%d3;"
                  /* store and advance */
                  "movem.l %%d0-%%d3, (%[d]);"
                  "lea.l (4*4, %[d]), %[d];"
                  "movem.l (%[d]), %%d0-%%d3;"
                  "subq.l #4, %[n];"            /* done 4 elements */
                  "1: cmpi.l #4, %[n];"
                  "bge 0b;"
                  /* multiply final elements */
                  "tst.l %[n];"
                  "beq 1f;"                     /* n=0 */
                  "mac.l %%d0, %%a0, %%acc0;"
                  "movclr.l %%acc0, %%d0;"
                  "move.l %%d0, (%[d])+;"
                  "subq.l #1, %[n];"
                  "beq 1f;"                     /* n=1 */
                  "mac.l %%d1, %%a1, %%acc0;"
                  "movclr.l %%acc0, %%d1;"
                  "move.l %%d1, (%[d])+;"
                  "subq.l #1, %[n];"
                  "beq 1f;"                     /* n=2 */
                  /* otherwise n = 3 */
                  "mac.l %%d2, %%a2, %%acc0;"
                  "movclr.l %%acc0, %%d2;"
                  "move.l %%d2, (%[d])+;"
                  "1:"
                  : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                  : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                      "cc", "memory");
}
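
/* Reference only: vect_mult_fw scales each data element by the matching
   window element, walking the window forwards -- the same operation as the
   alignment pre-loop above. A disabled, illustrative C sketch: */
#if 0
static inline void vect_mult_fw_ref(int32_t *data, int32_t *window, int n)
{
    while (n > 0) {
        *data = MULT31(*data, *window);
        data++;
        window++;
        n--;
    }
}
#endif
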
static inline
void vect_mult_bw(int32_t *data, int32_t *window, int n)
{
    /* ensure at least data is aligned to 16 bytes */
    while(n>0 && (int)data&15) {
        *data = MULT31(*data, *window);
        data++;
        window--;
        n--;
    }
    asm volatile ("lea.l (-3*4, %[w]), %[w];"   /* loop start */
                  "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch registers */
                  "movem.l (%[w]), %%a0-%%a3;"
                  "bra 1f;"                     /* jump to loop condition */
                  "0:"                          /* loop body */
                  /* multiply and load next window value */
                  "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
                  "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
                  "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
                  "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
                  "movclr.l %%acc0, %%d0;"      /* get the products */
                  "movclr.l %%acc1, %%d1;"
                  "movclr.l %%acc2, %%d2;"
                  "movclr.l %%acc3, %%d3;"
                  /* store and advance */
                  "movem.l %%d0-%%d3, (%[d]);"
                  "lea.l (4*4, %[d]), %[d];"
                  "movem.l (%[d]), %%d0-%%d3;"
                  "subq.l #4, %[n];"            /* done 4 elements */
                  "1: cmpi.l #4, %[n];"
                  "bge 0b;"
                  /* multiply final elements */
                  "tst.l %[n];"
                  "beq 1f;"                     /* n=0 */
                  "mac.l %%d0, %%a3, %%acc0;"
                  "movclr.l %%acc0, %%d0;"
                  "move.l %%d0, (%[d])+;"
                  "subq.l #1, %[n];"
                  "beq 1f;"                     /* n=1 */
                  "mac.l %%d1, %%a2, %%acc0;"
                  "movclr.l %%acc0, %%d1;"
                  "move.l %%d1, (%[d])+;"
                  "subq.l #1, %[n];"
                  "beq 1f;"                     /* n=2 */
                  /* otherwise n = 3 */
                  "mac.l %%d2, %%a1, %%acc0;"
                  "movclr.l %%acc0, %%d2;"
                  "move.l %%d2, (%[d])+;"
                  "1:"
                  : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
                  : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
                      "cc", "memory");
}
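
/* Reference only: vect_mult_bw is the same as vect_mult_fw except that the
   window pointer walks backwards, as in the alignment pre-loop above. A
   disabled, illustrative C sketch: */
#if 0
static inline void vect_mult_bw_ref(int32_t *data, int32_t *window, int n)
{
    while (n > 0) {
        *data = MULT31(*data, *window);
        data++;
        window--;
        n--;
    }
}
#endif
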
#endif

#endif
/* not used anymore */

#ifndef _V_CLIP_MATH
#define _V_CLIP_MATH

/* this is portable C and simple; why not use this as default? */
static inline int32_t CLIP_TO_15(register int32_t x) {
    register int32_t hi=32767, lo=-32768;
    return (x>=hi ? hi : (x<=lo ? lo : x));
}

#endif

#endif