/*

libdemac - A Monkey's Audio decoder

Copyright (C) Dave Chapman 2007

Coldfire vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/
#define FUSED_VECTOR_MATH

#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
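/* Usage sketch (hypothetical caller code; 'sum', 'history' and 'coeffs' are
 * illustrative names, not part of this file). The EMAC unit must be put into
 * signed integer mode once before calling any of the routines below: */
#if 0
    PREPARE_SCALARPRODUCT               /* expands to coldfire_set_macsr(0) */
    sum = scalarproduct(history, coeffs);
#endif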
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
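/* A minimal plain-C sketch of the fused operation the asm below implements
 * (assuming ORDER is a positive multiple of 16, as the unrolling requires;
 * 'vector_sp_add_ref' is a hypothetical name). The scalar product uses the
 * old v1 values; the add into v1 happens in the same pass: */
#if 0
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i];  /* scalar product term */
        v1[i] += s2[i];                 /* fused vector add */
    }
    return res;
}
#endif
/* Each mac.w below multiplies two 16-bit register halves, accumulates the
 * product into %acc0, and fetches the next operand word in parallel. */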
#define ADDHALFREGS(s1, s2, sum)      /* Add register halves straight. */ \
        "move.l " #s1 ", " #sum "\n"  /* 's1' and 's2' can be A or D   */ \
        "add.l  " #s2 ", " #s1 "\n"   /* regs, 'sum' must be a D reg.  */ \
        "clr.w  " #sum "         \n"  /* 's1' is clobbered!            */ \
        "add.l  " #s2 ", " #sum "\n"                                      \
        "move.w " #s1 ", " #sum "\n"
#define ADDHALFXREGS(s1, s2, sum)     /* Add register halves across.    */ \
        "clr.w  " #sum "         \n"  /* Needs 'sum' pre-swapped, swaps */ \
        "add.l  " #s1 ", " #sum "\n"  /* 's2', and clobbers 's1'.       */ \
        "swap   " #s2 "          \n"  /* 's1' can be an A or D reg.     */ \
        "add.l  " #s2 ", " #s1 "\n"   /* 'sum' and 's2' must be D regs. */ \
        "move.w " #s1 ", " #sum "\n"
58 "move.l %[f2], %%d0 \n"
63 "move.w (%[f2])+, %%d0 \n"
64 "move.w (%[s2])+, %%d1 \n"
68 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
69 "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
70 "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
71 ADDHALFXREGS(%%d6
, %%d2
, %%d1
)
72 "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
73 "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
74 "move.l %%d1, (%[v1])+ \n"
75 ADDHALFXREGS(%%d7
, %%d6
, %%d2
)
76 "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
77 "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
78 "move.l %%d2, (%[v1])+ \n"
79 ADDHALFXREGS(%%a0
, %%d7
, %%d6
)
80 "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
81 "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
82 "move.l %%d6, (%[v1])+ \n"
83 ADDHALFXREGS(%%a1
, %%d1
, %%d7
)
84 "move.l %%d7, (%[v1])+ \n"
88 "subq.l #1, %[res] \n"
94 "move.l (%[f2])+, %%d0 \n"
96 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
97 "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
98 "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
99 ADDHALFREGS(%%d6
, %%d1
, %%d2
)
100 "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
101 "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
102 "move.l %%d2, (%[v1])+ \n"
103 ADDHALFREGS(%%d7
, %%d1
, %%d2
)
104 "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
105 "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
106 "move.l %%d2, (%[v1])+ \n"
107 ADDHALFREGS(%%a0
, %%d1
, %%d2
)
108 "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
109 "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
110 "move.l %%d2, (%[v1])+ \n"
111 ADDHALFREGS(%%a1
, %%d1
, %%d2
)
112 "move.l %%d2, (%[v1])+ \n"
114 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
115 "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
116 "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
117 ADDHALFREGS(%%d6
, %%d1
, %%d2
)
118 "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
119 "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
120 "move.l %%d2, (%[v1])+ \n"
121 ADDHALFREGS(%%d7
, %%d1
, %%d2
)
122 "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
123 "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
124 "move.l %%d2, (%[v1])+ \n"
125 ADDHALFREGS(%%a0
, %%d1
, %%d2
)
126 "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
128 "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
130 "mac.w %%d0l, %%a1l, %%acc0 \n"
132 "move.l %%d2, (%[v1])+ \n"
133 ADDHALFREGS(%%a1
, %%d1
, %%d2
)
134 "move.l %%d2, (%[v1])+ \n"
136 "subq.l #1, %[res] \n"
141 "movclr.l %%acc0, %[res] \n"
152 "d0", "d1", "d2", "d6", "d7",
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
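/* As with vector_sp_add, a plain-C sketch of what the asm below computes
 * (same ORDER assumption; 'vector_sp_sub_ref' is a hypothetical name, and
 * the scalar product again uses the old v1 values): */
#if 0
static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++) {
        res += (int32_t)v1[i] * f2[i];  /* scalar product term */
        v1[i] -= s2[i];                 /* fused vector subtract */
    }
    return res;
}
#endif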
#define SUBHALFREGS(min, sub, dif)    /* Subtract register halves straight. */ \
        "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg,    */ \
        "sub.l  " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */ \
        "clr.w  " #sub "         \n"  /* 'min' and 'sub' are clobbered! */ \
        "sub.l  " #sub ", " #dif "\n"                                      \
        "move.w " #min ", " #dif "\n"
#define SUBHALFXREGS(min, s2, s1d)    /* Subtract register halves across. */ \
        "clr.w  " #s1d "         \n"  /* Needs 's1d' pre-swapped, swaps */ \
        "sub.l  " #s1d ", " #min "\n" /* 's2' and clobbers 'min'.       */ \
        "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg,    */ \
        "swap   " #s2 "          \n"  /* 's2' and 's1d' must be D regs. */ \
        "sub.l  " #s2 ", " #min "\n"                                       \
        "move.w " #min ", " #s1d "\n"
187 "move.l %[f2], %%d0 \n"
192 "move.w (%[f2])+, %%d0 \n"
193 "move.w (%[s2])+, %%d1 \n"
197 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
198 "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
199 "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
200 SUBHALFXREGS(%%d6
, %%d2
, %%d1
)
201 "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
202 "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
203 "move.l %%d1, (%[v1])+ \n"
204 SUBHALFXREGS(%%d7
, %%d6
, %%d2
)
205 "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
206 "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
207 "move.l %%d2, (%[v1])+ \n"
208 SUBHALFXREGS(%%a0
, %%d7
, %%d6
)
209 "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
210 "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
211 "move.l %%d6, (%[v1])+ \n"
212 SUBHALFXREGS(%%a1
, %%d1
, %%d7
)
213 "move.l %%d7, (%[v1])+ \n"
217 "subq.l #1, %[res] \n"
224 "move.l (%[f2])+, %%d0 \n"
226 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
227 "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
228 "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
229 SUBHALFREGS(%%d6
, %%d1
, %%d2
)
230 "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
231 "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
232 "move.l %%d2, (%[v1])+ \n"
233 SUBHALFREGS(%%d7
, %%d1
, %%d2
)
234 "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
235 "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
236 "move.l %%d2, (%[v1])+ \n"
237 SUBHALFREGS(%%a0
, %%d1
, %%d2
)
238 "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
239 "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
240 "move.l %%d2, (%[v1])+ \n"
241 SUBHALFREGS(%%a1
, %%d1
, %%d2
)
242 "move.l %%d2, (%[v1])+ \n"
244 "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
245 "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
246 "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
247 SUBHALFREGS(%%d6
, %%d1
, %%d2
)
248 "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
249 "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
250 "move.l %%d2, (%[v1])+ \n"
251 SUBHALFREGS(%%d7
, %%d1
, %%d2
)
252 "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
253 "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
254 "move.l %%d2, (%[v1])+ \n"
255 SUBHALFREGS(%%a0
, %%d1
, %%d2
)
256 "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
258 "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
260 "mac.w %%d0l, %%a1l, %%acc0 \n"
262 "move.l %%d2, (%[v1])+ \n"
263 SUBHALFREGS(%%a1
, %%d1
, %%d2
)
264 "move.l %%d2, (%[v1])+ \n"
266 "subq.l #1, %[res] \n"
271 "movclr.l %%acc0, %[res] \n"
282 "d0", "d1", "d2", "d6", "d7",
/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
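/* Plain-C sketch of the computation below (assuming ORDER is a positive
 * multiple of the unroll granularity; 'scalarproduct_ref' is a
 * hypothetical name): */
#if 0
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i];
    return res;
}
#endif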
300 "move.l %[v2], %%d0 \n"
305 "move.l (%[v1])+, %%d0 \n"
306 "move.w (%[v2])+, %%d1 \n"
309 "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
310 "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
313 "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
315 "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
316 "subq.l #1, %[res] \n"
319 "mac.w %%d0l, %%d1u, %%acc0 \n"
324 "move.l (%[v1])+, %%d0 \n"
325 "move.l (%[v2])+, %%d1 \n"
328 "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
329 "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
330 "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
331 "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
334 "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
335 "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
337 "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
338 "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
339 "subq.l #1, %[res] \n"
342 "mac.w %%d2u, %%d1u, %%acc0 \n"
343 "mac.w %%d2l, %%d1l, %%acc0 \n"
347 "movclr.l %%acc0, %[res] \n"